blob: 85d59897cf39426c5d9804770e95fc7d4b467cae [file] [log] [blame]
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +02001#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3# Copyright © 2020 Orange
Krzysztof Kuzmickie342ced2020-08-10 15:56:21 +02004# Copyright © 2020 Nokia
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +02005#
6# Licensed under the Apache License, Version 2.0 (the "License");
7# you may not use this file except in compliance with the License.
8# You may obtain a copy of the License at
9#
10# http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15# See the License for the specific language governing permissions and
16# limitations under the License.
17
18"""
19Kubernetes readiness check.
20
21Checks if a container is ready or if a job is finished.
22The check is done according to the name of the container, not the name of
23its parent (Job, Deployment, StatefulSet, DaemonSet).
24"""
25
26import getopt
27import logging
28import os
29import sys
30import time
31import random
32
othman touijer5274bc72021-11-15 11:19:33 +010033from kubernetes import client, config
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +020034from kubernetes.client.rest import ApiException
35
# Namespace to inspect; read from the NAMESPACE environment variable
# (a missing variable raises KeyError at import time).
namespace = os.environ['NAMESPACE']

# Logging: INFO and above, written to stdout with a timestamped format.
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
log.addHandler(handler)

# Kubernetes API clients, configured from the in-cluster settings.
config.load_incluster_config()
coreV1Api = client.CoreV1Api()
api = client.AppsV1Api()
batchV1Api = client.BatchV1Api()
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +020052
def is_job_complete(job_name):
    """
    Check if Job is complete.

    The Job is reported complete when its status shows at least one
    succeeded pod and any of its conditions has the type "Complete".

    Args:
        job_name (str): the name of the Job.

    Returns:
        True if job is complete, false otherwise (including when the API
        call fails; the error is logged).
    """
    complete = False
    log.info("Checking if %s is complete", job_name)
    try:
        response = batchV1Api.read_namespaced_job_status(job_name, namespace)
        status = response.status
        # `succeeded` may be None; treat that as zero. Using >= 1 also covers
        # Jobs that need more than one successful pod.
        if (status.succeeded or 0) >= 1:
            # `conditions` may be None, and "Complete" is not guaranteed to
            # be the first entry, so scan all of them.
            conditions = status.conditions or []
            if any(cond.type == "Complete" for cond in conditions):
                complete = True
                log.info("%s is complete", job_name)
            else:
                log.info("%s is NOT complete", job_name)
        else:
            log.info("%s has not succeeded yet", job_name)
    except ApiException as exc:
        log.error("Exception when calling read_namespaced_job_status: %s\n",
                  exc)
    return complete
80
81
def wait_for_statefulset_complete(statefulset_name):
    """
    Tell whether a StatefulSet is fully rolled out.

    The StatefulSet is considered ready when its current and ready replica
    counts both equal the requested replica count and the observed
    generation matches the metadata generation.

    Args:
        statefulset_name (str): the name of the StatefulSet.

    Returns:
        True if StatefulSet is running, false otherwise (also when the API
        call fails; the error is logged).
    """
    try:
        sts = api.read_namespaced_stateful_set(statefulset_name, namespace)
    except ApiException as exc:
        log.error("Exception when waiting for Statefulset status: %s\n", exc)
        return False

    current = sts.status
    rolled_out = (
        current.replicas == sts.spec.replicas
        and current.ready_replicas == sts.spec.replicas
        and current.observed_generation == sts.metadata.generation
    )
    if rolled_out:
        log.info("Statefulset %s is ready", statefulset_name)
        return True
    log.info("Statefulset %s is NOT ready", statefulset_name)
    return False
107
108
def wait_for_deployment_complete(deployment_name):
    """
    Tell whether a Deployment is fully rolled out.

    The Deployment is considered ready when it reports no unavailable
    replicas, its updated replica count is either unset or equal to the
    requested count, its current and ready replica counts equal the
    requested count, and the observed generation matches the metadata
    generation.

    Args:
        deployment_name (str): the name of the Deployment.

    Returns:
        True if Deployment is running, false otherwise (also when the API
        call fails; the error is logged).
    """
    try:
        dep = api.read_namespaced_deployment(deployment_name, namespace)
    except ApiException as exc:
        log.error("Exception when waiting for deployment status: %s\n", exc)
        return False

    current = dep.status
    rolled_out = False
    if current.unavailable_replicas is None:
        desired = dep.spec.replicas
        updated_ok = (current.updated_replicas is None
                      or current.updated_replicas == desired)
        rolled_out = (
            updated_ok
            and current.replicas == desired
            and current.ready_replicas == desired
            and current.observed_generation == dep.metadata.generation
        )

    if rolled_out:
        log.info("Deployment %s is ready", deployment_name)
    else:
        log.info("Deployment %s is NOT ready", deployment_name)
    return rolled_out
136
137
def wait_for_daemonset_complete(daemonset_name):
    """
    Tell whether a DaemonSet is ready.

    The DaemonSet is considered ready when its number of ready pods equals
    its desired number of scheduled pods.

    Args:
        daemonset_name (str): the name of the DaemonSet.

    Returns:
        True if DaemonSet is running, false otherwise (also when the API
        call fails; the error is logged).
    """
    try:
        ds_status = api.read_namespaced_daemon_set(daemonset_name,
                                                   namespace).status
    except ApiException as exc:
        log.error("Exception when waiting for DaemonSet status: %s\n", exc)
        return False

    wanted = ds_status.desired_number_scheduled
    ready_count = ds_status.number_ready
    if wanted == ready_count:
        log.info("DaemonSet: %s/%s nodes ready --> %s is ready",
                 ready_count, wanted, daemonset_name)
        return True
    log.info("DaemonSet: %s/%s nodes ready --> %s is NOT ready",
             ready_count, wanted, daemonset_name)
    return False
165
166
def is_ready(container_name):
    """
    Check if a container is ready.

    For a container owned by a Job, it means the Job is complete.
    Otherwise, it means the parent (Deployment, StatefulSet, DaemonSet) is
    running with the right number of replicas.

    The pods of the namespace are scanned in the order returned by the API;
    the verdict is taken from the first pod that has a container with the
    given name and at least one owner reference. Pods without an owner
    reference are skipped, since there is no parent to check. An owner of
    any other kind yields False.

    Args:
        container_name (str): the name of the container.

    Returns:
        True if container is ready, false otherwise (also when no matching
        pod is found or an API call fails; the error is logged).
    """
    ready = False
    log.info("Checking if %s is ready", container_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            # container_statuses can be None, which is non-iterable.
            statuses = item.status.container_statuses
            if statuses is None:
                continue
            if not any(status.name == container_name for status in statuses):
                continue

            # owner_references can be None or empty (pod without a parent);
            # indexing it would raise instead of reporting "not ready".
            owners = item.metadata.owner_references
            if not owners:
                log.info("Pod %s has no owner, skipping it",
                         item.metadata.name)
                continue

            owner = owners[0]
            if owner.kind == "StatefulSet":
                ready = wait_for_statefulset_complete(owner.name)
            elif owner.kind == "ReplicaSet":
                # The pod is owned by a ReplicaSet; the readiness check is
                # done on the Deployment owning that ReplicaSet.
                deployment_name = get_deployment_name(owner.name)
                ready = wait_for_deployment_complete(deployment_name)
            elif owner.kind == "Job":
                ready = is_job_complete(owner.name)
            elif owner.kind == "DaemonSet":
                ready = wait_for_daemonset_complete(owner.name)
            return ready
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return ready
207
def service_mesh_job_check(container_name):
    """
    Check if a Job's primary container is complete.

    Used for ensuring the sidecar can be killed after Job completion.
    Every pod of the namespace is inspected; the result is True as soon as
    one container with the given name is terminated with the reason
    'Completed'.

    Args:
        container_name (str): the name of the Job's primary container.

    Returns:
        True if job's container is in the completed state, false otherwise
        (also when the API call fails; the error is logged).
    """
    complete = False
    log.info("Checking if %s is complete", container_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
        return complete

    for item in response.items:
        # container_statuses can be None, which is non-iterable.
        if item.status.container_statuses is None:
            continue
        for container in item.status.container_statuses:
            if container.name != container_name:
                continue
            log.info("Container Details %s ", container)
            # state itself can be None, so do not dereference it blindly.
            state = container.state
            terminated = state.terminated if state is not None else None
            log.info("Container Status %s ", terminated)
            if terminated is None:
                # Container has not terminated (yet).
                continue
            log.info("Container Status Reason %s ", terminated.reason)
            if terminated.reason == 'Completed':
                complete = True
                log.info("%s is complete", container_name)
            else:
                log.info("%s is NOT complete", container_name)
    return complete
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +0200242
def read_name(item):
    """
    Return the name of the first owner of an item.

    Args:
        item: an object exposing `metadata.owner_references`, a sequence
            whose elements have a `name` attribute.

    Returns:
        the name of the item's first owner
    """
    first_owner = item.metadata.owner_references[0]
    return first_owner.name
254
255
def get_deployment_name(replicaset):
    """
    Return the name of the Deployment owning the ReplicaSet.

    The ReplicaSet is read from the API and the name of its first owner
    reference is returned.

    Args:
        replicaset (str): the name of the ReplicaSet.

    Returns:
        the name of the Deployment owning the ReplicaSet
    """
    return read_name(
        api.read_namespaced_replica_set_status(replicaset, namespace))
270
def quitquitquit_post(apiurl, timeout=10):
    """
    Send an empty POST request to the sidecar "quitquitquit" endpoint.

    Only the standard library is used, so no extra HTTP package has to be
    installed or imported by this script.

    Args:
        apiurl (str): the URL to POST to.
        timeout (float): maximum time, in seconds, to wait for the endpoint.

    Returns:
        True if the endpoint answered with a non-error HTTP status, False
        if it answered with an HTTP error status or if the call failed
        (connection error, timeout, malformed URL).
    """
    # Local imports: the file-level import block does not provide them.
    import http.client
    import urllib.error
    import urllib.request

    try:
        # An explicit (empty) body together with method="POST" makes this a
        # POST request.
        request = urllib.request.Request(apiurl, data=b"", method="POST")
        with urllib.request.urlopen(request, timeout=timeout):
            pass
    except urllib.error.HTTPError as exc:
        # The server answered, but with an error status (>= 400).
        exc.close()
        log.info("quitquitquit returned False")
        return False
    except (urllib.error.URLError, http.client.HTTPException, OSError,
            ValueError):
        log.info("quitquitquit call failed with exception")
        return False
    log.info("quitquitquit returned True")
    return True
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +0200284
# Default time, in minutes, to wait for each checked item.
DEF_TIMEOUT = 10
# Default sidecar endpoint called once a service mesh job check succeeds.
DEF_URL = "http://127.0.0.1:15020/quitquitquit"
DESCRIPTION = "Kubernetes container readiness check utility"
# Help text; lists every option accepted by main().
USAGE = (
    "Usage: ready.py [-t <timeout>] [-u <url>] -c <container_name> .. "
    "| -j <job_name> .. | -s <service_mesh_container_name> ..\n"
    "where\n"
    "<timeout> - wait for container readiness timeout in min, "
    "default is " + str(DEF_TIMEOUT) + "\n"
    "<url> - URL called to stop the sidecar once a "
    "<service_mesh_container_name> check succeeds, "
    "default is " + DEF_URL + "\n"
    "<container_name> - name of the container to wait for\n"
    "<job_name> - name of the job to wait for\n"
    "<service_mesh_container_name> - name of the job's main container "
    "to wait for before stopping the sidecar\n"
)
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +0200294
295
def _wait_until_ready(check, name, timeout_minutes):
    """Poll check(name) until it returns True; exit with status 1 on timeout.

    Args:
        check: callable taking the name and returning True when ready.
        name (str): the name passed to `check` and used in the log message.
        timeout_minutes (float): how long to keep polling, in minutes.
    """
    # The deadline is kept in its own variable so the configured timeout is
    # not overwritten and every item gets the same waiting budget.
    deadline = time.time() + timeout_minutes * 60
    while True:
        if check(name) is True:
            return
        if time.time() > deadline:
            log.warning("timed out waiting for '%s' to be ready", name)
            sys.exit(1)
        # spread in time potentially parallel execution in multiple
        # containers
        time.sleep(random.randint(5, 11))


def main(argv):
    """
    Checks if a container is ready, if a job is finished or if the main
    container of a job has completed.
    The check is done according to the name of the container, not the name of
    its parent (Job, Deployment, StatefulSet, DaemonSet).

    Containers (-c), then jobs (-j), then service mesh job containers (-s)
    are waited for one after the other, each with its own timeout. After a
    service mesh job container has completed, the quitquitquit URL is called
    to stop the sidecar.

    Exits with status 2 on invalid or missing parameters and with status 1
    when a wait times out.

    Args:
        argv: the command line
    """
    # args are a list of container names
    container_names = []
    job_names = []
    service_mesh_job_container_names = []
    timeout = DEF_TIMEOUT
    url = DEF_URL
    try:
        opts, _args = getopt.getopt(argv, "hj:c:t:s:u:", ["container-name=",
                                                          "timeout=",
                                                          "service-mesh-check=",
                                                          "url=",
                                                          "job-name=",
                                                          "help"])
        for opt, arg in opts:
            if opt in ("-h", "--help"):
                print("{}\n\n{}".format(DESCRIPTION, USAGE))
                sys.exit()
            elif opt in ("-c", "--container-name"):
                container_names.append(arg)
            elif opt in ("-j", "--job-name"):
                job_names.append(arg)
            elif opt in ("-s", "--service-mesh-check"):
                service_mesh_job_container_names.append(arg)
            elif opt in ("-u", "--url"):
                url = arg
            elif opt in ("-t", "--timeout"):
                timeout = float(arg)
    except (getopt.GetoptError, ValueError) as exc:
        print("Error parsing input parameters: {}\n".format(exc))
        print(USAGE)
        sys.exit(2)
    if not (container_names or job_names
            or service_mesh_job_container_names):
        print("Missing required input parameter(s)\n")
        print(USAGE)
        sys.exit(2)

    for container_name in container_names:
        _wait_until_ready(is_ready, container_name, timeout)
    for job_name in job_names:
        _wait_until_ready(is_job_complete, job_name, timeout)
    for service_mesh_job_container_name in service_mesh_job_container_names:
        _wait_until_ready(service_mesh_job_check,
                          service_mesh_job_container_name, timeout)
        # The job's main container is done: ask the sidecar to stop.
        if quitquitquit_post(url) is True:
            log.info("Side Car Killed through QuitQuitQuit API")
        else:
            log.info("Side Car Failed to be Killed through QuitQuitQuit API")


if __name__ == "__main__":
    main(sys.argv[1:])