blob: d9fcfbe296146f866f9ff7773435e4509405a4f0 [file] [log] [blame]
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +02001#!/usr/bin/env python3
2# -*- coding: utf-8 -*-
3# Copyright © 2020 Orange
Krzysztof Kuzmickie342ced2020-08-10 15:56:21 +02004# Copyright © 2020 Nokia
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +02005#
6# Licensed under the Apache License, Version 2.0 (the "License");
7# you may not use this file except in compliance with the License.
8# You may obtain a copy of the License at
9#
10# http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS,
14# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15# See the License for the specific language governing permissions and
16# limitations under the License.
17
18"""
19Kubernetes readiness check.
20
21Checks if a container is ready or if a job is finished.
22The check is done according to the name of the container, not the name of
23its parent (Job, Deployment, StatefulSet, DaemonSet).
24"""
25
26import getopt
27import logging
28import os
29import sys
30import time
31import random
othman touijer87a99b12021-11-24 10:41:24 +010032import requests
Andreas Geissler0adc4b02023-08-28 13:43:28 +020033import socket
34from contextlib import closing
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +020035
othman touijer5274bc72021-11-15 11:19:33 +010036from kubernetes import client, config
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +020037from kubernetes.client.rest import ApiException
38
# Namespace whose workloads are inspected. It is mandatory: a missing
# NAMESPACE variable raises KeyError as soon as the module is loaded.
namespace = os.environ['NAMESPACE']

# Emit INFO (and above) records on stdout with a timestamped layout.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
log.addHandler(handler)

# Load the in-cluster Kubernetes configuration.
# For local testing, call config.load_kube_config() instead.
config.load_incluster_config()

# One client per API group used by the checks below.
coreV1Api = client.CoreV1Api()
api = client.AppsV1Api()
batchV1Api = client.BatchV1Api()
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +020057
def is_job_complete(job_name):
    """
    Check if Job is complete.

    A Job is reported complete when it has at least one succeeded pod and
    any of its status conditions is "Complete" with status "True". All
    conditions are inspected because "Complete" is not guaranteed to be the
    first entry of the list.

    Args:
        job_name (str): the name of the Job.

    Returns:
        True if job is complete, false otherwise (including when the API
        call fails)
    """
    log.info("Checking if %s is complete", job_name)
    try:
        response = batchV1Api.read_namespaced_job_status(job_name, namespace)
    except ApiException as exc:
        log.error("Exception when calling read_namespaced_job_status: %s\n",
                  exc)
        return False

    status = response.status
    # `succeeded` is None until the first pod succeeds.
    if not status.succeeded:
        log.info("%s has not succeeded yet", job_name)
        return False

    # `conditions` is None while the Job has not reported any condition.
    for condition in status.conditions or []:
        if condition.type == "Complete" and condition.status == "True":
            log.info("%s is complete", job_name)
            return True

    log.info("%s is NOT complete", job_name)
    return False
85
86
def wait_for_statefulset_complete(statefulset_name):
    """
    Tell whether a StatefulSet has rolled out completely.

    The StatefulSet counts as ready when its current and ready replica
    counts both equal the requested count and the status reflects the
    latest generation of the object.

    Args:
        statefulset_name (str): the name of the StatefulSet.

    Returns:
        True if StatefulSet is running, false otherwise
    """
    try:
        statefulset = api.read_namespaced_stateful_set(statefulset_name,
                                                       namespace)
    except ApiException as exc:
        log.error("Exception when waiting for Statefulset status: %s\n", exc)
        return False

    observed = statefulset.status
    wanted = statefulset.spec.replicas
    all_created = observed.replicas == wanted
    all_ready = observed.ready_replicas == wanted
    up_to_date = observed.observed_generation == statefulset.metadata.generation

    if all_created and all_ready and up_to_date:
        log.info("Statefulset %s is ready", statefulset_name)
        return True

    log.info("Statefulset %s is NOT ready", statefulset_name)
    return False
112
113
def wait_for_deployment_complete(deployment_name):
    """
    Tell whether a Deployment has rolled out completely.

    The Deployment counts as ready when it reports no unavailable replica,
    its updated replica count is either unset or equal to the requested
    count, its current and ready replica counts equal the requested count,
    and the status reflects the latest generation of the object.

    Args:
        deployment_name (str): the name of the Deployment.

    Returns:
        True if Deployment is running, false otherwise
    """
    try:
        deployment = api.read_namespaced_deployment(deployment_name, namespace)
    except ApiException as exc:
        log.error("Exception when waiting for deployment status: %s\n", exc)
        return False

    observed = deployment.status
    wanted = deployment.spec.replicas
    none_unavailable = observed.unavailable_replicas is None
    all_updated = (observed.updated_replicas is None
                   or observed.updated_replicas == wanted)
    all_created = observed.replicas == wanted
    all_ready = observed.ready_replicas == wanted
    up_to_date = observed.observed_generation == deployment.metadata.generation

    if (none_unavailable and all_updated and all_created and all_ready
            and up_to_date):
        log.info("Deployment %s is ready", deployment_name)
        return True

    log.info("Deployment %s is NOT ready", deployment_name)
    return False
141
142
def wait_for_daemonset_complete(daemonset_name):
    """
    Tell whether a DaemonSet is ready on every node it should run on.

    The DaemonSet counts as ready when the number of ready pods equals the
    desired number of scheduled pods.

    Args:
        daemonset_name (str): the name of the DaemonSet.

    Returns:
        True if DaemonSet is running, false otherwise
    """
    try:
        daemonset = api.read_namespaced_daemon_set(daemonset_name, namespace)
    except ApiException as exc:
        log.error("Exception when waiting for DaemonSet status: %s\n", exc)
        return False

    ready_count = daemonset.status.number_ready
    desired_count = daemonset.status.desired_number_scheduled

    if desired_count != ready_count:
        log.info("DaemonSet: %s/%s nodes ready --> %s is NOT ready",
                 ready_count, desired_count, daemonset_name)
        return False

    log.info("DaemonSet: %s/%s nodes ready --> %s is ready",
             ready_count, desired_count, daemonset_name)
    return True
170
171
def _is_owner_ready(pod):
    """
    Check the readiness of the controller owning a pod.

    Only the first owner reference is considered. Supported owner kinds are
    StatefulSet, ReplicaSet (resolved to its Deployment), Job and DaemonSet.

    Args:
        pod: a pod object as returned by list_namespaced_pod.

    Returns:
        True if the owner is ready (or complete for a Job), false otherwise,
        including when the pod has no owner or an unsupported owner kind
    """
    owners = pod.metadata.owner_references
    # Pods created directly (no controller) carry no owner reference.
    if not owners:
        log.warning("Pod %s has no owner reference, cannot check readiness",
                    pod.metadata.name)
        return False

    owner = owners[0]
    if owner.kind == "StatefulSet":
        return wait_for_statefulset_complete(owner.name)
    if owner.kind == "ReplicaSet":
        return wait_for_deployment_complete(get_deployment_name(owner.name))
    if owner.kind == "Job":
        return is_job_complete(owner.name)
    if owner.kind == "DaemonSet":
        return wait_for_daemonset_complete(owner.name)
    return False


def _is_first_matching_pod_ready(matches):
    """
    Check the owner of the first pod of the namespace accepted by `matches`.

    Args:
        matches (callable): predicate taking a pod and returning a truthy
            value when the pod is the one being looked for.

    Returns:
        True if a pod matched and its owner is ready, false otherwise
        (no match, owner not ready, or Kubernetes API error)
    """
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for pod in response.items:
            if matches(pod):
                # The first matching pod decides the outcome.
                return _is_owner_ready(pod)
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return False


def is_ready(container_name):
    """
    Check if a container is ready.

    For a container owned by a Job, it means the Job is complete.
    Otherwise, it means the parent (Deployment, StatefulSet, DaemonSet) is
    running with the right number of replicas

    Args:
        container_name (str): the name of the container.

    Returns:
        True if container is ready, false otherwise
    """
    log.info("Checking if %s is ready", container_name)

    def has_container(pod):
        # container_statuses can be None, which is non-iterable.
        statuses = pod.status.container_statuses or []
        return any(status.name == container_name for status in statuses)

    return _is_first_matching_pod_ready(has_container)


def is_pod_ready(pod_name):
    """
    Check if a pod is ready.

    The pod is selected by name prefix: the first pod whose name starts
    with `pod_name` is used.
    For a pod owned by a Job, it means the Job is complete.
    Otherwise, it means the parent (Deployment, StatefulSet, DaemonSet) is
    running with the right number of replicas

    Args:
        pod_name (str): the name (prefix) of the pod.

    Returns:
        True if pod is ready, false otherwise
    """
    log.info("Checking if %s is ready", pod_name)
    return _is_first_matching_pod_ready(
        lambda pod: pod.metadata.name.startswith(pod_name))


def is_app_ready(app_name):
    """
    Check if a pod with app-label is ready.

    For a pod owned by a Job, it means the Job is complete.
    Otherwise, it means the parent (Deployment, StatefulSet, DaemonSet) is
    running with the right number of replicas

    Args:
        app_name (str): the app label of the pod.

    Returns:
        True if pod is ready, false otherwise
    """
    log.info("Checking if pod with app-label %s is ready", app_name)

    def has_app_label(pod):
        # labels is None for pods without any label.
        return (pod.metadata.labels or {}).get('app') == app_name

    return _is_first_matching_pod_ready(has_app_label)
286
def service_mesh_job_check(container_name):
    """
    Check if a Job's primary container is complete.

    Used for ensuring the sidecar can be killed after Job completion: only
    pods still in the "Running" phase are inspected, and the container is
    considered complete as soon as it reports a terminated state.

    Args:
        container_name (str): the name of the Job's primary container.

    Returns:
        True if job's container is in the terminated state, false otherwise
    """
    complete = False
    log.info("Checking if %s is complete", container_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            # container_statuses can be None, which is non-iterable.
            if item.status.container_statuses is None:
                continue
            if item.status.phase != "Running":
                continue
            for container in item.status.container_statuses:
                if container.name != container_name:
                    continue
                # state may be missing while the container is being set up.
                terminated = (container.state.terminated
                              if container.state else None)
                log.info("Container Details %s ", container)
                log.info("Container Status %s ", terminated)
                if terminated:
                    log.info("Container Terminated with reason %s ",
                             terminated.reason)
                    complete = True
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return complete
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +0200318
def read_name(item):
    """
    Give the name of the first owner of a Kubernetes object.

    Args:
        item: an object exposing `metadata.owner_references`.

    Returns:
        the name of the first entry of the owner references
    """
    first_owner = item.metadata.owner_references[0]
    return first_owner.name
330
331
def get_deployment_name(replicaset):
    """
    Resolve the Deployment that owns a ReplicaSet.

    Args:
        replicaset (str): the name of the ReplicaSet.

    Returns:
        the name of the first owner of the ReplicaSet
    """
    replicaset_status = api.read_namespaced_replica_set_status(replicaset,
                                                               namespace)
    return read_name(replicaset_status)
Andreas Geissler0adc4b02023-08-28 13:43:28 +0200346
def check_socket(host, port, timeout=5.0):
    """
    Check whether a TCP port accepts connections.

    Args:
        host (str): IPv4 address or host name to connect to.
        port (int): the TCP port to probe.
        timeout (float): seconds to wait for the connection attempt, so an
            unresponsive peer cannot stall the caller; None waits for
            as long as the operating system does.

    Returns:
        True if the connection succeeded, false otherwise
    """
    with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
        sock.settimeout(timeout)
        # connect_ex reports a failed connection as a non-zero error code
        # instead of raising.
        if sock.connect_ex((host, port)) == 0:
            print("Port is open")
            return True
        print("Port is not open")
        return False
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +0200355
def quitquitquit_post(apiurl):
    """
    Ask the sidecar to shut down by POSTing to its quit endpoint.

    The local port 15020 is probed first; when nothing listens there, no
    sidecar is assumed to exist and the call is treated as a success.

    Args:
        apiurl (str): the URL of the sidecar quit endpoint.

    Returns:
        True if no sidecar is listening or the POST returned a success
        status, false if the POST returned an error status or failed
    """
    request_timeout = 10  # seconds; keeps a hung sidecar from blocking us

    if check_socket("127.0.0.1", 15020) is False:
        log.info("no sidecar exists, exiting")
        return True

    try:
        response = requests.post(url=apiurl, timeout=request_timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported as a failed kill
        # rather than crashing the script.
        log.info("quitquitquit call failed with exception: %s", exc)
        return False

    if response.ok:
        log.info("quitquitquit returned True")
        return True
    log.info("quitquitquit returned False")
    return False
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +0200372
# Default time, in minutes, to wait for each checked item.
DEF_TIMEOUT = 10
# Default sidecar endpoint called once a service-mesh Job container is done.
DEF_URL = "http://127.0.0.1:15020/quitquitquit"
DESCRIPTION = "Kubernetes container readiness check utility"
# Help text; lists every option accepted by main().
USAGE = "Usage: ready.py [-t <timeout>] [-u <url>] -c <container_name> .. " \
        "| -j <job_name> .. | -p <pod_name> .. | -a <app_name> .. " \
        "| -s <service_mesh_job_container_name> .. \n" \
        "where\n" \
        "<timeout> - wait for container readiness timeout in min, " \
        "default is " + str(DEF_TIMEOUT) + "\n" \
        "<url> - sidecar quit endpoint called after a -s check succeeds, " \
        "default is " + DEF_URL + "\n" \
        "<container_name> - name of the container to wait for\n" \
        "<pod_name> - name of the pod to wait for\n" \
        "<app_name> - app label of the pod to wait for\n" \
        "<job_name> - name of the job to wait for\n" \
        "<service_mesh_job_container_name> - name of the job's main " \
        "container to wait for before stopping the sidecar\n"
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +0200385
386
def _wait_until_ready(check, name, timeout_minutes):
    """
    Poll `check(name)` until it returns True or the timeout elapses.

    Exits the process with status 1 when the deadline is passed.

    Args:
        check (callable): readiness function taking the name to check.
        name (str): the name handed to `check`.
        timeout_minutes (float): how long to keep polling, in minutes.
    """
    # The deadline is computed per item so that every item gets the full
    # timeout.
    deadline = time.time() + timeout_minutes * 60
    while True:
        if check(name) is True:
            return
        if time.time() > deadline:
            log.warning("timed out waiting for '%s' to be ready", name)
            sys.exit(1)
        # spread in time potentially parallel execution in multiple
        # containers
        time.sleep(random.randint(5, 11))


def main(argv):
    """
    Checks if a container or pod is ready,
    if a job is finished or if the main container of a job has completed.
    The check is done according to the name of the container or pod,
    not the name of its parent (Job, Deployment, StatefulSet, DaemonSet).

    Items are waited for one after the other, each with the full timeout.
    The process exits with status 2 on invalid or missing parameters and
    with status 1 when an item is not ready in time.

    Args:
        argv: the command line
    """
    container_names = []
    pod_names = []
    app_names = []
    job_names = []
    service_mesh_job_container_names = []
    timeout = DEF_TIMEOUT
    url = DEF_URL
    try:
        # Every long option except "help" takes a value, hence the
        # trailing "=".
        opts, _args = getopt.getopt(argv, "hj:c:p:a:t:s:u:",
                                    ["container-name=",
                                     "pod-name=",
                                     "app-name=",
                                     "timeout=",
                                     "service-mesh-check=",
                                     "url=",
                                     "job-name=",
                                     "help"])
        for opt, arg in opts:
            if opt in ("-h", "--help"):
                print("{}\n\n{}".format(DESCRIPTION, USAGE))
                sys.exit()
            elif opt in ("-c", "--container-name"):
                container_names.append(arg)
            elif opt in ("-p", "--pod-name"):
                pod_names.append(arg)
            elif opt in ("-a", "--app-name"):
                app_names.append(arg)
            elif opt in ("-j", "--job-name"):
                job_names.append(arg)
            elif opt in ("-s", "--service-mesh-check"):
                service_mesh_job_container_names.append(arg)
            elif opt in ("-u", "--url"):
                url = arg
            elif opt in ("-t", "--timeout"):
                timeout = float(arg)
    except (getopt.GetoptError, ValueError) as exc:
        print("Error parsing input parameters: {}\n".format(exc))
        print(USAGE)
        sys.exit(2)

    if not any((container_names, pod_names, app_names, job_names,
                service_mesh_job_container_names)):
        print("Missing required input parameter(s)\n")
        print(USAGE)
        sys.exit(2)

    checks = (
        (is_ready, container_names),
        (is_pod_ready, pod_names),
        (is_app_ready, app_names),
        (is_job_complete, job_names),
    )
    for check, names in checks:
        for name in names:
            _wait_until_ready(check, name, timeout)

    for name in service_mesh_job_container_names:
        _wait_until_ready(service_mesh_job_check, name, timeout)
        # The job's main container is done: the sidecar can be stopped.
        if quitquitquit_post(url) is True:
            log.info("Side Car Killed through QuitQuitQuit API")
        else:
            log.info("Side Car Failed to be Killed through QuitQuitQuit API")
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +0200517
if __name__ == "__main__":
    # Drop the program name: main() only expects the options.
    cli_arguments = sys.argv[1:]
    main(cli_arguments)