#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright © 2020 Orange
# Copyright © 2020 Nokia
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#       http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Kubernetes readiness check.

Checks if a container is ready or if a job is finished.
The check is done according to the name of the container, not the name of
its parent (Job, Deployment, StatefulSet, DaemonSet).
"""
25
26import getopt
27import logging
28import os
29import sys
30import time
31import random
othman touijer87a99b12021-11-24 10:41:24 +010032import requests
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +020033
othman touijer5274bc72021-11-15 11:19:33 +010034from kubernetes import client, config
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +020035from kubernetes.client.rest import ApiException
36
# extract ns from env variable
# NOTE: this is evaluated at import time; a missing NAMESPACE variable
# raises KeyError before anything else in the module runs.
namespace = os.environ['NAMESPACE']

# setup logging
# Records of level INFO and above are written to stdout, prefixed with a
# timestamp and the level name.
log = logging.getLogger(__name__)
handler = logging.StreamHandler(sys.stdout)
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler.setFormatter(formatter)
handler.setLevel(logging.INFO)
log.addHandler(handler)
log.setLevel(logging.INFO)

# Load the in-cluster Kubernetes configuration, then create the API clients
# shared by every check below: core (pods), apps (StatefulSet, Deployment,
# ReplicaSet, DaemonSet) and batch (Job).
config.load_incluster_config()
coreV1Api = client.CoreV1Api()
api = client.AppsV1Api()
batchV1Api = client.BatchV1Api()
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +020053
def is_job_complete(job_name):
    """
    Check if Job is complete.

    A Job is reported complete when it has at least one succeeded pod and
    carries a condition of type "Complete" whose status is "True". All the
    conditions are inspected, because "Complete" is not guaranteed to be the
    first one (e.g. a Job that was suspended and resumed keeps a "Suspended"
    condition in front of it).

    Args:
        job_name (str): the name of the Job.

    Returns:
        True if job is complete, false otherwise (including when the API
        call fails, in which case the error is logged).
    """
    complete = False
    log.info("Checking if %s is complete", job_name)
    try:
        response = batchV1Api.read_namespaced_job_status(job_name, namespace)
        status = response.status
        # succeeded is None until a pod has succeeded, and can be greater
        # than 1 for Jobs requiring several completions.
        if status.succeeded:
            # conditions is None until the Job controller sets one.
            conditions = status.conditions or []
            complete = any(
                condition.type == "Complete" and condition.status == "True"
                for condition in conditions
            )
            if complete:
                log.info("%s is complete", job_name)
            else:
                log.info("%s is NOT complete", job_name)
        else:
            log.info("%s has not succeeded yet", job_name)
    except ApiException as exc:
        log.error("Exception when calling read_namespaced_job_status: %s\n",
                  exc)
    return complete
81
82
def wait_for_statefulset_complete(statefulset_name):
    """
    Tell whether a StatefulSet is fully rolled out.

    The StatefulSet is ready when its current and ready replica counts both
    equal the requested replica count and the controller has observed the
    latest generation of the object.

    Args:
        statefulset_name (str): the name of the StatefulSet.

    Returns:
        True if StatefulSet is running, false otherwise (also when the API
        call fails; the error is logged).
    """
    try:
        sts = api.read_namespaced_stateful_set(statefulset_name, namespace)
    except ApiException as exc:
        log.error("Exception when waiting for Statefulset status: %s\n", exc)
        return False

    observed = sts.status
    if (observed.replicas != sts.spec.replicas or
            observed.ready_replicas != sts.spec.replicas or
            observed.observed_generation != sts.metadata.generation):
        log.info("Statefulset %s is NOT ready", statefulset_name)
        return False

    log.info("Statefulset %s is ready", statefulset_name)
    return True
108
109
def wait_for_deployment_complete(deployment_name):
    """
    Tell whether a Deployment is fully rolled out.

    The Deployment is ready when it reports no unavailable replica, its
    updated replica count (when reported) and its current and ready replica
    counts equal the requested replica count, and the controller has
    observed the latest generation of the object.

    Args:
        deployment_name (str): the name of the Deployment.

    Returns:
        True if Deployment is running, false otherwise (also when the API
        call fails; the error is logged).
    """
    try:
        deployment = api.read_namespaced_deployment(deployment_name,
                                                    namespace)
    except ApiException as exc:
        log.error("Exception when waiting for deployment status: %s\n", exc)
        return False

    observed = deployment.status
    if (observed.unavailable_replicas is not None or
            (observed.updated_replicas is not None and
             observed.updated_replicas != deployment.spec.replicas) or
            observed.replicas != deployment.spec.replicas or
            observed.ready_replicas != deployment.spec.replicas or
            observed.observed_generation != deployment.metadata.generation):
        log.info("Deployment %s is NOT ready", deployment_name)
        return False

    log.info("Deployment %s is ready", deployment_name)
    return True
137
138
def wait_for_daemonset_complete(daemonset_name):
    """
    Tell whether a DaemonSet is ready on every node it is scheduled on.

    The DaemonSet is ready when its number of ready pods equals its desired
    number of scheduled pods.

    Args:
        daemonset_name (str): the name of the DaemonSet.

    Returns:
        True if DaemonSet is running, false otherwise (also when the API
        call fails; the error is logged).
    """
    try:
        daemonset = api.read_namespaced_daemon_set(daemonset_name, namespace)
    except ApiException as exc:
        log.error("Exception when waiting for DaemonSet status: %s\n", exc)
        return False

    wanted = daemonset.status.desired_number_scheduled
    ready_count = daemonset.status.number_ready
    if wanted == ready_count:
        log.info("DaemonSet: %s/%s nodes ready --> %s is ready",
                 ready_count, wanted, daemonset_name)
        return True

    log.info("DaemonSet: %s/%s nodes ready --> %s is NOT ready",
             ready_count, wanted, daemonset_name)
    return False
166
167
def is_ready(container_name):
    """
    Check if a container is ready.

    For a container owned by a Job, it means the Job is complete.
    Otherwise, it means the parent (Deployment, StatefulSet, DaemonSet) is
    running with the right number of replicas.

    The pods of the namespace are scanned in the order returned by the API
    and the verdict is taken from the first pod that runs a container with
    the given name and has an owner. Pods without owner references (bare
    pods) are skipped, since there is no parent whose status can be checked.

    Args:
        container_name (str): the name of the container.

    Returns:
        True if container is ready, false otherwise (also when no matching
        pod is found, when the owner kind is not supported or when an API
        call fails).
    """
    ready = False
    log.info("Checking if %s is ready", container_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            # container_statuses can be None, which is non-iterable.
            if item.status.container_statuses is None:
                continue
            if not any(container.name == container_name
                       for container in item.status.container_statuses):
                continue
            # owner_references is None for pods created without a
            # controller; indexing it would raise TypeError.
            owners = item.metadata.owner_references
            if not owners:
                log.warning("Pod %s runs %s but has no owner, skipping it",
                            item.metadata.name, container_name)
                continue
            owner = owners[0]
            if owner.kind == "StatefulSet":
                ready = wait_for_statefulset_complete(owner.name)
            elif owner.kind == "ReplicaSet":
                # A ReplicaSet is itself owned by the Deployment to check.
                deployment_name = get_deployment_name(owner.name)
                ready = wait_for_deployment_complete(deployment_name)
            elif owner.kind == "Job":
                ready = is_job_complete(owner.name)
            elif owner.kind == "DaemonSet":
                ready = wait_for_daemonset_complete(owner.name)
            else:
                log.warning("Unsupported owner kind %s for container %s",
                            owner.kind, container_name)
            return ready
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return ready
208
def service_mesh_job_check(container_name):
    """
    Check if a Job's primary container is complete.

    Used for ensuring the sidecar can be killed after Job completion: the
    pod is still in the "Running" phase (the sidecar keeps it alive) while
    the primary container has already terminated.

    Args:
        container_name (str): the name of the Job's primary container.

    Returns:
        True if a container with that name, in a pod in the "Running" phase,
        is in the terminated state, false otherwise (also when the API call
        fails; the error is logged).
    """
    complete = False
    log.info("Checking if %s is complete", container_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            # container_statuses can be None, which is non-iterable.
            if item.status.container_statuses is None:
                continue
            for container in item.status.container_statuses:
                if (container.name != container_name or
                        item.status.phase != "Running"):
                    continue
                # state may be unset on a freshly created container status.
                state = container.state
                terminated = state.terminated if state is not None else None
                log.info("Container Details %s ", container)
                log.info("Container Status %s ", terminated)
                if terminated:
                    log.info("Container Terminated with reason %s ",
                             terminated.reason)
                    complete = True
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return complete
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +0200240
def read_name(item):
    """
    Give the name of the first owner of a Kubernetes object.

    Args:
        item: an object exposing ``metadata.owner_references`` (a Pod or a
            ReplicaSet in this module).

    Returns:
        the name of the first entry of the object's owner references
    """
    first_owner = item.metadata.owner_references[0]
    return first_owner.name
252
253
def get_deployment_name(replicaset):
    """
    Look up the Deployment that owns a ReplicaSet.

    Args:
        replicaset (str): the name of the ReplicaSet.

    Returns:
        the name of the first owner of the ReplicaSet, i.e. its Deployment
    """
    return read_name(
        api.read_namespaced_replica_set_status(replicaset, namespace))
268
def quitquitquit_post(apiurl, request_timeout=30):
    """
    Ask the service mesh sidecar to exit by POSTing to its quit endpoint.

    Args:
        apiurl (str): the URL of the sidecar "quitquitquit" endpoint.
        request_timeout (float): seconds to wait for the sidecar to answer
            before giving up, so an unresponsive sidecar cannot block the
            script forever.

    Returns:
        True if the sidecar answered with a success status, false if it
        answered with an error status or if the request itself failed
        (connection refused, timeout, ...).
    """
    # The request is the only statement that can raise, so it is the only
    # one guarded; a failed call is reported as False instead of crashing.
    try:
        response = requests.post(url=apiurl, timeout=request_timeout)
    except requests.exceptions.RequestException as exc:
        log.error("quitquitquit call failed with exception: %s", exc)
        return False
    if response.ok:
        log.info("quitquitquit returned True")
        return True
    log.info("quitquitquit returned False")
    return False
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +0200282
# Default time, in minutes, to wait for each container/job.
DEF_TIMEOUT = 10
# Default sidecar endpoint POSTed to once a service mesh Job has completed.
DEF_URL = "http://127.0.0.1:15020/quitquitquit"
DESCRIPTION = "Kubernetes container readiness check utility"
# Help text; lists every option accepted by main(), including the service
# mesh ones (-s, -u).
USAGE = "Usage: ready.py [-t <timeout>] [-u <url>] -c <container_name> .. " \
        "| -j <job_name> .. | -s <service_mesh_container_name> .. \n" \
        "where\n" \
        "<timeout> - wait for container readiness timeout in min, " \
        "default is " + str(DEF_TIMEOUT) + "\n" \
        "<container_name> - name of the container to wait for\n" \
        "<job_name> - name of the job to wait for\n" \
        "<service_mesh_container_name> - name of the primary container of " \
        "a job to wait for before stopping its sidecar\n" \
        "<url> - sidecar quit endpoint, default is " + DEF_URL + "\n"
Sylvain Desbureaux2faa6e62020-05-11 15:03:44 +0200292
293
def main(argv):
    """
    Checks if a container is ready, if a job is finished or if the main
    container of a job has completed.
    The check is done according to the name of the container, not the name of
    its parent (Job, Deployment, StatefulSet, DaemonSet).

    Every requested item is awaited in turn (containers, then jobs, then
    service mesh job containers) and each one gets the full timeout. Once a
    service mesh job container has completed, the sidecar is asked to quit
    through the configured URL.

    Exits with status 2 on invalid or missing arguments and with status 1
    when an item is not ready before its timeout.

    Args:
        argv: the command line
    """
    # args are a list of container names
    container_names = []
    job_names = []
    service_mesh_job_container_names = []
    timeout = DEF_TIMEOUT
    url = DEF_URL
    try:
        opts, _args = getopt.getopt(argv, "hj:c:t:s:u:", ["container-name=",
                                                          "timeout=",
                                                          "service-mesh-check=",
                                                          "url=",
                                                          "job-name=",
                                                          "help"])
        for opt, arg in opts:
            if opt in ("-h", "--help"):
                print("{}\n\n{}".format(DESCRIPTION, USAGE))
                sys.exit()
            elif opt in ("-c", "--container-name"):
                container_names.append(arg)
            elif opt in ("-j", "--job-name"):
                job_names.append(arg)
            elif opt in ("-s", "--service-mesh-check"):
                service_mesh_job_container_names.append(arg)
            elif opt in ("-u", "--url"):
                url = arg
            elif opt in ("-t", "--timeout"):
                timeout = float(arg)
    except (getopt.GetoptError, ValueError) as exc:
        print("Error parsing input parameters: {}\n".format(exc))
        print(USAGE)
        sys.exit(2)
    if not (container_names or job_names or service_mesh_job_container_names):
        print("Missing required input parameter(s)\n")
        print(USAGE)
        sys.exit(2)

    # `timeout` stays a duration in minutes; the absolute deadline is
    # computed per item inside the helper. Overwriting `timeout` with the
    # deadline would make every item after the first wait almost forever.
    for container_name in container_names:
        _wait_until_ready(is_ready, container_name, timeout)
    for job_name in job_names:
        _wait_until_ready(is_job_complete, job_name, timeout)
    for service_mesh_job_container_name in service_mesh_job_container_names:
        _wait_until_ready(service_mesh_job_check,
                          service_mesh_job_container_name, timeout)
        _kill_sidecar(url)


def _wait_until_ready(check, name, timeout_minutes):
    """Poll check(name) until it returns True; exit(1) after the timeout."""
    deadline = time.time() + timeout_minutes * 60
    while True:
        if check(name) is True:
            return
        if time.time() > deadline:
            log.warning("timed out waiting for '%s' to be ready", name)
            sys.exit(1)
        # spread in time potentially parallel execution in multiple
        # containers
        time.sleep(random.randint(5, 11))


def _kill_sidecar(url):
    """Ask the sidecar to quit through its API and log the outcome."""
    if quitquitquit_post(url) is True:
        log.info("Side Car Killed through QuitQuitQuit API")
    else:
        log.info("Side Car Failed to be Killed through QuitQuitQuit API")


if __name__ == "__main__":
    main(sys.argv[1:])