Sylvain Desbureaux | 2faa6e6 | 2020-05-11 15:03:44 +0200 | [diff] [blame^] | 1 | #!/usr/bin/env python3 |
| 2 | # -*- coding: utf-8 -*- |
| 3 | # Copyright © 2020 Orange |
| 4 | # |
| 5 | # Licensed under the Apache License, Version 2.0 (the "License"); |
| 6 | # you may not use this file except in compliance with the License. |
| 7 | # You may obtain a copy of the License at |
| 8 | # |
| 9 | # http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | # |
| 11 | # Unless required by applicable law or agreed to in writing, software |
| 12 | # distributed under the License is distributed on an "AS IS" BASIS, |
| 13 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 14 | # See the License for the specific language governing permissions and |
| 15 | # limitations under the License. |
| 16 | |
| 17 | """ |
| 18 | Kubernetes readiness check. |
| 19 | |
| 20 | Checks if a container is ready or if a job is finished. |
| 21 | The check is done according to the name of the container, not the name of |
| 22 | its parent (Job, Deployment, StatefulSet, DaemonSet). |
| 23 | """ |
| 24 | |
| 25 | import getopt |
| 26 | import logging |
| 27 | import os |
| 28 | import sys |
| 29 | import time |
| 30 | import random |
| 31 | |
| 32 | from kubernetes import client |
| 33 | from kubernetes.client.rest import ApiException |
| 34 | |
| 35 | # extract env variables. |
# Connection parameters are taken from the process environment; a missing
# variable raises KeyError at import time.
namespace = os.environ['NAMESPACE']
cert = os.environ['CERT']
host = os.environ['KUBERNETES_SERVICE_HOST']
token_path = os.environ['TOKEN']

# The bearer token is read from a file and stripped of every newline.
with open(token_path, 'r') as token_file:
    token = ''.join(token_file.read().split('\n'))

# Logging: INFO and above, timestamped, written to stdout.
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
handler = logging.StreamHandler(sys.stdout)
handler.setLevel(logging.INFO)
handler.setFormatter(formatter)
log = logging.getLogger(__name__)
log.setLevel(logging.INFO)
log.addHandler(handler)

# Kubernetes client configuration: HTTPS to the API host, verified against
# the provided CA certificate, authenticated with the bearer token.
configuration = client.Configuration()
configuration.host = "https://{}".format(host)
configuration.ssl_ca_cert = cert
configuration.api_key_prefix['authorization'] = 'Bearer'
configuration.api_key['authorization'] = token

# One API object per API group used below, each with its own ApiClient.
coreV1Api = client.CoreV1Api(client.ApiClient(configuration))
api_instance = client.ExtensionsV1beta1Api(client.ApiClient(configuration))
api = client.AppsV1beta1Api(client.ApiClient(configuration))
batchV1Api = client.BatchV1Api(client.ApiClient(configuration))
| 62 | |
| 63 | |
def is_job_complete(job_name):
    """
    Check if Job is complete.

    The Job is considered complete when its status reports at least one
    succeeded pod and any of its conditions is of type ``Complete`` with
    status ``"True"``. All conditions are inspected, not only the first
    one, and a Job without any condition is simply reported as not
    complete.

    Args:
        job_name (str): the name of the Job.

    Returns:
        True if job is complete, false otherwise (including when the API
        call fails)
    """
    complete = False
    log.info("Checking if %s is complete", job_name)
    try:
        response = batchV1Api.read_namespaced_job_status(job_name, namespace)
        # `succeeded` is None until a pod has succeeded; accept any positive
        # count so that Jobs needing several completions are handled too.
        if response.status.succeeded:
            # `conditions` may be None; the Complete condition is not
            # guaranteed to be the first entry of the list.
            conditions = response.status.conditions or []
            complete = any(
                condition.type == "Complete" and condition.status == "True"
                for condition in conditions)
            if complete:
                log.info("%s is complete", job_name)
            else:
                log.info("%s is NOT complete", job_name)
        else:
            log.info("%s has not succeeded yet", job_name)
    except ApiException as exc:
        log.error("Exception when calling read_namespaced_job_status: %s\n",
                  exc)
    return complete
| 91 | |
| 92 | |
def wait_for_statefulset_complete(statefulset_name):
    """
    Tell whether a StatefulSet has reached its desired state.

    The StatefulSet is ready when its current and ready replica counts both
    equal the replica count of its spec, and the observed generation equals
    the metadata generation.

    Args:
        statefulset_name (str): the name of the StatefulSet.

    Returns:
        True if the StatefulSet is ready, False otherwise (including when
        the API call fails)
    """
    try:
        sts = api.read_namespaced_stateful_set(statefulset_name, namespace)
        current = sts.status
        wanted = sts.spec.replicas
        replicas_match = current.replicas == wanted
        all_ready = current.ready_replicas == wanted
        up_to_date = current.observed_generation == sts.metadata.generation
        if replicas_match and all_ready and up_to_date:
            log.info("Statefulset %s is ready", statefulset_name)
            return True
        log.info("Statefulset %s is NOT ready", statefulset_name)
    except ApiException as exc:
        log.error("Exception when waiting for Statefulset status: %s\n", exc)
    return False
| 118 | |
| 119 | |
def wait_for_deployment_complete(deployment_name):
    """
    Tell whether a Deployment has reached its desired state.

    The Deployment is ready when it reports no unavailable replica, its
    updated replica count is either unset or equal to the spec, its current
    and ready replica counts equal the spec, and the observed generation
    equals the metadata generation.

    Args:
        deployment_name (str): the name of the Deployment.

    Returns:
        True if the Deployment is ready, False otherwise (including when
        the API call fails)
    """
    try:
        deployment = api.read_namespaced_deployment(deployment_name,
                                                    namespace)
        current = deployment.status
        wanted = deployment.spec.replicas
        none_unavailable = current.unavailable_replicas is None
        rollout_done = (current.updated_replicas is None or
                        current.updated_replicas == wanted)
        counts_match = (current.replicas == wanted and
                        current.ready_replicas == wanted)
        up_to_date = (current.observed_generation ==
                      deployment.metadata.generation)
        if none_unavailable and rollout_done and counts_match and up_to_date:
            log.info("Deployment %s is ready", deployment_name)
            return True
        log.info("Deployment %s is NOT ready", deployment_name)
    except ApiException as exc:
        log.error("Exception when waiting for deployment status: %s\n", exc)
    return False
| 147 | |
| 148 | |
def wait_for_daemonset_complete(daemonset_name):
    """
    Tell whether a DaemonSet is ready.

    The DaemonSet is ready when the number of ready nodes reported in its
    status equals the desired number of scheduled nodes.

    Args:
        daemonset_name (str): the name of the DaemonSet.

    Returns:
        True if the DaemonSet is ready, False otherwise (including when the
        API call fails)
    """
    try:
        daemonset = api_instance.read_namespaced_daemon_set(daemonset_name,
                                                            namespace)
        ready_nodes = daemonset.status.number_ready
        wanted_nodes = daemonset.status.desired_number_scheduled
        if wanted_nodes == ready_nodes:
            log.info("DaemonSet: %s/%s nodes ready --> %s is ready",
                     ready_nodes, wanted_nodes, daemonset_name)
            return True
        log.info("DaemonSet: %s/%s nodes ready --> %s is NOT ready",
                 ready_nodes, wanted_nodes, daemonset_name)
    except ApiException as exc:
        log.error("Exception when waiting for DaemonSet status: %s\n", exc)
    return False
| 176 | |
| 177 | |
def is_ready(container_name):
    """
    Check if a container is ready.

    Pods of the namespace are scanned for a container with the given name.
    The first such pod that has an owner decides the result, based on the
    kind of its first owner:

    - StatefulSet / DaemonSet: the owner must be ready,
    - ReplicaSet: the Deployment owning the ReplicaSet must be ready,
    - Job: the Job must be complete,
    - any other kind: the container is reported as not ready.

    Pods without container statuses or without any owner reference are
    skipped.

    Args:
        container_name (str): the name of the container.

    Returns:
        True if container is ready, false otherwise (including when no
        matching pod is found or an API call fails)
    """
    ready = False
    log.info("Checking if %s is ready", container_name)
    try:
        response = coreV1Api.list_namespaced_pod(namespace=namespace,
                                                 watch=False)
        for item in response.items:
            # container_statuses can be None, which is non-iterable.
            if item.status.container_statuses is None:
                continue
            for container in item.status.container_statuses:
                if container.name != container_name:
                    continue
                owners = item.metadata.owner_references
                if not owners:
                    # A pod created without a controller has no owner
                    # reference (None or empty list): there is no parent
                    # to query, so look at the remaining pods instead.
                    log.info("Pod %s has no owner, skipping it",
                             item.metadata.name)
                    break
                owner = owners[0]
                if owner.kind == "StatefulSet":
                    ready = wait_for_statefulset_complete(owner.name)
                elif owner.kind == "ReplicaSet":
                    deployment_name = get_deployment_name(owner.name)
                    ready = wait_for_deployment_complete(deployment_name)
                elif owner.kind == "Job":
                    ready = is_job_complete(owner.name)
                elif owner.kind == "DaemonSet":
                    ready = wait_for_daemonset_complete(owner.name)
                return ready
    except ApiException as exc:
        log.error("Exception when calling list_namespaced_pod: %s\n", exc)
    return ready
| 218 | |
| 219 | |
def read_name(item):
    """
    Return the name of the first owner of a Kubernetes object.

    Args:
        item: an object exposing ``metadata.owner_references`` as a
            non-empty sequence whose entries have a ``name`` attribute.

    Returns:
        the name of the first owner reference of the item
    """
    owners = item.metadata.owner_references
    first_owner = owners[0]
    return first_owner.name
| 231 | |
| 232 | |
def get_deployment_name(replicaset):
    """
    Return the name of the Deployment owning the ReplicaSet.

    The ReplicaSet status is read from the API and the name of its first
    owner reference is returned.

    Args:
        replicaset (str): the name of the ReplicaSet.

    Returns:
        the name of the first owner of the ReplicaSet
    """
    replica_set = api_instance.read_namespaced_replica_set_status(
        replicaset, namespace)
    return replica_set.metadata.owner_references[0].name
| 247 | |
| 248 | |
# Default readiness timeout, in minutes.
DEF_TIMEOUT = 10
DESCRIPTION = "Kubernetes container readiness check utility"
# Command-line help text; the default timeout is substituted into it.
USAGE = (
    "Usage: ready.py [-t <timeout>] -c <container_name> "
    "[-c <container_name> ...]\n"
    "where\n"
    "<timeout> - wait for container readiness timeout in min, "
    "default is {}\n"
    "<container_name> - name of the container to wait for\n"
).format(DEF_TIMEOUT)
| 257 | |
| 258 | |
def main(argv):
    """
    Checks if a container is ready or if a job is finished.
    The check is done according to the name of the container, not the name of
    its parent (Job, Deployment, StatefulSet, DaemonSet).

    Containers are checked one after the other, in command-line order. Each
    container gets the full timeout, counted from the moment its own wait
    starts. The process exits with status 2 on invalid arguments and with
    status 1 as soon as one container is not ready within the timeout.

    Args:
        argv: the command line (without the program name)
    """
    # args are a list of container names
    container_names = []
    timeout = DEF_TIMEOUT
    try:
        opts, _args = getopt.getopt(argv, "hc:t:", ["container-name=",
                                                    "timeout=",
                                                    "help"])
        for opt, arg in opts:
            if opt in ("-h", "--help"):
                print("{}\n\n{}".format(DESCRIPTION, USAGE))
                sys.exit()
            elif opt in ("-c", "--container-name"):
                container_names.append(arg)
            elif opt in ("-t", "--timeout"):
                timeout = float(arg)
    except (getopt.GetoptError, ValueError) as exc:
        print("Error parsing input parameters: {}\n".format(exc))
        print(USAGE)
        sys.exit(2)
    if not container_names:
        print("Missing required input parameter(s)\n")
        print(USAGE)
        sys.exit(2)

    for container_name in container_names:
        # Keep the configured duration (minutes) untouched and derive a
        # fresh absolute deadline for every container; reusing one variable
        # for both would push the deadline of later containers far away.
        deadline = time.time() + timeout * 60
        while not is_ready(container_name):
            if time.time() > deadline:
                log.warning("timed out waiting for '%s' to be ready",
                            container_name)
                sys.exit(1)
            # spread in time potentially parallel execution in multiple
            # containers
            time.sleep(random.randint(5, 11))


if __name__ == "__main__":
    main(sys.argv[1:])