blob: 0bfe013f4ba5e9579653c20b653bccf2e4033a81 [file] [log] [blame]
#!/bin/sh

PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

#
# globals and defaults
#

# These are filled in from the command line arguments further below;
# only HELM_TIMEOUT and RELEASE_PREFIX carry a non-empty default.
NAMESPACE=
OVERRIDES=
HELM_CHART_RELEASE_NAME=
HELM_DELETE_ALL=
HELM_SKIP_DEPLOY=
VOLUME_STORAGE=
HELM_TIMEOUT=3600
RELEASE_PREFIX=onap

#
# control variables
#

# script name without its directory part (used in help and hints)
CMD=${0##*/}
# terminal escape sequences; they are interpreted by printf in msg/error
COLOR_ON_RED='\033[0;31;1m'
COLOR_ON_GREEN='\033[0;32;1m'
COLOR_OFF='\033[0m'
readonly CMD COLOR_ON_RED COLOR_ON_GREEN COLOR_OFF
26
27
28#
29# functions
30#
31
# Print the complete usage text (description, synopsis, examples and notes)
# to stdout. ${CMD} is expanded inside the here-document.
help()
{
cat <<EOF
${CMD} - simple tool for fixing onap helm deployment

DESCRIPTION
    This script does nothing smart or special it just tries to
    redeploy onap component. It can fix only problems related to
    race conditions or timeouts. Nothing else. It will not fix
    broken ONAP - there is no such ambition - that effort should
    be directed in the upstream.

USAGE
    ${CMD} -h|--help
        This help

    ${CMD} -n|--namespace <namespace>
        (-f|--file <override>)...
        (-s|--storage <directory>)|--no-storage-deletion
        [-p|--prefix <release prefix>]
        [-t|--timeout <secs>]
        [(-c|--component <component release name>)...|
        (-D|--delete-all)]
        [-C|--clean-only]

EXAMPLES

    Usage 1: (simple heuristics - redeploy failed components):
        ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs

    Usage 2: (redeploy ONLY explicitly listed components):
        ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \\
            -c onap-aaf -c onap-sdc -c onap-portal

    Usage 3: (delete EVERYTHING and redeploy):
        ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs --delete-all

    Usage 4: (delete EVERYTHING and DO NOT redeploy - clean env.)
        ${CMD} -n onap -s /dockerdata-nfs --delete-all --clean-only

NOTES

    Namespace argument (always) and at least one override file (if you don't
    use '--clean-only') are mandatory for this script to execute. Also you must
    provide path to the storage ('--storage') OR explicitly request to not
    delete file storage of the component ('--no-storage-deletion').

    The storage should be a directory where persistent volume resides. It will
    work only if the component created the persistent volume with the same
    filename as its release name. Otherwise no files are deleted. The exception
    is when '--delete-all' is used - in that case all content of the storage is
    deleted (because ONAP is not consistent with the volume directory names
    - e.g.: sdnc).

    '--file' can be used multiple times and it is used for override files
    which are passed on to helm. The order is significant because if two
    override files modify one value the latest one is used. This option is
    ignored if '--clean-only' is used.

    CAUTION 1: filename of an override file cannot contain whitespace! This is
    actually helm/onap deploy plugin issue which does not handle such files. So
    I dropped the more complicated version of this script when there is no
    reason to support something on what will helm deploy choke anyway.

    '--prefix' option is helm release argument - it is actually prefix when you
    list the helm releases - helm is little confusing here.

    CAUTION 2: By default release prefix is 'onap' - if you deployed release
    'onap' and now run this script with different prefix then it will skip all
    'onap-*' components and will deploy a new release with new prefix - BEWARE
    TO USE PROPER RELEASE PREFIX!

    Timeout sets the waiting time for helm deploy per component.

    '--component' references to the release name of the chart which you want to
    redeploy explicitly - otherwise 'ALL FAILED' components will be
    redeployed. You can target more than one component at once - just use the
    argument multiple times.

    Component option is mutually exclusive with the '--delete-all' which will
    delete all components - healthy or not. Actually it will delete the whole
    NAMESPACE and everything in it. Also to be sure it will cleanup all
    orphaned images and volumes on all kubernetes nodes.

    '--clean-only' can be used with any usage: heuristics, explicit component
    list or with '--delete-all'. It basically just skips the last step - the
    actual redeploy.
EOF
}
121
# Print a one-line hint that points the user at the '--help' option.
use_help()
{
    # the script name is passed as data, never as part of the format string
    printf 'Try help: %s --help\n' "${CMD}"
}
126
# Print an informational message to stdout, wrapped in the green color codes.
# arg: <message word>... (all arguments are joined with the first character
#      of IFS - a space by default - into one message)
msg()
{
    # The color variables stay in the format string so that printf interprets
    # their '\033' escapes. The message itself is passed as '%s' data, so
    # several words or a '%' in it cannot truncate or corrupt the output.
    printf "${COLOR_ON_GREEN}INFO: %s ${COLOR_OFF}\n" "$*"
}
131
# Print an error message to stdout, wrapped in the red color codes.
# arg: <message word>... (all arguments are joined with the first character
#      of IFS - a space by default - into one message)
error()
{
    # The color variables stay in the format string so that printf interprets
    # their '\033' escapes. The message itself is passed as '%s' data, so
    # several words or a '%' in it cannot truncate or corrupt the output.
    printf "${COLOR_ON_RED}ERROR: %s ${COLOR_OFF}\n" "$*"
}
136
# remove all successfully completed jobs
#
# Lists the jobs in ${NAMESPACE} and calls delete_job for every job whose
# second column has the form "<done>/<desired>" with both numbers equal.
clean_jobs()
{
    kubectl get jobs -n "${NAMESPACE}" \
        --ignore-not-found=true \
        --no-headers=true | \
    while read -r _job _completion _duration _age ; do
        case "$_completion" in
            */*)
                _done=${_completion%%/*}
                _desired=${_completion#*/}
                ;;
            *)
                # not a "<done>/<desired>" pair - do not guess, skip the line
                continue
                ;;
        esac

        # a non-numeric value makes the test fail, which leaves the job alone
        if [ "$_desired" -eq "$_done" ] 2>/dev/null ; then
            delete_job "$_job"
        fi
    done
}
151
# Print what get_labels reports when it is restricted by the field
# selector 'status.phase==Failed'.
get_failed_labels()
{
    get_labels "status.phase==Failed"
}
156
# arg: [optional: selector]
#
# Print the sorted, de-duplicated values of the 'release' label of the pods
# in ${NAMESPACE}. When a selector is given it is handed to kubectl as
# '--field-selector=<selector>'.
get_labels()
{
    # ${1:+...} expands to exactly one word when a selector was given and to
    # nothing at all otherwise (so no empty argument reaches kubectl)
    kubectl get pods -n "${NAMESPACE}" \
        --show-labels=true \
        ${1:+"--field-selector=${1}"} \
        --ignore-not-found=true \
        --no-headers=true | \
    while read -r _pod _ready _status _restart _age _labels ; do
        # a line without the sixth (labels) column stops the parsing
        [ -z "$_labels" ] && break
        printf '%s\n' "$_labels"
    done | tr ', ' '\n\n' | sed -n 's/^release=//p' | sort -u
}
183
# arg: <release name>
#
# Announce and run 'helm undeploy <release name> --purge'.
helm_undeploy()
{
    msg "Undeploy helm release name: ${1}"
    helm undeploy "${1}" --purge
}
190
# arg: <job name>
#
# Delete the job from ${NAMESPACE}, then poll once per second until
# 'kubectl get pods' with the selector 'job-name=<job name>' prints nothing.
delete_job()
{
    kubectl delete job -n "${NAMESPACE}" \
        --cascade=true \
        --now=true \
        --wait=true \
        "${1}"

    # wait for job to be deleted
    _output=start
    while [ -n "$_output" ] && sleep 1 ; do
        _output=$(kubectl get pods -n "${NAMESPACE}" \
            --ignore-not-found=true \
            --no-headers=true \
            --selector="job-name=${1}")
    done
}
209
# arg: <resource> <release name>
#
# Delete every object of the given resource kind in ${NAMESPACE} that either
# carries the label 'release=<release name>' or whose name starts with
# '<release name>-'. After each deletion it polls once per second until
# kubectl no longer lists the object.
delete_resource()
{
    _resource="$1"
    _release="$2"

    msg "Delete ${_resource} for ${_release}..."
    {
        kubectl get "${_resource}" -n "${NAMESPACE}" \
            --ignore-not-found=true \
            --selector="release=${_release}" \
            --no-headers=true

        # this is due to missing "release" label in some pods
        # grep for the rescue...
        kubectl get "${_resource}" -n "${NAMESPACE}" \
            --no-headers=true | grep "^${_release}[-]"
    } | awk '{print $1}' | sort -u | while read -r _name _rest ; do
        # a blank line would otherwise become a delete without a name
        [ -n "$_name" ] || continue

        echo "Deleting '${_name}'"
        kubectl delete "${_resource}" -n "${NAMESPACE}" \
            --cascade=true \
            --now=true \
            --wait=true \
            "${_name}" \
            2>&1 | grep -iv 'not[[:space:]]*found'

        # wait for resource to be deleted
        _output=start
        while [ -n "$_output" ] && sleep 1 ; do
            _output=$(kubectl get "${_resource}" -n "${NAMESPACE}" \
                --ignore-not-found=true \
                --no-headers=true \
                --field-selector="metadata.name=${_name}")
        done
    done
}
246
# Delete the namespace ${NAMESPACE}, then poll once per second until
# 'kubectl get all' for that namespace prints nothing.
delete_namespace()
{
    msg "Delete the whole namespace: ${NAMESPACE}"
    kubectl delete namespace \
        --cascade=true \
        --now=true \
        --wait=true \
        "$NAMESPACE"

    # wait for namespace to be deleted
    _output=start
    while [ -n "$_output" ] && sleep 1 ; do
        _output=$(kubectl get all -n "${NAMESPACE}" \
            --ignore-not-found=true \
            --no-headers=true)
    done
}
264
# arg: [optional: subdir]
#
# Remove persistent volume data over ssh on the first node that kubectl
# lists for the selector 'node-role.kubernetes.io/worker' (the address is
# taken from the sixth column of 'kubectl get nodes -o wide').
#   with <subdir>:    removes ${VOLUME_STORAGE}/<subdir>
#   without argument: removes every entry directly under ${VOLUME_STORAGE}
# Returns 1 without touching anything when ${VOLUME_STORAGE} is empty or
# no node could be determined.
delete_storage()
{
    # an empty storage path would turn the commands below into
    # 'rm -rf "/<subdir>"' on the remote node - refuse to continue
    if [ -z "$VOLUME_STORAGE" ] ; then
        error "Storage directory is not set - SKIPPING DELETION"
        return 1
    fi

    _node=$(kubectl get nodes \
        --selector=node-role.kubernetes.io/worker \
        -o wide \
        --no-headers=true | \
        awk '{print $6}' | head -n 1)

    if [ -z "$_node" ] ; then
        error "Could not list kubernetes nodes - SKIPPING DELETION"
        return 1
    fi

    # NOTE: the here-documents are expanded locally; the resulting text is
    # what the remote shell executes
    if [ -n "$1" ] ; then
        msg "Delete directory '${VOLUME_STORAGE}/${1}' on $_node"
        ssh -T "$_node" <<EOF
rm -rf -- "${VOLUME_STORAGE}/${1}"
EOF
    else
        msg "Delete directories '${VOLUME_STORAGE}/*' on $_node"
        ssh -T "$_node" <<EOF
find "${VOLUME_STORAGE}" -maxdepth 1 -mindepth 1 -exec rm -rf '{}' \;
EOF
    fi
}
290
# Run 'docker system prune --force --all --volumes' over ssh on every node
# that kubectl lists for the selector 'node-role.kubernetes.io/worker'
# (addresses come from the sixth column of 'kubectl get nodes -o wide').
# The nodes are processed in parallel; the function returns after all of
# the background ssh sessions have finished.
docker_cleanup()
{
    _nodes=$(kubectl get nodes \
        --selector=node-role.kubernetes.io/worker \
        -o wide \
        --no-headers=true | \
        awk '{print $6}')

    if [ -z "$_nodes" ] ; then
        error "Could not list kubernetes nodes - SKIPPING docker cleanup"
        return
    fi

    # intentionally unquoted: one address per whitespace separated word
    for _node in $_nodes ; do
        msg "Docker cleanup on $_node"
        {
            # 'command -v' is the portable way to test for docker remotely
            ssh -T "$_node" >/dev/null <<EOF
if command -v docker >/dev/null ; then
    docker system prune --force --all --volumes
fi
EOF
        } &
    done

    msg "We are waiting now for docker cleanup to finish on all nodes..."
    wait
}
318
# arg: <release name>
#
# Undeploy one component: run helm_undeploy for the release, delete its
# leftover kubernetes resources kind by kind and - when ${VOLUME_STORAGE}
# is set - delete its storage directory.
undeploy_component()
{
    # release name without its leading '<prefix>-' part; only the disabled
    # code in deploy_component refers to it
    _chart=${1#*-}
    helm_undeploy "${1}"

    # for all kubernetes resources: kubectl api-resources
    # TODO: does deleted secret per component break something?
    for x in jobs \
        deployments \
        services \
        replicasets \
        statefulsets \
        daemonsets \
        pods \
        pvc \
        pv \
        ;
    do
        delete_resource "${x}" "${1}"
    done

    if [ -n "$VOLUME_STORAGE" ] ; then
        msg "Persistent volume data deletion in directory: ${VOLUME_STORAGE}/${1}"
        delete_storage "$1"
    fi
}
Petr Ospalýc6115ad2019-08-05 17:38:04 +0200346
# arg: <release name>
#
# Placeholder for a per-component redeploy: at present it only reports
# that the feature is not implemented.
deploy_component()
{
    # TODO: until I can verify that this does the same for this component as helm deploy
    #msg "Redeployment of the component ${1}..."
    #helm install "local/${_chart}" --name ${1} --namespace ${NAMESPACE} --wait --timeout ${HELM_TIMEOUT}
    error 'NOT IMPLEMENTED'
}
355
356
#
# arguments
#

# arg: <all command line arguments of the script>
#
# Parse the command line into the arg_* variables with a small state
# machine: 'state' is 'nil' while an option is expected and names the
# option whose value is expected otherwise. Every call starts from empty
# arg_* variables. On a usage error it prints a message plus the help hint
# and exits with status 1; '-h|--help' prints the help and exits with 0.
parse_args()
{
    state=nil
    arg_namespace=
    arg_overrides=
    arg_timeout=
    arg_storage=
    arg_nostorage=
    arg_components=
    arg_prefix=
    arg_deleteall=
    arg_cleanonly=
    _last_option=

    # loop on the argument count: an empty string is still an argument and
    # must not silently end the parsing
    while [ "$#" -gt 0 ] ; do
        case "$state" in
            nil)
                _last_option="$1"
                case "$1" in
                    -h|--help)
                        help
                        exit 0
                        ;;
                    -n|--namespace)
                        state=namespace
                        ;;
                    -f|--file)
                        state=override
                        ;;
                    -t|--timeout)
                        state=timeout
                        ;;
                    -s|--storage)
                        state=storage
                        ;;
                    --no-storage-deletion)
                        if [ -n "$arg_storage" ] ; then
                            error "Usage of storage argument together with no storage deletion option!"
                            use_help
                            exit 1
                        elif [ -z "$arg_nostorage" ] ; then
                            arg_nostorage=nostorage
                        else
                            error "Duplicit argument for no storage option! (IGNORING)"
                        fi
                        ;;
                    -c|--component)
                        if [ -n "$arg_deleteall" ] ; then
                            error "'Delete all components' used already - argument mismatch"
                            use_help
                            exit 1
                        fi
                        state=component
                        ;;
                    -D|--delete-all)
                        if [ -n "$arg_components" ] ; then
                            error "Explicit component(s) provided already - argument mismatch"
                            use_help
                            exit 1
                        elif [ -z "$arg_deleteall" ] ; then
                            arg_deleteall=deleteall
                        else
                            error "Duplicit argument for 'delete all' option! (IGNORING)"
                        fi
                        ;;
                    # '--release-prefix' is the spelling shown in the USAGE
                    # synopsis of the help text - accept it as an alias
                    -p|--prefix|--release-prefix)
                        state=prefix
                        ;;
                    -C|--clean-only)
                        if [ -z "$arg_cleanonly" ] ; then
                            arg_cleanonly=cleanonly
                        else
                            error "Duplicit argument for 'clean only' option! (IGNORING)"
                        fi
                        ;;
                    *)
                        error "Unknown parameter: $1"
                        use_help
                        exit 1
                        ;;
                esac
                ;;
            namespace)
                if [ -z "$arg_namespace" ] ; then
                    arg_namespace="$1"
                    state=nil
                else
                    error "Duplicit argument for namespace!"
                    use_help
                    exit 1
                fi
                ;;
            override)
                if ! [ -f "$1" ] ; then
                    error "Wrong filename for override file: $1"
                    use_help
                    exit 1
                fi
                # plain string on purpose: override filenames must not
                # contain whitespace (see CAUTION 1 in the help text)
                arg_overrides="${arg_overrides} -f $1"
                state=nil
                ;;
            component)
                arg_components="${arg_components} $1"
                state=nil
                ;;
            prefix)
                if [ -z "$arg_prefix" ] ; then
                    arg_prefix="$1"
                    state=nil
                else
                    error "Duplicit argument for release prefix!"
                    use_help
                    exit 1
                fi
                ;;
            timeout)
                if [ -z "$arg_timeout" ] ; then
                    # digits only; a portable replacement for the GNU-only
                    # '\+' in a basic regular expression
                    case "$1" in
                        ''|*[!0-9]*)
                            error "Timeout must be an integer: $1"
                            use_help
                            exit 1
                            ;;
                    esac
                    arg_timeout="$1"
                    state=nil
                else
                    error "Duplicit argument for timeout!"
                    use_help
                    exit 1
                fi
                ;;
            storage)
                if [ -n "$arg_nostorage" ] ; then
                    error "Usage of storage argument together with no storage deletion option!"
                    use_help
                    exit 1
                elif [ -z "$arg_storage" ] ; then
                    arg_storage="$1"
                    state=nil
                else
                    error "Duplicit argument for storage!"
                    use_help
                    exit 1
                fi
                ;;
        esac
        shift
    done

    # the last option was still waiting for its value
    if [ "$state" != nil ] ; then
        error "Missing value for option: ${_last_option}"
        use_help
        exit 1
    fi
}

parse_args "$@"
503
# sanity checks
#
# Validate the parsed arg_* values and copy them into the globals used by
# the main part. Each mandatory value is checked with a guard that exits
# with status 1; optional values only replace a default when given.

if [ -z "$arg_namespace" ] ; then
    error "Missing namespace"
    use_help
    exit 1
fi
NAMESPACE="$arg_namespace"

# override files are mandatory unless only a clean-up was requested
if [ -z "${arg_overrides}${arg_cleanonly}" ] ; then
    error "Missing override file(s) or use '--clean-only'"
    use_help
    exit 1
fi
OVERRIDES="$arg_overrides"

[ -z "$arg_prefix" ] || RELEASE_PREFIX="$arg_prefix"

[ -z "$arg_timeout" ] || HELM_TIMEOUT="$arg_timeout"

# either a storage path or the explicit opt-out must have been given
if [ -z "${arg_storage}${arg_nostorage}" ] ; then
    error "Missing storage argument! If it is intended then use '--no-storage-deletion' option"
    use_help
    exit 1
fi
[ -z "$arg_storage" ] || VOLUME_STORAGE="$arg_storage"

[ -z "$arg_components" ] || HELM_CHART_RELEASE_NAME="$arg_components"

[ -z "$arg_deleteall" ] || HELM_DELETE_ALL=yes

[ -z "$arg_cleanonly" ] || HELM_SKIP_DEPLOY=yes
549
550
#
# main
#
# 1. clean up: either everything ('--delete-all') or the explicitly listed
#    components or, by default, the components that get_failed_labels reports
# 2. redeploy with 'helm deploy' unless '--clean-only' was given
# The script exits with the status of 'helm deploy' when that step fails.

# if --delete-all is used then redeploy all components (the current namespace is deleted)
if [ -n "$HELM_DELETE_ALL" ] ; then
    # undeploy helm release (prefix)
    helm_undeploy "$RELEASE_PREFIX"

    # we will delete the whole namespace
    delete_namespace

    # we will cleanup docker on each node
    docker_cleanup

    # we will delete the content of storage (volumes)
    if [ -n "$VOLUME_STORAGE" ] ; then
        delete_storage
    fi
# delete and redeploy explicit or failed components...
else
    # if a helm chart release name was given then just redeploy said component and quit
    if [ -n "$HELM_CHART_RELEASE_NAME" ] ; then
        msg "Explicitly asked for component redeploy: ${HELM_CHART_RELEASE_NAME}"
        _COMPONENTS="$HELM_CHART_RELEASE_NAME"
    # simple heuristics: redeploy only failed components
    else
        msg "Delete successfully completed jobs..."
        clean_jobs

        msg "Find failed components..."
        _COMPONENTS=$(get_failed_labels)
    fi

    # intentionally unquoted: the list is whitespace separated
    for _component in ${_COMPONENTS} ; do
        # literal prefix match (the prefix is not treated as a regex)
        case "$_component" in
            "${RELEASE_PREFIX}-"*)
                msg "Redeploy component: ${_component}"
                undeploy_component "${_component}"
                ;;
            *)
                error "Component release name '${_component}' does not match release prefix: ${RELEASE_PREFIX} (SKIP)"
                ;;
        esac
    done
fi

_rc=0
if [ -z "$HELM_SKIP_DEPLOY" ] ; then
    # TODO: this is suboptimal - find a way how to deploy only the affected component...
    msg "Redeploy onap..."
    # OVERRIDES already starts with a space when it is not empty
    msg "helm deploy ${RELEASE_PREFIX} local/onap --namespace ${NAMESPACE}${OVERRIDES} --timeout ${HELM_TIMEOUT}"
    # OVERRIDES is intentionally unquoted: it holds the whole '-f <file>'
    # list as one string and must be split into separate arguments
    helm deploy "${RELEASE_PREFIX}" local/onap --namespace "${NAMESPACE}" ${OVERRIDES} --timeout "${HELM_TIMEOUT}" || _rc=$?
else
    msg "Clean only option used: Skipping redeploy..."
fi

# report a failed deploy instead of always exiting with the status of msg
if [ "$_rc" -ne 0 ] ; then
    error "Redeploy failed: 'helm deploy' exited with status ${_rc}"
    exit "$_rc"
fi

msg DONE

exit 0
606exit $?
607