#!/bin/sh

PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

#
# globals and defaults
#

# kubernetes namespace to work in (from -n|--namespace)
NAMESPACE=
# "-f <file>" pairs passed to 'helm deploy' (from -f|--file)
OVERRIDES=
# whitespace separated list of explicitly requested releases (from -c|--component)
HELM_CHART_RELEASE_NAME=
# non-empty when the whole namespace should be deleted (from -D|--delete-all)
HELM_DELETE_ALL=
# non-empty when the final 'helm deploy' should be skipped (from -C|--clean-only)
HELM_SKIP_DEPLOY=
# directory holding the persistent volume data (from -s|--storage)
VOLUME_STORAGE=
# value passed to 'helm deploy --timeout' (from -t|--timeout)
HELM_TIMEOUT=3600
# helm release name / prefix of the component releases (from -p|--prefix)
RELEASE_PREFIX=onap

#
# control variables
#

CMD=$(basename "$0")
# escape sequences used by msg() and error() to colorize the output
COLOR_ON_RED='\033[0;31;1m'
COLOR_ON_GREEN='\033[0;32;1m'
COLOR_OFF='\033[0m'

#
# functions
#

# Print the usage text to stdout.
# Reads the global CMD for the program name shown in the examples.
help()
{
# NOTE: the here-document is unquoted so that ${CMD} expands; a literal
# backslash at the end of a line must therefore be written as '\\',
# otherwise it acts as a line continuation and vanishes from the output.
cat <<EOF
${CMD} - simple tool for fixing onap helm deployment

DESCRIPTION
    This script does nothing smart or special it just tries to
    redeploy onap component. It can fix only problems related to
    race conditions or timeouts. Nothing else. It will not fix
    broken ONAP - there is no such ambition - that effort should
    be directed in the upstream.

USAGE
    ${CMD} -h|--help
        This help

    ${CMD} -n|--namespace <namespace>
        (-f|--file <override>)...
        (-s|--storage <directory>)|--no-storage-deletion
        [-p|--prefix <release prefix>]
        [-t|--timeout <secs>]
        [(-c|--component <component release name>)...|
        (-D|--delete-all)]
        [-C|--clean-only]

    Usage 1 (simple heuristics - redeploy failed components):
        ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs

    Usage 2 (redeploy ONLY explicit listed components):
        ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \\
            -c onap-aaf -c onap-sdc -c onap-portal

    Usage 3 (delete EVERYTHING and redeploy):
        ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \\
            --delete-all

    Usage 4 (just clean - do not redeploy)
        ${CMD} -n onap -f /some/override1.yml -s /dockerdata-nfs \\
            --delete-all --clean-only

    Namespace argument and at least one override file are mandatory
    for this script to execute. Also you must provide path to the
    storage or explicitly request to not delete file storage of the
    component.

    Storage should be directory where persistent volume resides. It
    will work only if component created a persistent volume with the
    same filename as its release name. Otherwise no effect. The
    exception is when '--delete-all' is used - in that case all
    content of the storage is deleted (because ONAP is not consistent
    with the volume directory names - eg.: sdnc).

    CAUTION 1: filename of an override file cannot contain whitespace!
    This is actually helm/onap deploy plugin issue which does not
    handle such files. So I dropped the more complicated version of
    this script when there is no reason to support something on what
    will helm deploy choke anyway.

    '--prefix' option is helm release argument - it is actually prefix
    when you list the helm releases - helm is little confusing here.

    CAUTION 2: By default release prefix is 'onap' - if you deployed
    release 'onap' and now run this script with different prefix then
    it will skip all 'onap-*' components and will deploy a new release
    with new prefix - BEWARE TO USE PROPER RELEASE PREFIX!

    Timeout set the waiting time for helm deploy per component.

    '--component' references to release name of the chart which you
    want to redeploy explicitly - otherwise 'ALL FAILED' components
    will be redeployed. You can target more than one component at once
    - just use the argument multiple times.

    Component option is mutually exclusive with the '--delete-all'
    which will delete all components - healthy or not. Actually it will
    delete the whole NAMESPACE and everything in it.

    '--clean-only' can be used with any usage: heuristics, explicit
    component list or with '--delete-all'. It basically just skips the
    last step - the actual redeploy.
EOF
}

# Print an informational message (all arguments joined by spaces) in the
# color given by COLOR_ON_GREEN to stdout.
msg()
{
    # printf '%b' instead of 'echo -e': the latter is not portable under
    # /bin/sh (some shells print the '-e' literally)
    printf '%b\n' "${COLOR_ON_GREEN}INFO: $* ${COLOR_OFF}"
}

# Print an error message (all arguments joined by spaces) in the color
# given by COLOR_ON_RED. The message is written to stderr so that it never
# mixes with data produced on stdout.
error()
{
    # printf '%b' instead of 'echo -e': the latter is not portable under
    # /bin/sh (some shells print the '-e' literally)
    printf '%b\n' "${COLOR_ON_RED}ERROR: $* ${COLOR_OFF}" >&2
}

# Remove all successfully completed jobs in ${NAMESPACE}.
# A job counts as completed when both numbers of its second column
# ("<done>/<desired>") are equal; each such job is passed to delete_job.
clean_jobs()
{
    kubectl get jobs -n "${NAMESPACE}" \
        --ignore-not-found=true \
        --no-headers=true | \
    while read -r _job _completion _rest ; do
        # split "<done>/<desired>" without forking
        case "$_completion" in
            */*)
                _done=${_completion%%/*}
                _desired=${_completion#*/}
                ;;
            *)
                continue
                ;;
        esac

        # skip anything which is not a pair of plain integers instead of
        # letting the numeric test below complain about it
        case "$_done" in
            ''|*[!0-9]*) continue ;;
        esac
        case "$_desired" in
            ''|*[!0-9]*) continue ;;
        esac

        if [ "$_desired" -eq "$_done" ] ; then
            delete_job "$_job"
        fi
    done
}

# Print the release labels of all pods which are in the 'Failed' phase.
get_failed_labels()
{
    get_labels 'status.phase==Failed'
}

# arg: [optional: selector]
#
# Print the sorted, unique values of the 'release' label of the pods in
# ${NAMESPACE}. When a selector is given it is passed to kubectl as
# '--field-selector' to narrow down the pods.
get_labels()
{
    # ${1:+...} yields no word at all when no selector was given.
    # The labels are read from the sixth column onwards (comma separated);
    # lines with fewer columns are skipped and do not stop the processing
    # of the pods listed after them.
    kubectl get pods -n "${NAMESPACE}" \
        --show-labels=true \
        --include-uninitialized=true \
        ${1:+"--field-selector=${1}"} \
        --ignore-not-found=true \
        --no-headers=true | \
    awk '{
        for (f = 6; f <= NF; f++) {
            n = split($f, labels, ",")
            for (i = 1; i <= n; i++) {
                if (index(labels[i], "release=") == 1) {
                    print substr(labels[i], 9)
                }
            }
        }
    }' | sort -u
}

# arg: <release name>
#
# Announce and run 'helm undeploy --purge' for the given release.
helm_undeploy()
{
    msg "Undeploy helm release name: ${1}"
    helm undeploy "${1}" --purge
}

# arg: <job name>
#
# Delete the job in ${NAMESPACE} and block until no pod carrying the
# label 'job-name=<job name>' is listed anymore.
delete_job()
{
    kubectl delete job -n "${NAMESPACE}" \
        --cascade=true \
        --now=true \
        --include-uninitialized=true \
        --wait=true \
        "${1}"

    # wait for job to be deleted (poll once per second; the first poll
    # happens after the first sleep)
    _output=start
    while [ -n "$_output" ] && sleep 1 ; do
        _output=$(kubectl get pods -n "${NAMESPACE}" \
            --ignore-not-found=true \
            --no-headers=true \
            --selector="job-name=${1}")
    done
}

# arg: <resource> <release name>
#
# Delete every object of the given resource kind in ${NAMESPACE} which
# belongs to the release and wait for each of them to disappear.
# An object belongs to the release when it either carries the label
# 'release=<release name>' or its name starts with the release name.
# Returns 1 (and deletes nothing) when an argument is missing.
delete_resource()
{
    _resource="$1"
    _release="$2"

    # an empty release name would match every object of the namespace
    if [ -z "$_resource" ] || [ -z "$_release" ] ; then
        error "delete_resource: resource and release name are mandatory"
        return 1
    fi

    msg "Delete ${_resource} for ${_release}..."
    {
        kubectl get "${_resource}" -n "${NAMESPACE}" \
            --ignore-not-found=true \
            --selector="release=${_release}" \
            --no-headers=true | \
            awk 'NF {print $1}'

        # this is due to missing "release" label in some pods
        # name matching for the rescue...
        # The release name is compared literally and must be followed by a
        # non-alphanumeric character (or nothing), so that for example
        # 'onap-sdc' does not catch objects of a release 'onap-sdcx'.
        kubectl get "${_resource}" -n "${NAMESPACE}" \
            --no-headers=true | \
            awk -v release="${_release}" '
                index($1, release) == 1 &&
                substr($1, length(release) + 1, 1) !~ /[A-Za-z0-9]/ {
                    print $1
                }'
    } | sort -u | while read -r _name ; do
        echo "Deleting '${_name}'"
        # the object may be gone already (e.g. removed together with its
        # owner) - such complaints are filtered out
        kubectl delete "${_resource}" -n "${NAMESPACE}" \
            --cascade=true \
            --now=true \
            --include-uninitialized=true \
            --wait=true \
            "${_name}" \
            2>&1 | grep -iv 'not[[:space:]]*found'

        # wait for resource to be deleted
        _output=start
        while [ -n "$_output" ] && sleep 1 ; do
            _output=$(kubectl get "${_resource}" -n "${NAMESPACE}" \
                --ignore-not-found=true \
                --no-headers=true \
                --field-selector="metadata.name=${_name}")
        done
    done
}

# Delete the whole namespace ${NAMESPACE} and block until
# 'kubectl get all' does not list anything in it anymore.
delete_namespace()
{
    msg "Delete the whole namespace: ${NAMESPACE}"
    kubectl delete namespace \
        --cascade=true \
        --now=true \
        --include-uninitialized=true \
        --wait=true \
        "$NAMESPACE"

    # wait for namespace to be deleted (poll once per second)
    _output=start
    while [ -n "$_output" ] && sleep 1 ; do
        _output=$(kubectl get all -n "${NAMESPACE}" \
            --ignore-not-found=true \
            --no-headers=true)
    done
}

# arg: [optional: subdir]
#
# Delete persistent volume data below ${VOLUME_STORAGE} via ssh on the
# first node listed with the 'node-role.kubernetes.io/worker' label.
# With an argument only '${VOLUME_STORAGE}/<subdir>' is removed, without
# it every entry directly inside ${VOLUME_STORAGE} is removed.
# Returns 1 (and deletes nothing) when the storage directory is not set
# or no node could be found.
delete_storage()
{
    # never let the remote 'rm -rf' run against a path rooted at '/'
    if [ -z "$VOLUME_STORAGE" ] ; then
        error "Storage directory is not set - SKIPPING DELETION"
        return 1
    fi

    # sixth column of the wide node listing
    # NOTE(review): that is INTERNAL-IP with the kubectl output this was
    # written for - confirm for the kubectl version in use
    _node=$(kubectl get nodes \
        --selector=node-role.kubernetes.io/worker \
        -o wide \
        --no-headers=true | \
        awk '{print $6}' | head -n 1)

    if [ -z "$_node" ] ; then
        error "Could not list kubernetes nodes - SKIPPING DELETION"
        return 1
    fi

    # the here-documents are expanded locally - the remote shell receives
    # the final path
    if [ -n "$1" ] ; then
        msg "Delete directory '${VOLUME_STORAGE}/${1}' on $_node"
        ssh -T "$_node" <<EOF
rm -rf "${VOLUME_STORAGE}/${1}"
EOF
    else
        msg "Delete directories '${VOLUME_STORAGE}/*' on $_node"
        ssh -T "$_node" <<EOF
find "${VOLUME_STORAGE}" -maxdepth 1 -mindepth 1 -exec rm -rf '{}' \;
EOF
    fi
}

# arg: <release name>
#
# Clean up one component: undeploy its helm release, delete its jobs,
# deployments, pods, pvc and pv and - when ${VOLUME_STORAGE} is set - its
# directory on the storage. The redeploy itself is NOT done here (see the
# TODO below), it is left to the final 'helm deploy' of the script.
redeploy_component()
{
    helm_undeploy "${1}"
    # TODO: does deleted secret per component break something?
    for _kind in jobs deployments pods pvc pv ; do
        delete_resource "${_kind}" "${1}"
    done

    if [ -n "$VOLUME_STORAGE" ] ; then
        msg "Persistent volume data deletion in directory: ${VOLUME_STORAGE}/${1}"
        delete_storage "$1"
    fi

    # TODO: until I can verify that this does the same for this component as helm deploy
    # (_chart would be the release name without its prefix: ${1#*-})
    #msg "Redeployment of the component ${1}..."
    #helm install "local/${_chart}" --name ${1} --namespace ${NAMESPACE} --wait --timeout ${HELM_TIMEOUT}
}

#
# arguments
#

# Small state machine: 'nil' expects an option, any other state expects
# the value of the option seen just before. The results are collected in
# the arg_* variables and validated afterwards.
state=nil
arg_namespace=
arg_overrides=
arg_timeout=
arg_storage=
arg_nostorage=
arg_components=
arg_prefix=
arg_deleteall=
arg_cleanonly=
# loop on the argument count - an empty argument must not silently end
# the parsing and drop everything after it
while [ $# -gt 0 ] ; do
    case "$state" in
        nil)
            case "$1" in
                -h|--help)
                    help
                    exit 0
                    ;;
                -n|--namespace)
                    state=namespace
                    ;;
                -f|--file)
                    state=override
                    ;;
                -t|--timeout)
                    state=timeout
                    ;;
                -s|--storage)
                    state=storage
                    ;;
                --no-storage-deletion)
                    if [ -n "$arg_storage" ] ; then
                        error "Usage of storage argument together with no storage deletion option!"
                        exit 1
                    elif [ -z "$arg_nostorage" ] ; then
                        arg_nostorage=nostorage
                    else
                        error "Duplicit argument for no storage option! (IGNORING)"
                    fi
                    ;;
                -c|--component)
                    if [ -n "$arg_deleteall" ] ; then
                        error "'Delete all components' used already - argument mismatch"
                        exit 1
                    fi
                    state=component
                    ;;
                -D|--delete-all)
                    if [ -n "$arg_components" ] ; then
                        error "Explicit component(s) provided already - argument mismatch"
                        exit 1
                    elif [ -z "$arg_deleteall" ] ; then
                        arg_deleteall=deleteall
                    else
                        error "Duplicit argument for 'delete all' option! (IGNORING)"
                    fi
                    ;;
                # '--release-prefix' is accepted as an alias of '--prefix'
                -p|--prefix|--release-prefix)
                    state=prefix
                    ;;
                -C|--clean-only)
                    if [ -z "$arg_cleanonly" ] ; then
                        arg_cleanonly=cleanonly
                    else
                        error "Duplicit argument for 'clean only' option! (IGNORING)"
                    fi
                    ;;
                *)
                    error "Unknown parameter: $1"
                    exit 1
                    ;;
            esac
            ;;
        namespace)
            if [ -z "$arg_namespace" ] ; then
                arg_namespace="$1"
                state=nil
            else
                error "Duplicit argument for namespace!"
                exit 1
            fi
            ;;
        override)
            if ! [ -f "$1" ] ; then
                error "Wrong filename for override file: $1"
                exit 1
            fi
            # kept as one string - see 'CAUTION 1' in the help text
            arg_overrides="${arg_overrides} -f $1"
            state=nil
            ;;
        component)
            arg_components="${arg_components} $1"
            state=nil
            ;;
        prefix)
            if [ -z "$arg_prefix" ] ; then
                arg_prefix="$1"
                state=nil
            else
                error "Duplicit argument for release prefix!"
                exit 1
            fi
            ;;
        timeout)
            if [ -z "$arg_timeout" ] ; then
                # digits only (portable replacement for grep '^[0-9]\+$')
                case "$1" in
                    ''|*[!0-9]*)
                        error "Timeout must be an integer: $1"
                        exit 1
                        ;;
                esac
                arg_timeout="$1"
                state=nil
            else
                error "Duplicit argument for timeout!"
                exit 1
            fi
            ;;
        storage)
            if [ -n "$arg_nostorage" ] ; then
                error "Usage of storage argument together with no storage deletion option!"
                exit 1
            elif [ -z "$arg_storage" ] ; then
                arg_storage="$1"
                state=nil
            else
                error "Duplicit argument for storage!"
                exit 1
            fi
            ;;
    esac
    shift
done

# the last option on the command line was still waiting for its value
if [ "$state" != nil ] ; then
    error "Missing value for the last option: ${state}"
    exit 1
fi

# sanity check: validate the parsed arguments and copy them into the
# globals used by the rest of the script

# namespace is mandatory
if [ -z "$arg_namespace" ] ; then
    error "Missing namespace"
    help
    exit 1
fi
NAMESPACE="$arg_namespace"

# at least one override file is mandatory
if [ -z "$arg_overrides" ] ; then
    error "Missing override file(s)"
    help
    exit 1
fi
OVERRIDES="$arg_overrides"

# optional values replace the defaults only when they were given
if [ -n "$arg_prefix" ] ; then
    RELEASE_PREFIX="$arg_prefix"
fi

if [ -n "$arg_timeout" ] ; then
    HELM_TIMEOUT="$arg_timeout"
fi

# either a storage directory or the explicit opt-out is required
if [ -n "$arg_storage" ] ; then
    VOLUME_STORAGE="$arg_storage"
elif [ -z "$arg_nostorage" ] ; then
    error "Missing storage argument! If it is intended then use '--no-storage-deletion' option"
    exit 1
fi

if [ -n "$arg_components" ] ; then
    HELM_CHART_RELEASE_NAME="$arg_components"
fi

if [ -n "$arg_deleteall" ] ; then
    HELM_DELETE_ALL=yes
fi

if [ -n "$arg_cleanonly" ] ; then
    HELM_SKIP_DEPLOY=yes
fi

#
# main
#

# if --delete-all is used then redeploy all components (the current namespace is deleted)
if [ -n "$HELM_DELETE_ALL" ] ; then
    # undeploy helm release (prefix)
    helm_undeploy "$RELEASE_PREFIX"

    # we will delete the whole namespace
    delete_namespace

    if [ -n "$VOLUME_STORAGE" ] ; then
        delete_storage
    fi
# delete and redeploy explicit or failed components...
else
    # if a helm chart release name was given then just redeploy said component and quit
    if [ -n "$HELM_CHART_RELEASE_NAME" ] ; then
        msg "Explicitly asked for component redeploy: ${HELM_CHART_RELEASE_NAME}"
        _COMPONENTS="$HELM_CHART_RELEASE_NAME"
    # simple heuristics: redeploy only failed components
    else
        msg "Delete successfully completed jobs..."
        clean_jobs

        msg "Find failed components..."
        _COMPONENTS=$(get_failed_labels)
    fi

    # _COMPONENTS is a whitespace separated list - split on purpose
    for _component in ${_COMPONENTS} ; do
        # literal comparison: the prefix is not treated as a regex
        case "$_component" in
            "${RELEASE_PREFIX}"-*)
                msg "Redeploy component: ${_component}"
                redeploy_component "${_component}"
                ;;
            *)
                error "Component release name '${_component}' does not match release prefix: ${RELEASE_PREFIX} (SKIP)"
                ;;
        esac
    done
fi

if [ -z "$HELM_SKIP_DEPLOY" ] ; then
    # TODO: this is suboptimal - find a way how to deploy only the affected component...
    msg "Redeploy onap..."
    msg helm deploy ${RELEASE_PREFIX} local/onap --namespace ${NAMESPACE} ${OVERRIDES} --timeout ${HELM_TIMEOUT}
    # ${OVERRIDES} is left unquoted on purpose: it holds the "-f <file>"
    # pairs as one string and must be split into separate words
    _status=0
    helm deploy "${RELEASE_PREFIX}" local/onap --namespace "${NAMESPACE}" ${OVERRIDES} --timeout "${HELM_TIMEOUT}" || _status=$?
    if [ "$_status" -ne 0 ] ; then
        # report the failure to the caller instead of the status of the
        # final message
        error "helm deploy failed with exit code: ${_status}"
        exit "$_status"
    fi
else
    msg "Clean only option used: Skipping redeploy..."
fi

msg DONE

exit 0