Blame - tools/c2m.sh - onap/doc

blob: cf0b2d1f2e1d4c8e2be9fc8b46fb4d3e9b3f461d [file] [log] [blame]

#!/bin/bash

# Tracing stays off by default: with `set -x` active every expanded command is
# echoed, including the Confluence credentials this script reads from the
# environment. Enable it only for local debugging.
#set -x # uncomment for bash script debugging
echo "c2m -------------------------------------------------------------"
Thomas Kulik	fb8a0ee	2020-03-11 13:13:52 +0100	[diff] [blame]	5	### ============================================================================
				6	### Licensed under the Apache License, Version 2.0 (the "License");
				7	### you may not use this file except in compliance with the License.
				8	### You may obtain a copy of the License at
				9	###
				10	### http://www.apache.org/licenses/LICENSE-2.0
				11	###
				12	### Unless required by applicable law or agreed to in writing, software
				13	### distributed under the License is distributed on an "AS IS" BASIS,
				14	### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				15	### See the License for the specific language governing permissions and
				16	### limitations under the License.
				17	### ============LICENSE_END=====================================================
				18
				19
				20	###
				21	### c2m
				22	###
				23	### AUTHOR(S):
				24	### Thomas Kulik, Deutsche Telekom AG, 2020
				25	###
				26	### DESCRIPTION:
				27	### c2m automates additional tasks required in case you want to export and
				28	### convert a set of wiki pages. the export and first conversion to markdown is
				29	### done by confluence2md, provided by viaboxx.
				30	### c2m processes a list of (to be exported) wiki pages, creates corresponding
				31	### export directories, exports and converts pages (in various formats if
				32	### required), opens an editor and cleans up afterwards.
				33	### c2m checks also for problematic content in the export and creates a warning
				34	### in case of detection.
				35	###
				36	### ISSUES:
				37	### - markdown (md) output of confluence2md contains sometimes tags that are
				38	### somehow "merged" with the topic headline; manual edit is required here
				39	###
				40	### OPEN:
				41	### - confluence2md does not support all of the currently used confluence page
				42	### types (structured-macros) - result for unsupported pages is
				43	### "not satisfying"; enhancements (java) are required
				44	### - opt: toc creation in root document in case you export a tree of documents
				45	### to separate files
				46	### - opt: remove wiki credentials from script
				47	###
				48	### REQUIRED:
				49	### - pandoc, retext, confluence2md, java (older version for confluence2md),
				50	### login for the confluence wiki
				51	###
				52	### SEE ALSO:
				53	### - https://www.viaboxx.de/code/confluence2md/
				54	### - https://github.com/viaboxxsystems/confluence2md
				55	###
				56
				57
				58	###
				59	### CHANGELOG (LATEST ON TOP)
				60	###
Gergely Csatari	3091232	2021-07-30 16:52:03 +0300	[diff] [blame]	61	### 1.2.0 (2021-08-02) Corrections to http/https proxy handling and support to
				62	### get Confluence credentials from env variables instead of
				63	### directly from the code.
Thomas Kulik	fb8a0ee	2020-03-11 13:13:52 +0100	[diff] [blame]	64	### 1.1.0 (2020-03-10) added support for http/https proxy and anonymous wiki
				65	### access. thx to eric, nicolas and sylvain (orange, france)
				66	### confluence2md jar file now has to be in the same path as
Gergely Csatari	3091232	2021-07-30 16:52:03 +0300	[diff] [blame]	67	### c2m.
Thomas Kulik	fb8a0ee	2020-03-11 13:13:52 +0100	[diff] [blame]	68	### 1.0.0 (2020-03-09) initial release
				69	###
				70
				71
				72	###
				73	### c2m example pagelist
				74	###
				75	### example pagelist (field descriptions below); it uses the delimiter "|" for
				76	### the four fields per line.
				77	### copy/paste page id and title from wiki; to get the wiki page_id you have to
				78	### login to the wiki, open the page and choose e.g. the history.
				79	### depth: use depth to follow down the child-pages hierarchy if required:
				80	### -1=infinite, 0=no children, #=number of child-pages to follow.
				81	### every hierarchy "0" entry will lead into the creation of a dedicated working
				82	### directory where the page and child-pages are stored.
				83	### for better readability you can add spaces to the list, but use "|" as a
				84	### delimiter. lines starting with a # are filtered by c2m.
				85	###
				86	### hierarchy | page_id | page_title | depth
				87	###
				88	### 0 | 1018748 | ONAP Portal | 0
				89	### 1.1 | 1018759 | ONAP Portal for users | 0
				90	### 1.2 | 1018762 | ONAP Portal for administrators | 0
				91	### 1.2.1 | 1018764 | Admins | 0
				92	### 1.2.2 | 1018811 | Users | 0
				93	### 1.2.3 | 1018821 | Portal Admins | 0
				94	### 1.2.4 | 1018826 | Application Onboarding | 0
				95	### 1.2.5 | 1018832 | Widget Onboarding | 0
				96	### 1.2.6 | 1018835 | Edit Functional Menu | 0
				97	### 1.2.7 | 16004953 | Portal Microservices Onboarding | 0
				98	###
				99	### in case you want to export to only one single output page (that contains all
				100	### child-pages of the above example) use:
				101	###
				102	### 0 | 1018748 | ONAP Portal | -1
				103	###
				104
				105
				106	###
				107	### some initial variables
				108	###
				109
script_version="1.2.0 (2021-08-02)"

# Both credentials must be provided through the environment. Abort with a
# non-zero status (and print the hint on stderr) when one of them is missing,
# so that callers can detect the failure.
if [[ -z "${CONFLUENCE_USERNAME:-}" || -z "${CONFLUENCE_PASSWORD:-}" ]]; then
  {
    echo "Mandatory environment variables:"
    echo " CONFLUENCE_USERNAME: Confluence username"
    echo " CONFLUENCE_PASSWORD: Confluence password."
    echo "Be aware! Setting bash debuging on will print credentials."
  } >&2
  exit 1
fi

user="${CONFLUENCE_USERNAME}"
passwd="${CONFLUENCE_PASSWORD}"
credentials="${user}:${passwd}"
server="https://wiki.onap.org"

# the rst editor can be overridden from the environment; it is stored as a
# single string (command plus options)
rst_editor="${rst_editor:-retext --preview}"

# remove credentials for those using anonymous access
if [[ "${credentials}" == "***:***" ]]; then
  credentials=""
fi

# explicit script dir to locate jar file; stop if it cannot be resolved
# instead of silently continuing with the current directory
basedir="$(cd "$(dirname -- "$0")" && pwd)" || exit 1
				132
				133	###
				134	### some inital tasks after script has been started
				135	###
				136
				137	###
				138	### print script version, date and time
				139	###
				140
				echo "INFO ***************************************************************************"
				echo "INFO c2m Version ${script_version}, started $(date)"

				###
				### simple script argument handling
				###

				# the only (mandatory) argument is the path of the pagelist file
				page_list="$1"

				# check if there is an argument at all
				if [[ -z "$page_list" ]]; then
				  echo 'Usage: c2m [PAGELIST]' >&2
				  exit 1
				fi

				# check if argument is a file; the path is quoted so that names containing
				# blanks are tested as one word instead of breaking the test expression
				if [[ ! -f "$page_list" ]]; then
				  echo "Error: can't find pagelist \"$page_list\"" >&2
				  exit 1
				fi
				161
				162	###
				163	### declare the functions of this script
				164	###
				165
				166	###
				167	### function: create working directory; save (only the last) existing one; remove older versions; do some error handling
				168	###
				169
				function create_working_dir {
				  # Creates the working directory "output/<page_title>" and changes into it.
				  #
				  # Globals read:    page_title
				  # Globals written: base_dir, working_dir, existing_working_dirs,
				  #                  c2m_launch_dir (directory the first call was made from)
				  # Exits the script when the title is empty, when the same working
				  # directory is requested twice in one run, or when the directory cannot
				  # be created or entered.
				  #
				  # A directory left over from a previous run is kept once as
				  # "<working_dir>.old"; an older ".old" copy is removed.

				  # Always resolve "output/..." from the directory of the first call.
				  # Without this, every later call would create its directory inside the
				  # working directory the previous call changed into.
				  : "${c2m_launch_dir:=$PWD}"
				  cd -- "$c2m_launch_dir" || exit 1

				  if [[ -z "$page_title" ]]; then
				    echo "ERRR ***************************************************************************"
				    echo "ERRR empty page title - cannot compose a working directory name"
				    echo "ERRR exiting ..."
				    exit 1
				  fi

				  base_dir="output"
				  if [[ ! -d "$base_dir" ]]; then
				    mkdir -- "$base_dir" || exit 1
				  fi

				  # compose name for working directory
				  working_dir="${base_dir}/${page_title}"
				  echo "INFO ***************************************************************************"
				  echo "INFO working directory \"$working_dir\" will be created"

				  # check if current working directory is already in the list
				  # (exact comparison per entry, not a substring match on the joined list)
				  local known_dir
				  for known_dir in "${existing_working_dirs[@]}"; do
				    if [[ "$known_dir" == "$working_dir" ]]; then
				      echo "ERRR ***************************************************************************"
				      echo "ERRR working directory \"${working_dir}\" already exists - check entries in page_list for duplicates"
				      echo "ERRR exiting ..."
				      exit 255 # same status the former "exit -1" produced
				    fi
				  done
				  # store working_dir name for error handling
				  existing_working_dirs+=("$working_dir")

				  # save (only) the latest existing working directory
				  if [[ -d "$working_dir" ]]; then
				    if [[ -d "${working_dir}.old" ]]; then
				      rm -r -- "${working_dir}.old"
				    fi
				    mv -- "$working_dir" "${working_dir}.old"
				  fi

				  # finally create the working directory and cd into it; stop on failure so
				  # that later steps never operate on (or clean up) the wrong directory
				  mkdir -- "$working_dir" || exit 1
				  cd -- "$working_dir" || exit 1
				}
				212
				213	###
				214	### function: pull pages from wiki - currently we are testing some export variations
				215	###
				216
				function pull_pages_from_wiki {
				  # Exports the wiki page ${page_id} (following ${depth} levels of child
				  # pages) as markdown into "${out_file}.md" using the confluence2md jar
				  # located in ${basedir}.
				  #
				  # Globals read:    page_title, page_id, depth, credentials, server,
				  #                  basedir, java_options, http_proxy, https_proxy
				  # Globals written: out_file
				  # Returns:         exit status of the java call

				  # define outfile name
				  out_file="${page_title}"

				  # set proxy if needed (http_proxy takes precedence over https_proxy)
				  local proxy_url=""
				  if [[ -n "${http_proxy:-}" ]]; then
				    proxy_url="${http_proxy#*://}"
				    echo "http_proxy is set to \"${proxy_url}\""
				  elif [[ -n "${https_proxy:-}" ]]; then
				    proxy_url="${https_proxy#*://}"
				    echo "https_proxy is set to \"${proxy_url}\""
				  fi

				  # reduce "user:pass@host:port/path" to "host:port"
				  local proxy_hostport="${proxy_url%%/*}"
				  proxy_hostport="${proxy_hostport##*@}"

				  # The proxy options are collected per call in a local array. Appending
				  # them to the global java_options would repeat them once more for every
				  # exported page.
				  local -a proxy_opts=()
				  # host part: numeric addresses as well as host names
				  if [[ $proxy_hostport =~ ^([^:]+) ]]; then
				    proxy_opts+=("-Dhttps.proxyHost=${BASH_REMATCH[1]}" "-Dhttp.proxyHost=${BASH_REMATCH[1]}")
				    echo "${java_options} ${proxy_opts[*]}"
				  fi
				  if [[ $proxy_hostport =~ :([0-9]+)$ ]]; then
				    proxy_opts+=("-Dhttps.proxyPort=${BASH_REMATCH[1]}" "-Dhttp.proxyPort=${BASH_REMATCH[1]}")
				    echo "${java_options} ${proxy_opts[*]}"
				  fi

				  # TODO: -depth
				  # pull pages from wiki and convert to markdown (as a source for conversion by pandoc)
				  # java_options is a plain string that may hold several JVM options, so it
				  # is split into words on purpose.
				  # shellcheck disable=SC2086
				  java $java_options "${proxy_opts[@]}" \
				    -jar "${basedir}/confluence2md-2.1-fat.jar" \
				    +H true +T false +RootPageTitle false +FootNotes true \
				    -maxHeaderDepth 7 -depth "$depth" -v true \
				    -o "${out_file}.md" -u "${credentials}" -server "$server" "$page_id"
				}
				247
				248	###
				249	### function: simple search and (red colored) warning if special terms are detected in the md output file
				250	###
				251
				function detect_unwanted_content_in_md_outfile {
				  # Searches "${out_file}.md" for a fixed list of unwanted terms. For every
				  # term that occurs, grep prints the matching lines and a red WARN banner
				  # naming the term follows.
				  #
				  # Globals read: out_file
				  #
				  # The terms are matched as fixed strings (grep -F): the dots in
				  # "wiki.onap.com" and "10.53.199.7" must not act as regex wildcards.
				  # NOTE(review): the search is case-sensitive, so e.g. "ECOMP" or "AT&T"
				  # are not reported - confirm whether that is intended.
				  local search_term
				  for search_term in "ecomp" "wiki.onap.com" "10.53.199.7" "at&t"; do
				    if grep -F -- "$search_term" "${out_file}.md"; then
				      echo -e "\e[31mWARN ***************************************************************************\e[39m"
				      echo -e "\e[31mWARN term \"${search_term}\" detected in ${out_file}.md\e[39m"
				    fi
				  done
				}
				261
				262	###
				263	### function: pandoc conversion from md (variants) to rst - currently testing some conversion formats
				264	###
				265
				function convert_md_outfile_to_rst {
				  # Converts "${out_file}.md" to "${out_file}.rst" with pandoc.
				  #
				  # Globals read: out_file
				  #
				  # The result depends on the source format passed via --from; other
				  # variants that were tried are markdown_mmd and markdown_strict, also
				  # combined with --toc.
				  local md_source="${out_file}.md"
				  local rst_target="${out_file}.rst"
				  local -a pandoc_args=(
				    -s
				    --toc-depth=5
				    --from markdown_phpextra
				    --to rst
				    "$md_source"
				    -o "$rst_target"
				  )
				  pandoc "${pandoc_args[@]}"
				}
				274
				275	###
				276	### function: check results in rst editor
				277	###
				278
				function open_rst_editor {
				  # Opens all files matching "${out_file}*.rst" in the rst editor, as a
				  # background job.
				  #
				  # Globals read: rst_editor (command plus options in one string), out_file
				  #
				  # Nothing is started when no matching rst file exists; previously the
				  # editor was launched with the literal glob pattern as file name.
				  local -a editor_cmd=()
				  # split the editor string into command and options without globbing
				  read -r -a editor_cmd <<< "$rst_editor"

				  # only the "*" is left unquoted: the prefix must not be split or globbed
				  local -a rst_files=("${out_file}"*.rst)
				  if [[ ! -e "${rst_files[0]}" ]]; then
				    echo "WARN no rst file found for \"${out_file}\" - rst editor not started" >&2
				    return 0
				  fi

				  "${editor_cmd[@]}" "${rst_files[@]}" &
				}
				284
				285	###
				286	### function: clean up export directories from files no longer needed
				287	###
				288
				function clean_up {
				  # Removes files that are no longer needed from the current export
				  # directory: all markdown files and the (also hidden) json files below
				  # attachments/.
				  #
				  # "--" ends option parsing, so files whose name starts with a dash
				  # (possible, because a sanitised page title may begin with "-") are
				  # removed too. "-f" keeps rm silent when a pattern matches nothing.
				  rm -f -- *.md
				  rm -f -- attachments/*.json
				  rm -f -- attachments/.*.json
				}
				294
				295	###
				296	### main: let's start the work ...
				297	###
				298
				299	# read in pagelist file, filter lines starting with a comment and create an array that contains all (uncommented) lines of the file
				300
				301	# sample code
				302	# IFS=',' read -r -a page_array <<< "$page_list" # in case $page_list was defined as a variable in this script; use "," as the delimiter
				303	#readarray -t page_array < $page_list; # old version
				304
				# Reads the pagelist file given as $1 into the global array page_array, one
				# element per line. Lines starting with "#" (comments) and lines that are
				# empty or contain only whitespace are skipped, so that the main loop never
				# processes an empty entry.
				function _c2m_load_page_array {
				  local list_file="$1"
				  readarray -t page_array < <(grep -v -e '^#' -e '^[[:space:]]*$' -- "$list_file")
				}

				_c2m_load_page_array "$page_list"

				# INFO: show list of pages by printing every line of the array
				echo "INFO ***************************************************************************"
				for line in "${page_array[@]}"; do
				  echo "INFO $line"
				done
				313
				314	# the main loop reads the page_array line by line and processes the content
				# Strips leading and trailing whitespace from $2 and stores the result in
				# the variable named by $1.
				function _c2m_trim {
				  local trimmed="$2"
				  trimmed="${trimmed#"${trimmed%%[![:space:]]*}"}"
				  trimmed="${trimmed%"${trimmed##*[![:space:]]}"}"
				  printf -v "$1" '%s' "$trimmed"
				}

				# Splits one pagelist line ($1, four fields delimited by "|") and sets the
				# globals hierarchy, page_id, page_title and depth.
				# hierarchy, page_id and depth are trimmed. page_title is normalised for use
				# as a file and directory name: runs of blanks become a single "-", the
				# text is lowercased and every character outside A-Z, a-z, 0-9 and "-" is
				# removed.
				# The line is never passed through an unquoted expansion or "echo -e", so
				# glob characters and backslashes in a title are taken literally.
				function _c2m_parse_page_line {
				  local raw_hierarchy raw_id raw_title raw_depth _rest
				  IFS='|' read -r raw_hierarchy raw_id raw_title raw_depth _rest <<< "$1"

				  _c2m_trim hierarchy "$raw_hierarchy"
				  _c2m_trim page_id "$raw_id"
				  _c2m_trim depth "$raw_depth"

				  # split the title into words and join them with a minus sign
				  local -a title_words=()
				  read -r -a title_words <<< "$raw_title"
				  local dashed_title
				  printf -v dashed_title '%s-' "${title_words[@]}"
				  dashed_title="${dashed_title%-}"

				  # lowercase, then remove all characters which may cause problems in the
				  # shell ... or are reserved by conventions of this script
				  page_title="$(printf '%s\n' "$dashed_title" \
				    | tr '[:upper:]' '[:lower:]' \
				    | sed -e 's/[^A-Za-z0-9-]//g')"
				}

				# the main loop reads the page_array line by line and processes the content
				for line in "${page_array[@]}"; do
				  echo "INFO - bupp $line"

				  # cut out values from the current line and assign them to the variables
				  _c2m_parse_page_line "$line"
				  echo "DBUG page_title=\"$page_title\""

				  # INFO: print variables to check content
				  echo "INFO ***************************************************************************"
				  echo "INFO hierarchy = \"$hierarchy\""
				  echo "INFO page_id = \"$page_id\""
				  echo "INFO page_title = \"$page_title\""
				  echo "INFO depth = \"$depth\""

				  # an entry without page id or title cannot be exported; skip it instead
				  # of calling the export with empty arguments
				  if [[ -z "$page_id" || -z "$page_title" ]]; then
				    echo "WARN skipping pagelist line without page id or title: \"$line\"" >&2
				    continue
				  fi

				  # create working directory - done for every! "hierarchy 0" entry of page_list
				  if [[ "$hierarchy" == "0" ]]; then
				    create_working_dir
				  fi

				  # call functions to process page
				  pull_pages_from_wiki
				  detect_unwanted_content_in_md_outfile
				  convert_md_outfile_to_rst
				  open_rst_editor
				  clean_up

				# main loop end
				done
				365
				# all pages processed: print the closing banner (version and end time)
				# followed by an empty line, then leave with success
				printf '%s\n' "INFO ***************************************************************************"
				printf 'INFO c2m Version %s, ended %s\n' "${script_version}" "$(date)"
				printf '\n'
				exit 0