blob: cf0b2d1f2e1d4c8e2be9fc8b46fb4d3e9b3f461d [file] [log] [blame]
#!/bin/bash

# Command tracing prints every expanded command line, which includes the
# Confluence credentials this script reads from the environment. It is
# therefore opt-in: start the script with C2M_DEBUG=1 to enable it.
if [[ -n "${C2M_DEBUG:-}" ]]; then
  set -x
fi
echo "c2m -------------------------------------------------------------"
Thomas Kulikfb8a0ee2020-03-11 13:13:52 +01005### ============================================================================
6### Licensed under the Apache License, Version 2.0 (the "License");
7### you may not use this file except in compliance with the License.
8### You may obtain a copy of the License at
9###
10### http://www.apache.org/licenses/LICENSE-2.0
11###
12### Unless required by applicable law or agreed to in writing, software
13### distributed under the License is distributed on an "AS IS" BASIS,
14### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15### See the License for the specific language governing permissions and
16### limitations under the License.
17### ============LICENSE_END=====================================================
18
19
20###
21### c2m
22###
23### AUTHOR(S):
24### Thomas Kulik, Deutsche Telekom AG, 2020
25###
26### DESCRIPTION:
27### c2m automates additional tasks required in case you want to export and
28### convert a set of wiki pages. the export and first conversion to markdown is
29### done by confluence2md, provided by viaboxx.
30### c2m processes a list of (to be exported) wiki pages, creates corresponding
31### export directories, exports and converts pages (in various formats if
32### required), opens an editor and cleans up afterwards.
33### c2m checks also for problematic content in the export and creates a warning
34### in case of detection.
35###
36### ISSUES:
37### - markdown (md) output of confluence2md contains sometimes tags that are
38### somehow "merged" with the topic headline; manual edit is required here
39###
40### OPEN:
41### - confluence2md does not support all of the currently used confluence page
42### types (structured-macros) - result for unsupported pages is
43### "not satisfying"; enhancements (java) are required
44### - opt: toc creation in root document in case you export a tree of documents
45### to separate files
46### - opt: remove wiki credentials from script
47###
48### REQUIRED:
49### - pandoc, retext, confluence2md, java (older version for confluence2md),
50### login for the confluence wiki
51###
52### SEE ALSO:
53### - https://www.viaboxx.de/code/confluence2md/
54### - https://github.com/viaboxxsystems/confluence2md
55###
56
57
58###
59### CHANGELOG (LATEST ON TOP)
60###
Gergely Csatari30912322021-07-30 16:52:03 +030061### 1.2.0 (2021-08-02) Corrections to http/https proxy handling and support to
62### get Confluence credentials from env variables instead of
63### directly from the code.
Thomas Kulikfb8a0ee2020-03-11 13:13:52 +010064### 1.1.0 (2020-03-10) added support for http/https proxy and anonymous wiki
65### access. thx to eric, nicolas and sylvain (orange, france)
66### confluence2md jar file now has to be in the same path as
Gergely Csatari30912322021-07-30 16:52:03 +030067### c2m.
Thomas Kulikfb8a0ee2020-03-11 13:13:52 +010068### 1.0.0 (2020-03-09) initial release
69###
70
71
72###
73### c2m example pagelist
74###
75### example pagelist (field descriptions below); it uses the delimiter "|" for
76### the four fields per line.
77### copy/paste page id and title from wiki; to get the wiki page_id you have to
78### login to the wiki, open the page and choose e.g. the history.
79### depth: use depth to follow down the child-pages hierarchy if required:
### -1=infinite, 0=no children, #=number of child-pages to follow.
81### every hierarchy "0" entry will lead into the creation of a dedicated working
82### directory where the page and child-pages are stored.
83### for better readability you can add spaces to the list, but use "|" as a
84### delimiter. lines starting with a # are filtered by c2m.
85###
86### hierarchy | page_id | page_title | depth
87###
88### 0 | 1018748 | ONAP Portal | 0
89### 1.1 | 1018759 | ONAP Portal for users | 0
90### 1.2 | 1018762 | ONAP Portal for administrators | 0
91### 1.2.1 | 1018764 | Admins | 0
92### 1.2.2 | 1018811 | Users | 0
93### 1.2.3 | 1018821 | Portal Admins | 0
94### 1.2.4 | 1018826 | Application Onboarding | 0
95### 1.2.5 | 1018832 | Widget Onboarding | 0
96### 1.2.6 | 1018835 | Edit Functional Menu | 0
97### 1.2.7 | 16004953 | Portal Microservices Onboarding | 0
98###
99### in case you want to export to only one single output page (that contains all
100### child-pages of the above example) use:
101###
102### 0 | 1018748 | ONAP Portal | -1
103###
104
105
106###
107### some initial variables
108###
109
script_version="1.2.0 (2021-08-02)"

# The wiki login is taken from the environment so that no credentials have to
# be stored in this script. A missing variable is an error: report it on
# stderr and leave with a non-zero status so that callers can detect it.
if [[ -z "${CONFLUENCE_USERNAME:-}" || -z "${CONFLUENCE_PASSWORD:-}" ]]; then
  {
    echo "Mandatory environment variables:"
    echo " CONFLUENCE_USERNAME: Confluence username"
    echo " CONFLUENCE_PASSWORD: Confluence password."
    echo "Be aware! Setting bash debuging on will print credentials."
  } >&2
  exit 1
fi

user="${CONFLUENCE_USERNAME}"
passwd="${CONFLUENCE_PASSWORD}"
credentials="${user}:${passwd}"
server="https://wiki.onap.org"

# rst_editor may be preset by the caller; otherwise fall back to retext
if [[ -z "${rst_editor:-}" ]]; then
  rst_editor="retext --preview"
fi

# remove credentials for those using anonymous access
# (both variables set to the literal placeholder "*****")
if [[ "${credentials}" == "*****:*****" ]]; then
  credentials=""
fi

# explicit script dir to locate jar file; "&&" keeps a failed cd from
# silently yielding the current directory instead
basedir="$(cd "$(dirname "$0")" && pwd)"
132
133###
### some initial tasks after script has been started
135###
136
137###
138### print script version, date and time
139###
140
echo "INFO ***************************************************************************"
echo "INFO c2m Version ${script_version}, started $(date)"

###
### simple script argument handling
###

# first (and only) argument: path of the pagelist file
page_list="${1:-}"

# check if there is an argument at all
if [[ -z "${page_list}" ]]; then
  echo 'Usage: c2m [PAGELIST]' >&2
  exit 1
fi

# check if argument is a file; quoted so that a path containing blanks is
# tested as one word
if [[ ! -f "${page_list}" ]]; then
  echo "Error: can't find pagelist \"${page_list}\"" >&2
  exit 1
fi
161
162###
163### declare the functions of this script
164###
165
166###
167### function: create working directory; save (only the last) existing one; remove older versions; do some error handling
168###
169
#######################################
# Create the working directory "output/<page_title>" and cd into it. An
# already existing directory of that name is kept as "<name>.old"; an older
# "<name>.old" is removed first.
# Globals read:    page_title
# Globals written: base_dir, working_dir, existing_working_dirs,
#                  c2m_launch_dir
# Exits with status 1 if the same working directory is requested twice in
# one run, or if the directory cannot be prepared or entered.
#######################################
function create_working_dir {
  local known_dir

  # This function ends with a cd into the new directory. Always start again
  # from the directory of the first call; otherwise the relative "output"
  # tree of the next entry would be created inside the previous working
  # directory.
  : "${c2m_launch_dir:=${PWD}}"
  cd "${c2m_launch_dir}" || exit 1

  base_dir="output"
  if [[ ! -d "${base_dir}" ]]; then
    mkdir "${base_dir}" || exit 1
  fi

  # compose name for working directory
  working_dir="${base_dir}/${page_title}"
  echo "INFO ***************************************************************************"
  echo "INFO working directory \"$working_dir\" will be created"

  # check if current working directory is already in the list
  for known_dir in "${existing_working_dirs[@]}"; do
    if [[ "${known_dir}" == "${working_dir}" ]]; then
      echo "ERRR ***************************************************************************"
      echo "ERRR working directory \"${working_dir}\" already exists - check entries in page_list for duplicates"
      echo "ERRR exiting ..."
      exit 1
    fi
  done
  # store working_dir name for error handling
  existing_working_dirs+=("${working_dir}")

  # save (only) the latest working directory
  if [[ -d "${working_dir}" ]]; then
    if [[ -d "${working_dir}.old" ]]; then
      rm -r "${working_dir}.old"
    fi
    mv "${working_dir}" "${working_dir}.old" || exit 1
  fi

  # Finally create the working directory and cd into it. Stop on failure:
  # the following steps write and delete files relative to the current
  # directory.
  mkdir "${working_dir}" || exit 1
  cd "${working_dir}" || exit 1
}
212
213###
214### function: pull pages from wiki - currently we are testing some export variations
215###
216
#######################################
# Export one wiki page (child pages according to depth) to markdown by
# running the confluence2md jar located in basedir.
# Globals read:    page_title, page_id, depth, credentials, server, basedir,
#                  http_proxy, https_proxy, java_options
# Globals written: out_file
# Returns: exit status of the java call
#######################################
function pull_pages_from_wiki {
  local proxy_name="" proxy_to_parse="" proxy_host="" proxy_port=""
  local -r host_port_re='^(.+):([0-9]+)$'
  local -a jvm_options=()

  # define outfile name
  out_file="${page_title}"

  # Start from the caller supplied java_options on every call. They are only
  # read here, never appended to, so the proxy settings do not pile up when
  # more than one page is exported.
  #java_options="--add-opens java.base/java.lang=ALL-UNNAMED --add-opens java.base/java.util=ALL-UNNAMED --add-opens java.base/java.io=ALL-UNNAMED --add-opens java.base/java.lang.annotation=ALL-UNNAMED --add-opens java.base/java.lang.reflect=ALL-UNNAMED --add-opens java.base/java.net=ALL-UNNAMED"
  if [[ -n "${java_options:-}" ]]; then
    read -r -a jvm_options <<< "${java_options}"
  fi

  # set proxy if needed; http_proxy takes precedence over https_proxy
  if [[ -n "${http_proxy:-}" ]]; then
    proxy_name="http_proxy"
    proxy_to_parse="${http_proxy}"
  elif [[ -n "${https_proxy:-}" ]]; then
    proxy_name="https_proxy"
    proxy_to_parse="${https_proxy}"
  fi

  if [[ -n "${proxy_to_parse}" ]]; then
    proxy_to_parse="${proxy_to_parse#*://}"  # drop the scheme, whatever it is
    proxy_to_parse="${proxy_to_parse%%/*}"   # drop a trailing path or slash
    proxy_to_parse="${proxy_to_parse##*@}"   # drop "user:password@"
    echo "${proxy_name} is set to \"${proxy_to_parse}\""

    # split "host:port"; the host may be a name as well as a numeric address
    if [[ "${proxy_to_parse}" =~ ${host_port_re} ]]; then
      proxy_host="${BASH_REMATCH[1]}"
      proxy_port="${BASH_REMATCH[2]}"
    else
      proxy_host="${proxy_to_parse}"
    fi

    if [[ -n "${proxy_host}" ]]; then
      jvm_options+=("-Dhttps.proxyHost=${proxy_host}" "-Dhttp.proxyHost=${proxy_host}")
    fi
    if [[ -n "${proxy_port}" ]]; then
      jvm_options+=("-Dhttps.proxyPort=${proxy_port}" "-Dhttp.proxyPort=${proxy_port}")
    fi
    echo "${jvm_options[*]}"
  fi

  # TODO: -depth
  # pull pages from wiki and convert to markdown (as a source for conversion by pandoc)
  # NOTE(review): the credentials are passed as a command line argument and
  # are therefore visible in the process list while java runs.
  java "${jvm_options[@]}" -jar "${basedir}/confluence2md-2.1-fat.jar" \
    +H true +T false +RootPageTitle false +FootNotes true \
    -maxHeaderDepth 7 -depth "${depth}" -v true \
    -o "${out_file}.md" -u "${credentials}" -server "${server}" "${page_id}"
}
247
248###
249### function: simple search and (red colored) warning if special terms are detected in the md output file
250###
251
#######################################
# Print a (red colored) warning for every problematic term found in
# "${out_file}.md". The matching lines themselves are printed by grep.
# Globals read: out_file
#######################################
function detect_unwanted_content_in_md_outfile {
  local search_term

  for search_term in "ecomp" "wiki.onap.com" "10.53.199.7" "at&t"; do
    # -F: the terms are literal text, so "." must not match any character
    # -i: also catch spellings such as "ECOMP" or "AT&T"
    if grep -i -F -- "${search_term}" "${out_file}.md"; then
      echo -e "\e[31mWARN ***************************************************************************\e[39m"
      echo -e "\e[31mWARN term \"${search_term}\" detected in ${out_file}.md\e[39m"
    fi
  done
}
261
262###
### function: pandoc conversion from md (variants) to rst - currently testing some conversion formats
264###
265
#######################################
# Convert "${out_file}.md" to "${out_file}.rst" with pandoc.
# Globals read: out_file
# Returns: exit status of pandoc
#######################################
function convert_md_outfile_to_rst {
  # Depending on the given source format (--from) the results may vary;
  # markdown_mmd and markdown_strict were tried as alternatives,
  # markdown_phpextra is the one in use.
  local -a pandoc_args=(
    -s
    --toc-depth=5
    --from markdown_phpextra
    --to rst
    "${out_file}.md"
    -o "${out_file}.rst"
  )

  pandoc "${pandoc_args[@]}"
}
274
275###
276### function: check results in rst editor
277###
278
#######################################
# Open all "${out_file}*.rst" files of the current directory in the rst
# editor. The editor is started in the background; the script does not wait
# for it.
# Globals read: rst_editor (command plus optional arguments), out_file
#######################################
function open_rst_editor {
  local -a editor_cmd=()

  # rst_editor holds a command line such as "retext --preview"; split it into
  # words once so that the file names can be passed safely quoted
  read -r -a editor_cmd <<< "${rst_editor:-}"

  # without an editor command the first rst file itself would be executed
  if (( ${#editor_cmd[@]} == 0 )); then
    echo "WARN rst_editor is not set - \"${out_file}\" is not opened"
    return 0
  fi

  "${editor_cmd[@]}" "${out_file}"*.rst &
}
284
285###
286### function: clean up export directories from files no longer needed
287###
288
#######################################
# Remove files of the export that are no longer needed: the markdown files
# in the current directory and the JSON files (including hidden ones) in
# ./attachments.
#######################################
function clean_up {
  # "./" and "--" keep a file name that starts with "-" from being read as an
  # option; -f keeps rm quiet when a pattern matches nothing
  rm -f -- ./*.md
  rm -f -- attachments/*.json
  rm -f -- attachments/.*.json
}
294
295###
296### main: let's start the work ...
297###
298
299# read in pagelist file, filter lines starting with a comment and create an array that contains all (uncommented) lines of the file
300
301# sample code
# IFS=',' read -r -a page_array <<< "$page_list" # in case $page_list was defined as a variable in this script; use "," as the delimiter
303#readarray -t page_array < $page_list; # old version
304
# Read the pagelist into an array with one element per entry. Comment lines
# (starting with "#") and blank lines are dropped; a blank line would
# otherwise be processed like a page entry without a page id.
readarray -t page_array < <(grep -v -e '^#' -e '^[[:space:]]*$' -- "${page_list}")

# INFO: show list of pages by printing every line of the array
echo "INFO ***************************************************************************"
for line in "${page_array[@]}"; do
  echo "INFO $line"
done

# the main loop reads the page_array line by line and processes the content
for line in "${page_array[@]}"; do
  echo "INFO - bupp $line"

  # Split the line at the "|" delimiter. read -r takes the text literally:
  # no glob expansion of the content and no backslash interpretation.
  IFS='|' read -r hierarchy page_id page_title depth _ignored <<< "${line}"

  # remove leading and trailing whitespace from all four fields
  for field in hierarchy page_id page_title depth; do
    value="${!field}"
    value="${value#"${value%%[![:space:]]*}"}"
    value="${value%"${value##*[![:space:]]}"}"
    printf -v "${field}" '%s' "${value}"
  done

  # collapse every run of whitespace inside page_title to one blank and
  # substitute the blanks with a minus sign
  read -r -a title_words <<< "${page_title}"
  page_title="${title_words[*]}"
  page_title="${page_title// /-}"
  echo "DBUG page_title=\"$page_title\""

  # convert page_title to lowercase
  page_title="${page_title,,}"

  # remove all characters from page_title which may cause problems in the
  # shell ... or are reserved by conventions of this script
  page_title="${page_title//[^A-Za-z0-9-]/}"
  echo "DBUG page_title=\"$page_title\""

  # INFO: print variables to check content
  echo "INFO ***************************************************************************"
  echo "INFO hierarchy = \"$hierarchy\""
  echo "INFO page_id = \"$page_id\""
  echo "INFO page_title = \"$page_title\""
  echo "INFO depth = \"$depth\""

  # create working directory - done for every! "hierarchy 0" entry of page_list
  if [[ "${hierarchy}" == "0" ]]; then
    create_working_dir
  fi

  # call functions to process page
  pull_pages_from_wiki
  detect_unwanted_content_in_md_outfile
  convert_md_outfile_to_rst
  open_rst_editor
  clean_up

# main loop end
done

# bye!
echo "INFO ***************************************************************************"
echo "INFO c2m Version ${script_version}, ended $(date)"
echo ""
exit 0