tools/getrsttitle.py - onap/doc - Gitiles

 #!/usr/bin/env python3

 ### ===========================================================================
 ### Licensed under the Apache License, Version 2.0 (the "License");
 ### you may not use this file except in compliance with the License.
 ### You may obtain a copy of the License at
 ###
 ###       http://www.apache.org/licenses/LICENSE-2.0
 ###
 ### Unless required by applicable law or agreed to in writing, software
 ### distributed under the License is distributed on an "AS IS" BASIS,
 ### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ### See the License for the specific language governing permissions and
 ### limitations under the License.
 ###
 ### Copyright (C) 2021 Deutsche Telekom AG
 ### ============LICENSE_END====================================================

 #
 # getrsttitle.py
 # AUTHOR(S):
 # Thomas Kulik, Deutsche Telekom AG, 2021
 # DESCRIPTION:
 # Processes a list of rst files and retrieves the first title for every single rst file.
 # Copy program to {branch} directory of cloned ONAP documentation and run it.
 # USAGE:
 # python3 getrsttitle.py filename
 #
 # Helpful resources:
 # https://regex101.com/r/YNYK2Q/1/
 # https://stackoverflow.com/questions/20312443/how-to-find-title-a-la-restructuredtext
 #

 import re
 import os.path
 import sys
 import argparse

 #
 # argument handling
 #

 # Command line interface: one positional argument, the name of the file that
 # lists the rst files to process.
 parser = argparse.ArgumentParser(
     description='Processes a list of rst files and retrieves the first title for every single rst file.',
 )
 parser.add_argument('filename')
 args = parser.parse_args()

 # regex to find title underlined with various characters
 #regex1 = r"(?:^|\n)(?!\=)([^\n\r]+)\r?\n(\=+)(?:\r?\n| *$)"
 #regex2 = r"(?:^|\n)(?!\-)([^\n\r]+)\r?\n(\-+)(?:\r?\n| *$)"
 #regex3 = r"(?:^|\n)(?!\~)([^\n\r]+)\r?\n(\~+)(?:\r?\n| *$)"
 #regex4 = r"(?:^|\n)(?!\#)([^\n\r]+)\r?\n(\#+)(?:\r?\n| *$)"
 #regex5 = r"(?:^|\n)(?!\*)([^\n\r]+)\r?\n(\*+)(?:\r?\n| *$)"

 # there is a problem with raw strings (r"...") in the regex search below
 # workaround: using \\ to mask special characters in regex
 # One pattern per supported underline character, in the order = - ~ # *.
 # Group 1 captures the title line, group 2 the run of underline characters.
 # Backslashes are written doubled instead of using a raw string.
 regex_list = [
     "(?:^|\\n)(?!\\{0})([^\\n\\r]+)\\r?\\n(\\{0}+)(?:\\r?\\n| *$)".format(underline_char)
     for underline_char in "=-~#*"
 ]

 # DBUG only
 #for regex in regex_list:
 #    print(repr(regex))

 #filename = './master_indexrst_docs_root.log'
 #filename = './master_rstfiles.log'

 # Main processing.
 # Reads args.filename line by line; each line names one rst file. The parsing
 # below suggests entries of the form "[project/repository]/path/file.rst"
 # (NOTE(review): format inferred from the substitutions -- confirm against the
 # tool that produces the list).
 # For every entry one row "project,repository,line,title" is printed.
 if os.path.isfile(args.filename):
     with open(args.filename) as fn:
         for line in fn:
             # path of the rst file: the entry with every "[" and "]" removed
             rstfile = "./" + re.sub(r'\[|\]', '', line).strip()

             # drop everything from the first "]" that has text after it
             bracket_part = re.sub(r'\].+$', '', line).strip()
             # repository: the remaining text without "["
             repository = re.sub(r'\[', '', bracket_part).strip()
             # project: same text, additionally cut at the first "/"
             project_part = re.sub(r'\/.+$', '', bracket_part).strip()
             project = re.sub(r'\[', '', project_part).strip()

             # Reset the result for every entry. Without this a missing rst file
             # raised NameError (first entry) or silently reused the title that
             # was found for the previous entry.
             match = "NO-TITLE-FOUND"

             if os.path.isfile(rstfile):
                 with open(rstfile, 'r') as content:
                     content_rstfile = content.read()
                 # patterns are tried in list order; the first one that matches wins
                 for regex in regex_list:
                     m = re.search(regex, content_rstfile, re.MULTILINE)
                     if m:
                         match = m.group(1)
                         break
             else:
                 print("ERR:  File {} does not exist".format(rstfile))

             # clean up result and print: commas are removed because the row
             # itself is comma separated
             match_final = match.replace(",", "").strip()
             print("{},{},{},{}".format(project.strip(), repository.strip(), line.strip(), match_final.strip()))
 else:
     print("ERR:  File {} does not exist".format(args.filename))

 sys.exit()

 #
 # example code to show detailed regex matches and group content
 # to be used in a future version of this program
 #
 # matches = re.finditer(regex2, content, re.MULTILINE)
 # for matchNum, match in enumerate(matches, start=1):
 #     print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
 #     print ("{match}".format(match = match.group()))
 #     for groupNum in range(0, len(match.groups())):
 #         groupNum = groupNum + 1
 #         print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
 # print ("Test:" "{group}".format(group = match.group(1)))
 #

 #
 # example code for pandas
 # to be used in a future version of this program
 #
 # import pandas as pd
 # pd.set_option('display.max_rows', 500)
 # pd.set_option('display.max_columns', 500)
 # pd.set_option('display.width', 1000)
 #
 # table = pd.read_csv("master_table.csv")
 # print(table)
 #
	#!/usr/bin/env python3

	### ===========================================================================
	### Licensed under the Apache License, Version 2.0 (the "License");
	### you may not use this file except in compliance with the License.
	### You may obtain a copy of the License at
	###
	### http://www.apache.org/licenses/LICENSE-2.0
	###
	### Unless required by applicable law or agreed to in writing, software
	### distributed under the License is distributed on an "AS IS" BASIS,
	### WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	### See the License for the specific language governing permissions and
	### limitations under the License.
	###
	### Copyright (C) 2021 Deutsche Telekom AG
	### ============LICENSE_END====================================================

	#
	# getrsttitle.py
	# AUTHOR(S):
	# Thomas Kulik, Deutsche Telekom AG, 2021
	# DESCRIPTION:
	# Processes a list of rst files and retrieves the first title for every single rst file.
	# Copy program to {branch} directory of cloned ONAP documentation and run it.
	# USAGE:
	# python3 getrsttitle.py filename
	#
	# Helpful resources:
	# https://regex101.com/r/YNYK2Q/1/
	# https://stackoverflow.com/questions/20312443/how-to-find-title-a-la-restructuredtext
	#

	import re
	import os.path
	import sys
	import argparse

	#
	# argument handling
	#

	# Command line interface: one positional argument, the name of the file that
	# lists the rst files to process.
	parser = argparse.ArgumentParser(
	    description='Processes a list of rst files and retrieves the first title for every single rst file.',
	)
	parser.add_argument('filename')
	args = parser.parse_args()

	# regex to find title underlined with various characters
	#regex1 = r"(?:^|\n)(?!\=)([^\n\r]+)\r?\n(\=+)(?:\r?\n| *$)"
	#regex2 = r"(?:^|\n)(?!\-)([^\n\r]+)\r?\n(\-+)(?:\r?\n| *$)"
	#regex3 = r"(?:^|\n)(?!\~)([^\n\r]+)\r?\n(\~+)(?:\r?\n| *$)"
	#regex4 = r"(?:^|\n)(?!\#)([^\n\r]+)\r?\n(\#+)(?:\r?\n| *$)"
	#regex5 = r"(?:^|\n)(?!\*)([^\n\r]+)\r?\n(\*+)(?:\r?\n| *$)"

	# there is a problem with raw strings (r"...") in the regex search below
	# workaround: using \\ to mask special characters in regex
	# One pattern per supported underline character, in the order = - ~ # *.
	# Group 1 captures the title line, group 2 the run of underline characters.
	# The "|" inside the non-capturing groups is regex alternation and must not
	# be escaped; the last pattern needs "\\*" for the literal asterisk.
	regex_list = [
	    "(?:^|\\n)(?!\\=)([^\\n\\r]+)\\r?\\n(\\=+)(?:\\r?\\n| *$)",
	    "(?:^|\\n)(?!\\-)([^\\n\\r]+)\\r?\\n(\\-+)(?:\\r?\\n| *$)",
	    "(?:^|\\n)(?!\\~)([^\\n\\r]+)\\r?\\n(\\~+)(?:\\r?\\n| *$)",
	    "(?:^|\\n)(?!\\#)([^\\n\\r]+)\\r?\\n(\\#+)(?:\\r?\\n| *$)",
	    "(?:^|\\n)(?!\\*)([^\\n\\r]+)\\r?\\n(\\*+)(?:\\r?\\n| *$)",
	]

	# DBUG only
	#for regex in regex_list:
	# print(repr(regex))

	#filename = './master_indexrst_docs_root.log'
	#filename = './master_rstfiles.log'

	# Main processing.
	# Reads args.filename line by line; each line names one rst file. The parsing
	# below suggests entries of the form "[project/repository]/path/file.rst"
	# (NOTE(review): format inferred from the substitutions -- confirm against the
	# tool that produces the list).
	# For every entry one row "project,repository,line,title" is printed.
	if os.path.isfile(args.filename):
	    with open(args.filename) as fn:
	        for line in fn:
	            # path of the rst file: the entry with every "[" and "]" removed
	            rstfile = "./" + re.sub(r'\[|\]', '', line).strip()

	            # drop everything from the first "]" that has text after it
	            bracket_part = re.sub(r'\].+$', '', line).strip()
	            # repository: the remaining text without "["
	            repository = re.sub(r'\[', '', bracket_part).strip()
	            # project: same text, additionally cut at the first "/"
	            project_part = re.sub(r'\/.+$', '', bracket_part).strip()
	            project = re.sub(r'\[', '', project_part).strip()

	            # Reset the result for every entry so a missing rst file can
	            # neither raise NameError nor reuse the previous entry's title.
	            match = "NO-TITLE-FOUND"

	            if os.path.isfile(rstfile):
	                with open(rstfile, 'r') as content:
	                    content_rstfile = content.read()
	                # patterns are tried in list order; the first one that matches wins
	                for regex in regex_list:
	                    m = re.search(regex, content_rstfile, re.MULTILINE)
	                    if m:
	                        match = m.group(1)
	                        break
	            else:
	                print("ERR: File {} does not exist".format(rstfile))

	            # clean up result and print: commas are removed because the row
	            # itself is comma separated
	            match_final = match.replace(",", "").strip()
	            print("{},{},{},{}".format(project.strip(), repository.strip(), line.strip(), match_final.strip()))
	else:
	    print("ERR: File {} does not exist".format(args.filename))

	sys.exit()

	#
	# example code to show detailed regex matches and group content
	# to be used in a future version of this program
	#
	# matches = re.finditer(regex2, content, re.MULTILINE)
	# for matchNum, match in enumerate(matches, start=1):
	# print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
	# print ("{match}".format(match = match.group()))
	# for groupNum in range(0, len(match.groups())):
	# groupNum = groupNum + 1
	# print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))
	# print ("Test:" "{group}".format(group = match.group(1)))
	#

	#
	# example code for pandas
	# to be used in a future version of this program
	#
	# import pandas as pd
	# pd.set_option('display.max_rows', 500)
	# pd.set_option('display.max_columns', 500)
	# pd.set_option('display.width', 1000)
	#
	# table = pd.read_csv("master_table.csv")
	# print(table)
	#