Changed generated file names to section numbers or a short hash. Added verbose... (c47b9d6a) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

toMkdocs/toMkdocs.py

+197 −76

Original line number	Diff line number	Diff line
		@@ -7,31 +7,81 @@
		# directory structure.
		#

		from typing import Tuple
		import argparse, re, os, shutil
		from enum import Enum, auto
		import argparse, re, os, shutil, hashlib, base64
		from dataclasses import dataclass
		from rich import print

		verbose = False
		veryVerbose = False

		class LineType(Enum):
			""" Represents the type of a line in the markdown file.

				The type is assigned to each line while the document is analysed
				and is evaluated later when lines are converted.
			"""
			HEADING = auto()
			TEXT = auto()
			CODEFENCESTART = auto()
			CODE = auto()
			CODEFENCEEND = auto()
			LIST = auto()
			NOTE = auto()

			# Alias for CODEFENCEEND. The fence detection in analyseMarkdown refers
			# to the member under this (misspelled) name; without the alias that
			# reference raises an AttributeError on every closing code fence.
			# Being an alias it does not add a new member or change any value.
			CODEFENCEND = CODEFENCEEND

		@dataclass
		class Line:
			""" A single line of the markdown file together with its classification.

				Attributes:
					text: The text of the line.
					lineType: The kind of the line. Plain text unless stated otherwise.
			"""
			text: str
			lineType: LineType = LineType.TEXT



		@dataclass
		class Clause:
		""" Represents a clause in the markdown file.

		A clause is a heading together with the lines that were collected for it.
		"""
		# Heading level, i.e. the number of '#' characters of the heading.
		# 0 is used for the initial placeholder clause.
		level:int
		# Number found in the heading text, or a short hash of the title when
		# the heading carries no number. Also used as the output file name.
		clauseNumber:str
		# Text of the heading
		title:str
		# NOTE(review): this listing is a rendered diff. The list[str] annotation
		# looks like the removed side and list[Line] like the added side of the
		# same field - confirm against the real file.
		lines:list[str]
		lines:list[Line]
		# True when the clause has no text of its own and only serves as a navigation entry
		onlyNav:bool = False

		# NOTE(review): appears to be a removed module-level constant that the diff
		# view shows at this position - confirm that it is not a class attribute.
		fnLength = 4
		def asStringList(self) -> list[str]:
		""" Return the clause as a list of strings.

		Returns:
		The clause's lines as a list of strings.
		"""
		# Only the text of each Line is returned, the line type is dropped
		return [ l.text for l in self.lines ]


		_matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE)
		_matchCodefence = re.compile(r'\s```\s?.', re.IGNORECASE)
		_matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d(\.\d+)\b', re.IGNORECASE)
		_matchCodefenceStart = re.compile(r'\s```\s?.', re.IGNORECASE)
		_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
		_matchNote = re.compile(r'^\s>\s', re.IGNORECASE)
		_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
		_markdownLink = re.compile(r'[^!]\[[^\]]\]\((#.)\)', re.IGNORECASE)
		_markdownLink = re.compile(r'[^!]\[[^\]]\]\((#[^)])\)', re.IGNORECASE)
		_htmlLink = re.compile(r'<a\s+href="([^"\'])">[^<]</a>', re.IGNORECASE)
		_htmlAnchorLink = re.compile(r'<a\s+name="([^"])">[^<]</a>', re.IGNORECASE)
		_matchNoteStart = re.compile(r'^\s>\s(note)?\s[:]?\s', re.IGNORECASE)


		# TODO handle multiple nav levels (left bar) better (make configurable)
		# TODO Update links in the markdown files to the new structure


		def shortHash(value:str, length:int) -> str:
			""" Generate a short hash of a string value.

				The hash is the beginning of the base64 encoded SHA-256 digest of
				the value. The URL-safe base64 alphabet is used because the result
				becomes a file name and a part of link targets: the standard
				alphabet contains '/' and '+', and a '/' would be interpreted as a
				directory separator.

				Args:
					value: The value to hash.
					length: The length of the hash.

				Returns:
					The hash. It only contains letters, digits, '-' and '_'.
			"""
			digest = hashlib.sha256(value.encode()).digest()
			return base64.urlsafe_b64encode(digest).decode()[:length]


		def analyseMarkdown(filename:str) -> list[Clause]:
		""" Analyse the markdown file and split it into clauses.
		@@ -48,26 +98,44 @@ def analyseMarkdown(filename:str) -> list[Clause]:
		with open(filename, 'r') as file:
		inLines = file.readlines()

		outLines:list[Clause] = [Clause(0, '', [])]
		outLines:list[Clause] = [Clause(0, '', '', [])]

		# Go through the lines and detect headers and codefences
		inCodefence = False
		for line in inLines:

		# Detect codefences
		if _matchCodefence.match(line):
		inCodefence = not inCodefence
		if _matchCodefenceStart.match(line):
		inCodefence = True
		outLines[-1].lines.append(Line(line, LineType.CODEFENCESTART))
		continue

		if _matchCodefenceEnd.match(line):
		inCodefence = False
		outLines[-1].lines.append(Line(line, LineType.CODEFENCEND))
		continue

		if inCodefence:
		outLines[-1].lines.append(line)
		outLines[-1].lines.append(Line(line, LineType.CODE))
		continue

		# Detect notes
		if _matchNote.match(line):
		outLines[-1].lines.append(Line(line, LineType.NOTE))
		continue

		# Detect headers
		_lineType = LineType.TEXT
		if (m := _matchHeader.match(line)):
		level = len(m.groups()[0])
		clauseTitle = m.groups()[1].strip()
		outLines.append(Clause(level, clauseTitle, []))
		headerNumber = _matchHeaderNumber.search(clauseTitle)
		outLines.append(Clause(len(m.groups()[0]), # level
		headerNumber.group() if headerNumber else shortHash(clauseTitle, 6),
		clauseTitle,
		[]))
		_lineType = LineType.HEADING

		outLines[-1].lines.append(line)
		outLines[-1].lines.append(Line(line, _lineType))

		return outLines

		@@ -88,7 +156,7 @@ def splitMarkdownDocument(clauses:list[Clause],
		Returns:
		The list of clauses.
		"""
		outLines:list[Clause] = [Clause(0, '', [])]
		outLines:list[Clause] = [Clause(0, '', '', [])]

		for clause in clauses:
		level = clause.level
		@@ -100,7 +168,7 @@ def splitMarkdownDocument(clauses:list[Clause],
		# Add a new output clause if the current clause's level is
		# equal or less than the split level
		if clause.level <= splitLevel:
		outLines.append(Clause(level, clause.title, []))
		outLines.append(Clause(level, clause.clauseNumber, clause.title, []))

		# Add the lines to the output clause
		outLines[-1].lines.extend(clause.lines)
		@@ -131,64 +199,26 @@ def prepareForMkdocs(clauses:list[Clause]) -> list[Clause]:
		if len(clause.lines) > 0:
		clause.lines.pop(0)
		# Also, remove the first empty lines if they exist
		while len(clause.lines) > 0 and clause.lines[0].strip() == '':
		while len(clause.lines) > 0 and clause.lines[0].text.strip() == '':
		clause.lines.pop(0)

		# Mark the whole clause if it is the first AND NOT only clause
		# for a parent clause. Then it is usually empty except the heading.
		# We still need it for navigation, so we mark it as onlyNav
		for clause in clauses:
		if len(''.join(clause.lines).strip()) == 0 and clause.level > 0:
		if len(''.join(clause.asStringList()).strip()) == 0 and clause.level > 0:
		clause.onlyNav = True

		# Repair wrong markdown for indented lines.
		# Add 2 spaces to existing 2-space indentions
		for clause in clauses:
		for i, line in enumerate(clause.lines):
		if _match2spaceListIndention.match(line):
		clause.lines[i] = ' ' + line
		if _match2spaceListIndention.match(line.text):
		clause.lines[i].text = ' ' + line.text

		return clauses


		def writeClauses(outLines:list[Clause], filename:str, navTitle:str) -> None:
			""" Store every clause in a file of its own and generate the navigation file.

				The clause files are named after the zero-padded position of the
				clause in the list. They are written to a sub-directory next to the
				original document, the navigation file is written beside it.

				Args:
					outLines: The list of clauses.
					filename: The name of the original markdown file.
					navTitle: The title of the navigation entry. It is also the name of the sub-directory.
			"""
			baseDirectory = os.path.dirname(filename)

			# The target directory must exist before the first clause is written
			os.makedirs(f'{baseDirectory}/{navTitle}', exist_ok = True)

			for index, clause in enumerate(outLines):
				# No file for empty clauses and for clauses that only appear in the navigation
				if len(clause.lines) == 0 or clause.onlyNav:
					print(f'[green]Navigation only - "{clause.title}"')
					continue

				clauseFile = f'{index:0{fnLength}}.md'
				print(f'[green]Writing "{clauseFile}" - "{clause.title}"')
				with open(f'{baseDirectory}/{navTitle}/{clauseFile}', 'w') as file:
					file.writelines(clause.lines)

			# Navigation file: one entry per clause, indented according to the clause level
			print('[green]Writing "_nav.yml"')
			with open(f'{baseDirectory}/_nav.yml', 'w') as file:
				file.write(f' - {navTitle}:\n')
				for index, clause in enumerate(outLines):
					indent = ' ' * clause.level
					if clause.onlyNav:
						# Navigation-only clauses get an entry without a target file
						file.write(f" {indent}- '{clause.title}':\n")
					elif len(clause.lines) > 0:
						file.write(f" {indent}- '{clause.title}': '{navTitle}/{index:0{fnLength}}.md'\n")


		def updateLinks(clauses:list[Clause]) -> list[Clause]:
		""" Update the links in the clauses to the new structure. This is done by
		creating a dictionary of all links and their targets and then replacing
		@@ -202,47 +232,133 @@ def updateLinks(clauses:list[Clause]) -> list[Clause]:
		"""
		print(f'[green]Updating links in clauses')

		# Build the link target dictionary. Mapping anchor -> (clause index, clause)
		linkTargets:dict[str, Tuple[int, str]] = {}
		# Build the link target dictionary. Mapping anchor -> clause
		linkTargets:dict[str, Clause] = {}

		# Find all Markdown headers in the clauses and convert them to anchor format
		for i, clause in enumerate(clauses):
		# Find all headers in the clause
		for line in clause.lines:
		if (m := _matchHeader.match(line)):
		if (m := _matchHeader.match(line.text)):
		# convert the header to anchor format and add it to the dictionary
		# Remove special characters
		# TODO move perhaps to an own function
		anchor = m.groups()[1].strip().casefold().replace(' ', '-').replace('.', '')
		linkTargets[f'#{anchor}'] = (i, clause)
		anchor = m.groups()[1].strip().casefold().replace(' ', '-')
		for c in ( '.', '(', ')', '[', ']', ':', ',', "'", '"'):
		anchor = anchor.replace(c, '')
		linkTargets[f'#{anchor}'] = clause
		if veryVerbose:
		print(f'[dim]Added Markdown anchor "{anchor}"')

		# Find all HTML anchors in the clauses and add them to the dictionary
		for i, clause in enumerate(clauses):
		for line in clause.lines:
		if (anchors := _htmlAnchorLink.findall(line)):
		if (anchors := _htmlAnchorLink.findall(line.text)):
		for a in anchors:
		linkTargets[f'#{a}'] = (i, clause)
		print(f'[green]Found anchor "{a}" in clause "{clause.title}"')
		linkTargets[f'#{a}'] = clause
		if veryVerbose:
		print(f'[dim]Found HTML anchor "{a}" in clause "{clause.title}"')

		# Replace the html links
		for clause in clauses:
		for i, line in enumerate(clause.lines):
		if (links := _htmlLink.findall(line)):
		if (links := _htmlLink.findall(line.text)):
		for lnk in links:
		width = 4
		if lnk in linkTargets:
		line = clause.lines[i] = line.replace(lnk, f'../{linkTargets[lnk][0]:0{width}}/#{lnk[1:]}') # Update the current line as well
		line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well
		if veryVerbose:
		print(f'[dim]Updated HTML link "{lnk}" in clause "{clause.title}"')

		# Replace the markdown links
		for clause in clauses:
		for i, line in enumerate(clause.lines):
		if (links := _markdownLink.findall(line)):
		if (links := _markdownLink.findall(line.text)):
		# Replace the old link targets with converted
		# (lower case) versions that point to the output files
		for lnk in links:
		if lnk in linkTargets:
		line = clause.lines[i] = line.replace(lnk, f'../{linkTargets[lnk][0]:0{fnLength}}/#{lnk[1:]}') # Update the current line as well
		_lnk =lnk.casefold()
		if _lnk in linkTargets:
		line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[_lnk].clauseNumber}/#{lnk[1:]}') # Update the current line as well
		if veryVerbose:
		print(f'[dim]Updated Markdown link "{lnk}" in clause "{clause.title}"')

		return clauses


		def updateNotes(clauses:list[Clause]) -> list[Clause]:
		""" Update the notes in the clauses to the mkDocs notes version.

		Consecutive lines of type NOTE are turned into one "!!! note" block.
		The lines of every clause are replaced in place.

		Args:
		clauses: The list of clauses.

		Returns:
		The list of clauses.
		"""
		# NOTE(review): the indentation of this listing is lost, so the exact nesting of
		# the verbose output and of the "inNote = False" reset below cannot be
		# told from here - confirm against the real file.
		print(f'[green]Updating notes in clauses')

		for clause in clauses:
		# The converted lines are collected in a new list that replaces clause.lines
		lines:list[Line] = []
		# True while the previous line belonged to a note
		inNote = False
		for line in clause.lines:
		match line.lineType:
		case LineType.NOTE:
		# First line of a note: add an empty line and the note header
		if not inNote:
		lines.append(Line('\n', LineType.TEXT))
		lines.append(Line('!!! note\n', LineType.NOTE))
		inNote = True
		# Strip the part matched by _matchNoteStart and indent the rest with a tab
		lines.append(Line(f"\t{re.sub(_matchNoteStart, '', line.text)}", LineType.NOTE))
		if verbose:
		print(f'[dim]Converted note in clause "{clause.title}"')
		case _:
		# First line after a note: close the note with an empty line.
		# A note at the very end of a clause gets no such empty line.
		if inNote:
		lines.append(Line('\n', LineType.TEXT))
		inNote = False
		lines.append(line)
		clause.lines = lines
		return clauses


		def _yamlSingleQuoted(text:str) -> str:
			""" Return the text as a single-quoted YAML scalar.

				Inside a single-quoted YAML scalar a quote character is escaped by
				doubling it.

				Args:
					text: The text to quote.

				Returns:
					The quoted and escaped text.
			"""
			return "'" + text.replace("'", "''") + "'"


		def writeClauses(outLines:list[Clause], filename:str, navTitle:str) -> None:
			""" Write the clauses to separate files and create a navigation file.

				Every clause with content is written to "<clauseNumber>.md" in a
				sub-directory next to the original document. The navigation file
				"_nav.yml" is written beside that sub-directory.

				Args:
					outLines: The list of clauses.
					filename: The name of the original markdown file.
					navTitle: The title of the navigation entry. This is used to determine the directories.
			"""

			print(f'[green]Writing clauses to files')
			baseDirectory = os.path.dirname(filename)

			# Write the files
			# create directory first
			os.makedirs(f'{baseDirectory}/{navTitle}', exist_ok = True)
			for clause in outLines:
				# ignore empty clauses or clauses that are only for navigation
				if len(clause.lines) == 0 or clause.onlyNav:
					if verbose:
						print(f'[dim]Navigation only - "{clause.title}"')
					continue

				# write to single files
				if verbose:
					print(f'[dim]Writing "{clause.clauseNumber}.md" - "{clause.title}"')
				with open(f'{baseDirectory}/{navTitle}/{clause.clauseNumber}.md', 'w') as file:
					file.writelines(clause.asStringList())

			# write nav.yml file
			print(f'[green]Writing "_nav.yml"')
			with open(f'{baseDirectory}/_nav.yml', 'w') as file:
				if veryVerbose:
					print(f'[dim]Writing navigation file')
				file.write(f' - {navTitle}:\n')
				for clause in outLines:
					indent = ' ' * clause.level
					# Titles may contain apostrophes, which must be escaped to keep the YAML valid
					title = _yamlSingleQuoted(clause.title)
					if clause.onlyNav:
						# Navigation-only clauses get an entry without a target file
						file.write(f" {indent}- {title}:\n")
						continue
					if len(clause.lines) == 0:
						continue
					target = _yamlSingleQuoted(f'{navTitle}/{clause.clauseNumber}.md')
					file.write(f" {indent}- {title}: {target}\n")


		def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') -> None:
		""" Copy media files from the source directory to the target directory.

		@@ -262,14 +378,18 @@ def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') ->


		def processDocument(args:argparse.Namespace) -> None:
		global fnLength
		global verbose, veryVerbose
		document = os.path.abspath(args.document)
		fnLength = args.filename_length
		veryVerbose = args.very_verbose
		verbose = args.verbose
		if veryVerbose:
		verbose = True

		# Analyse the markdown file
		clauses = analyseMarkdown(document)
		clauses = splitMarkdownDocument(clauses, [ t.casefold() for t in args.ignore_clause ], args.split_level)
		clauses = updateLinks(clauses)
		clauses = updateNotes(clauses)
		clauses = prepareForMkdocs(clauses)

		# Write the clauses to files
		@@ -282,11 +402,12 @@ def processDocument(args:argparse.Namespace) -> None:
		if __name__ == '__main__':
		parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)

		parser.add_argument('--verbose', '-v', action = 'store_true', help = 'verbose output during processing')
		parser.add_argument('--very-verbose', '-vv', action = 'store_true', help = 'very verbose output during processing')
		parser.add_argument('--title', '-t', metavar = 'title', required = True, help = 'mkdocs navigation tile')
		parser.add_argument('--ignore-clause', '-ic', metavar = 'clause', nargs = '+', default = [ 'Contents', 'History' ], help = 'ignore headers in the markdown document')
		parser.add_argument('--split-level', '-sl', metavar = 'level', type = int, default = 2, help = 'split clauses on which level')
		parser.add_argument('--media-directory', '-md', metavar = 'media-directory', default = 'media', help = 'directory name where media files are stored')
		parser.add_argument('--filename-length', '-fl', metavar = 'length', default = 4, help = 'length of the filename with leading zeros')
		parser.add_argument('document', type = str, help = 'a oneM2M markdown specification document to process')
		args = parser.parse_args()
		processDocument(args)