Commit c47b9d6a authored by ankraft's avatar ankraft
Browse files

Changed generated file names to section numbers or a short hash. Added verbose...

Changed generated file names to section numbers or a short hash. Added verbose and veryVerbose CLAs. Added converting of notes to MkDocs admonitions
parent b52168fa
Loading
Loading
Loading
Loading
+197 −76
Original line number Diff line number Diff line
@@ -7,31 +7,81 @@
#	directory structure.
#

from typing import Tuple
import argparse, re, os, shutil
from enum import Enum, auto
import argparse, re, os, shutil, hashlib, base64
from dataclasses import dataclass
from rich import print

verbose = False
veryVerbose = False

class LineType(Enum):
	"""	Classification of a single line of the markdown file. """
	# The values are spelled out; they are the ones `auto()` assigns (1, 2, 3, ...).
	HEADING = 1			# markdown header line ('#', '##', ...)
	TEXT = 2			# any line without a more specific classification
	CODEFENCESTART = 3	# line that opens a code fence
	CODE = 4			# line inside a code fence
	CODEFENCEEND = 5	# line that closes a code fence
	LIST = 6			# list line (not assigned anywhere in the code visible here)
	NOTE = 7			# blockquote line ('>'), later converted to an admonition

@dataclass
class Line:
	"""	Represents a line in the markdown file: its text plus the type
		that was assigned to it during analysis.
	"""
	# The text of the line.
	text:str
	# The classification of the line. Lines are plain text unless stated otherwise.
	lineType:LineType = LineType.TEXT



@dataclass
class Clause:
	"""	Represents a clause in the markdown file. """
	level:int
	clauseNumber:str
	title:str
	lines:list[str]
	lines:list[Line]
	onlyNav:bool = False

fnLength = 4
	def asStringList(self) -> list[str]:
		"""	Return the clause as a list of strings. 

			Returns:
				The clause's lines as a list of strings.
		"""
		return [ l.text for l in self.lines ]


# Markdown header: group 1 is the run of '#' characters (its length is the
# header level), group 2 is the remaining text of the line (the title).
_matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE)
# Code fence line: optional leading whitespace followed by three backticks.
_matchCodefence = re.compile(r'\s*```\s?.*', re.IGNORECASE)
# Clause number inside a header title: one letter or digit followed by digits
# and optional dot-separated digit groups, e.g. "6.2.1" or "A.1". A stand-alone
# single letter or digit matches as well.
_matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE)
# Opening code fence, optionally followed by an info string.
# NOTE(review): when used with `.match()` this pattern and _matchCodefenceEnd
# accept exactly the same lines (both only require the leading backticks), so
# the two cannot tell an opening fence from a closing one by themselves.
_matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
# Closing code fence (see the note on _matchCodefenceStart).
_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
# Note / blockquote line: optional leading whitespace followed by '>'.
_matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
# List item that is indented by exactly two whitespace characters.
_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
# Markdown link to an in-document anchor. This first assignment is superseded
# by the reassignment on the next line.
_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#.*)\)', re.IGNORECASE)
# Markdown link to an in-document anchor, "[text](#anchor)"; group 1 is the
# target including the '#'. The leading `[^!]` excludes image links, and it
# consumes one character, so a link at the very start of a line is not matched.
_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
# HTML link, '<a href="...">text</a>'; group 1 is the href value.
_htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
# HTML anchor, '<a name="...">text</a>'; group 1 is the anchor name.
_htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
# Prefix of a note line: '>', an optional "note" (any case), an optional colon,
# and the whitespace around them. `\s` also matches the newline, so for a line
# that holds nothing but this prefix the match includes the line ending.
_matchNoteStart = re.compile(r'^\s*>\s*(note)?\s*[:]?\s*', re.IGNORECASE)


# TODO handle multiple nav levels (left bar) better (make configurable)
# TODO Update links in the markdown files to the new structure


def shortHash(value:str, length:int) -> str:
	"""	Generate a short hash of a string value.

		The value is hashed with SHA-256 and the digest is encoded with the
		URL- and filename-safe Base64 alphabet ('-' and '_' instead of '+' and '/').
		The result is used as a file name and as part of links, where a '/'
		would be taken as a path separator.

		Args:
			value: The value to hash.
			length: The length of the hash. The encoded digest has 44 characters;
				a larger length yields the whole encoded digest.

		Returns:
			The hash, i.e. the first *length* characters of the encoded digest.
	"""
	digest = hashlib.sha256(value.encode()).digest()
	return base64.urlsafe_b64encode(digest).decode()[:length]


def analyseMarkdown(filename:str) -> list[Clause]:
	"""	Analyse the markdown file and split it into clauses.
@@ -48,26 +98,44 @@ def analyseMarkdown(filename:str) -> list[Clause]:
	with open(filename, 'r') as file:
		inLines = file.readlines()
	
	outLines:list[Clause] = [Clause(0, '', [])]
	outLines:list[Clause] = [Clause(0, '', '', [])]

	# Go through the lines and detect headers and codefences
	inCodefence = False
	for line in inLines:

		# Detect codefences
		if _matchCodefence.match(line):
			inCodefence = not inCodefence
		if _matchCodefenceStart.match(line):
			inCodefence = True
			outLines[-1].lines.append(Line(line, LineType.CODEFENCESTART))
			continue

		if _matchCodefenceEnd.match(line):
			inCodefence = False
			outLines[-1].lines.append(Line(line, LineType.CODEFENCEND))
			continue

		if inCodefence:
			outLines[-1].lines.append(line)
			outLines[-1].lines.append(Line(line, LineType.CODE))
			continue
	
		# Detect notes
		if _matchNote.match(line):
			outLines[-1].lines.append(Line(line, LineType.NOTE))
			continue
  
		# Detect headers
		_lineType = LineType.TEXT
		if (m := _matchHeader.match(line)):
			level = len(m.groups()[0])
			clauseTitle = m.groups()[1].strip()
			outLines.append(Clause(level, clauseTitle, []))
			headerNumber = _matchHeaderNumber.search(clauseTitle)
			outLines.append(Clause(len(m.groups()[0]), # level
						  		   headerNumber.group() if headerNumber else shortHash(clauseTitle, 6),
								   clauseTitle, 
								   []))
			_lineType = LineType.HEADING

		outLines[-1].lines.append(line)
		outLines[-1].lines.append(Line(line, _lineType))

	return outLines

@@ -88,7 +156,7 @@ def splitMarkdownDocument(clauses:list[Clause],
		Returns:
			The list of clauses.
	"""
	outLines:list[Clause] = [Clause(0, '', [])]
	outLines:list[Clause] = [Clause(0, '', '', [])]

	for clause in clauses:
		level = clause.level
@@ -100,7 +168,7 @@ def splitMarkdownDocument(clauses:list[Clause],
		# Add a new output clause if the current clause's level is 
  		# equal or less than the split level
		if clause.level <= splitLevel:
			outLines.append(Clause(level, clause.title, []))
			outLines.append(Clause(level, clause.clauseNumber, clause.title, []))
		
		# Add the lines to the output clause
		outLines[-1].lines.extend(clause.lines)
@@ -131,64 +199,26 @@ def prepareForMkdocs(clauses:list[Clause]) -> list[Clause]:
		if len(clause.lines) > 0:
			clause.lines.pop(0)
			# Also, remove the first empty lines if they exist
			while len(clause.lines) > 0 and clause.lines[0].strip() == '':
			while len(clause.lines) > 0 and clause.lines[0].text.strip() == '':
				clause.lines.pop(0)
	
	# Mark the whole clause if it is the first AND NOT only clause
	# for a parent clause. Then it is usually empty except the heading.
	# We still need it for navigation, so we mark it as onlyNav
	for clause in clauses:
		if len(''.join(clause.lines).strip()) == 0 and clause.level > 0:
		if len(''.join(clause.asStringList()).strip()) == 0 and clause.level > 0:
			clause.onlyNav = True

	# Repair wrong markdown for indented lines.
	# Add 2 spaces to existing 2-space indentions
	for clause in clauses:
		for i, line in enumerate(clause.lines):
			if _match2spaceListIndention.match(line):
				clause.lines[i] = '  ' + line
			if _match2spaceListIndention.match(line.text):
				clause.lines[i].text = '  ' + line.text
	
	return clauses


def writeClauses(outLines:list[Clause], filename:str, navTitle:str) -> None:
	"""	Write every clause to its own file and generate the navigation file.

		The clause files are named after the clause's index in *outLines*,
		zero-padded to `fnLength` digits, and are placed in a sub-directory
		named *navTitle* next to the original document. The navigation file
		`_nav.yml` is written next to the original document.

		Args:
			outLines: The list of clauses.
			filename: The name of the original markdown file.
			navTitle: The title of the navigation entry. This is used to determine the directories.
	"""
	baseDirectory = os.path.dirname(filename)
	clauseDirectory = f'{baseDirectory}/{navTitle}'

	# The target directory must exist before the clause files are written
	os.makedirs(clauseDirectory, exist_ok = True)

	for index, clause in enumerate(outLines):
		# Empty clauses and navigation-only clauses do not get a file
		if clause.onlyNav or len(clause.lines) == 0:
			print(f'[green]Navigation only   - "{clause.title}"')
			continue

		print(f'[green]Writing "{index:0{fnLength}}.md" - "{clause.title}"')
		with open(f'{clauseDirectory}/{index:0{fnLength}}.md', 'w') as file:
			file.writelines(clause.lines)

	# The navigation file lists all clauses, indented according to their level
	print(f'[green]Writing "_nav.yml"')
	with open(f'{baseDirectory}/_nav.yml', 'w') as file:
		file.write(f'  - {navTitle}:\n')
		for index, clause in enumerate(outLines):
			indent = '  ' * clause.level
			if clause.onlyNav:
				# Navigation-only clauses are parents without a file of their own
				file.write(f"  {indent}- '{clause.title}':\n")
			elif len(clause.lines) > 0:
				file.write(f"  {indent}- '{clause.title}': '{navTitle}/{index:0{fnLength}}.md'\n")


def updateLinks(clauses:list[Clause]) -> list[Clause]:
	"""	Update the links in the clauses to the new structure. This is done by
		creating a dictionary of all links and their targets and then replacing
@@ -202,47 +232,133 @@ def updateLinks(clauses:list[Clause]) -> list[Clause]:
	"""
	print(f'[green]Updating links in clauses')

	# Build the link target dictionary. Mapping anchor -> (clause index, clause)
	linkTargets:dict[str, Tuple[int, str]] = {}
	# Build the link target dictionary. Mapping anchor -> clause
	linkTargets:dict[str, Clause] = {}

	# Find all Markdown headers in the clauses and convert them to anchor format
	for i, clause in enumerate(clauses):
		# Find all headers in the clause
		for line in clause.lines:
			if (m := _matchHeader.match(line)):
			if (m := _matchHeader.match(line.text)):
				# convert the header to anchor format and add it to the dictionary
				# Remove special characters
				# TODO move perhaps to an own function
				anchor = m.groups()[1].strip().casefold().replace(' ', '-').replace('.', '')
				linkTargets[f'#{anchor}'] = (i, clause)
				anchor = m.groups()[1].strip().casefold().replace(' ', '-')
				for c in ( '.', '(', ')', '[', ']', ':', ',', "'", '"'):
					anchor = anchor.replace(c, '')
				linkTargets[f'#{anchor}'] = clause
				if veryVerbose:
					print(f'[dim]Added Markdown anchor "{anchor}"')

	# Find all HTML anchors in the clauses and add them to the dictionary
	for i, clause in enumerate(clauses):
		for line in clause.lines:
			if (anchors := _htmlAnchorLink.findall(line)):
			if (anchors := _htmlAnchorLink.findall(line.text)):
				for a in anchors:
					linkTargets[f'#{a}'] = (i, clause)
					print(f'[green]Found anchor "{a}" in clause "{clause.title}"')
					linkTargets[f'#{a}'] = clause
					if veryVerbose:
						print(f'[dim]Found HTML anchor "{a}" in clause "{clause.title}"')

	# Replace the html links
	for clause in clauses:
		for i, line in enumerate(clause.lines):
			if (links := _htmlLink.findall(line)):
			if (links := _htmlLink.findall(line.text)):
				for lnk in links:
					width = 4
					if lnk in linkTargets:
						line = clause.lines[i] = line.replace(lnk, f'../{linkTargets[lnk][0]:0{width}}/#{lnk[1:]}')	# Update the current line as well
						line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[lnk].clauseNumber}/#{lnk[1:]}')	# Update the current line as well
				if veryVerbose:
					print(f'[dim]Updated HTML link "{lnk}" in clause "{clause.title}"')

	# Replace the markdown links
	for clause in clauses:
		for i, line in enumerate(clause.lines):
			if (links := _markdownLink.findall(line)):
			if (links := _markdownLink.findall(line.text)):
				# Replace the old link targets with converted 
				# (lower case) versions that point to the output files
				for lnk in links:
					if lnk in linkTargets:
						line = clause.lines[i] = line.replace(lnk, f'../{linkTargets[lnk][0]:0{fnLength}}/#{lnk[1:]}')	# Update the current line as well
					_lnk =lnk.casefold()
					if _lnk in linkTargets:
						line.text = clause.lines[i].text = line.text.replace(lnk, f'../{linkTargets[_lnk].clauseNumber}/#{lnk[1:]}')	# Update the current line as well
				if veryVerbose:
					print(f'[dim]Updated Markdown link "{lnk}" in clause "{clause.title}"')

	return clauses


def updateNotes(clauses:list[Clause]) -> list[Clause]:
	"""	Update the notes in the clauses to the mkDocs notes version.

		Args:
			clauses: The list of clauses.
		
		Returns:
			The list of clauses.
	"""
	print(f'[green]Updating notes in clauses')

	for clause in clauses:
		lines:list[Line] = []
		inNote = False
		for line in clause.lines:
			match line.lineType:
				case LineType.NOTE:
					if not inNote:
						lines.append(Line('\n', LineType.TEXT))
						lines.append(Line('!!! note\n', LineType.NOTE))
						inNote = True
					lines.append(Line(f"\t{re.sub(_matchNoteStart, '', line.text)}", LineType.NOTE))
					if verbose:
						print(f'[dim]Converted note in clause "{clause.title}"')
				case _:
					if inNote:
						lines.append(Line('\n', LineType.TEXT))
					inNote = False
					lines.append(line)
		clause.lines = lines
	return clauses


def writeClauses(outLines:list[Clause], filename:str, navTitle:str) -> None:
	"""	Write the clauses to separate files and create a navigation file.

		Every clause that has lines and is not navigation-only is written to
		`<directory>/<navTitle>/<clauseNumber>.md`, where `<directory>` is the
		directory of *filename*. Afterwards the navigation file
		`<directory>/_nav.yml` is written; it lists the clauses indented
		according to their level.

		Args:
			outLines: The list of clauses.
			filename: The name of the original markdown file.
			navTitle: The title of the navigation entry. This is used to determine the directories.
	"""

	print(f'[green]Writing clauses to files')

	# os.path.join() keeps the paths relative when filename has no directory
	# part, instead of producing paths that start at the file system root.
	baseDirectory = os.path.dirname(filename)
	clauseDirectory = os.path.join(baseDirectory, navTitle)

	# Write the files
	# create directory first
	os.makedirs(clauseDirectory, exist_ok = True)
	for clause in outLines:
		if len(clause.lines) == 0 or clause.onlyNav:	# ignore empty clauses or clauses that are only for navigation
			if verbose:
				print(f'[dim]Navigation only - "{clause.title}"')
			continue

		# write to single files
		if verbose:
			print(f'[dim]Writing "{clause.clauseNumber}.md" - "{clause.title}"')
		with open(os.path.join(clauseDirectory, f'{clause.clauseNumber}.md'), 'w') as file:
			file.writelines(clause.asStringList())

	# write nav.yml file
	print(f'[green]Writing "_nav.yml"')
	with open(os.path.join(baseDirectory, '_nav.yml'), 'w') as file:
		if veryVerbose:
			print(f'[dim]Writing navigation file')
		file.write(f'  - {navTitle}:\n')
		for clause in outLines:
			indent = '  ' * clause.level
			title = _yamlSingleQuoted(clause.title)
			if clause.onlyNav:
				file.write(f"  {indent}- {title}:\n")
			elif len(clause.lines) > 0:
				# The target always uses '/' because it is a documentation path, not an OS path
				target = _yamlSingleQuoted(f'{navTitle}/{clause.clauseNumber}.md')
				file.write(f"  {indent}- {title}: {target}\n")


def _yamlSingleQuoted(text:str) -> str:
	"""	Return a text as a YAML single-quoted scalar.

		Inside a single-quoted scalar a single quote is escaped by doubling it.
		Without this a title that contains an apostrophe would yield an
		invalid navigation file.

		Args:
			text: The text to quote.

		Returns:
			The quoted text, including the surrounding quotes.
	"""
	return "'" + text.replace("'", "''") + "'"


def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') -> None:
	"""	Copy media files from the source directory to the target directory.

@@ -262,14 +378,18 @@ def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') ->

	
def processDocument(args:argparse.Namespace) -> None:
	global fnLength
	global verbose, veryVerbose
	document = os.path.abspath(args.document)
	fnLength = args.filename_length
	veryVerbose = args.very_verbose
	verbose = args.verbose
	if veryVerbose:
		verbose = True

	# Analyse the markdown file
	clauses = analyseMarkdown(document)
	clauses = splitMarkdownDocument(clauses, [ t.casefold() for t in args.ignore_clause ], args.split_level)
	clauses = updateLinks(clauses)
	clauses = updateNotes(clauses)
	clauses = prepareForMkdocs(clauses)

	# Write the clauses to files
@@ -282,11 +402,12 @@ def processDocument(args:argparse.Namespace) -> None:
if __name__ == '__main__':
	# Script entry point: parse the command line arguments and process the document.
	parser = argparse.ArgumentParser(formatter_class = argparse.ArgumentDefaultsHelpFormatter)

	parser.add_argument('--verbose', '-v', action = 'store_true', help = 'verbose output during processing')
	parser.add_argument('--very-verbose', '-vv', action = 'store_true', help = 'very verbose output during processing')
	parser.add_argument('--title', '-t', metavar = 'title', required = True, help = 'mkdocs navigation title')
	parser.add_argument('--ignore-clause', '-ic', metavar = 'clause', nargs = '+', default = [ 'Contents', 'History' ], help = 'ignore headers in the markdown document')
	parser.add_argument('--split-level', '-sl', metavar = 'level', type = int, default = 2, help = 'split clauses on which level')
	parser.add_argument('--media-directory', '-md', metavar = 'media-directory', default = 'media', help = 'directory name where media files are stored')
	parser.add_argument('--filename-length', '-fl', metavar = 'length', default = 4, help = 'length of the filename with leading zeros')
	parser.add_argument('document', type = str, help = 'a oneM2M markdown specification document to process')
	args = parser.parse_args()
	processDocument(args)