Commit b65475df authored by ankraft's avatar ankraft
Browse files

Added support for footnotes (move them to the correct clauses if detected...

Added support for footnotes (move them to the correct clauses if detected somewhere in the document). Also refactored functions into a new Document class
parent 1391ee18
Loading
Loading
Loading
Loading
+268 −231
Original line number Original line Diff line number Diff line
@@ -91,123 +91,43 @@ class Clause:
		return sum([ len(l.text) for l in self.lines ])
		return sum([ len(l.text) for l in self.lines ])




_matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE)
class Footnote:
_matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE)
	"""	Represents a footnote in the markdown file. """
_matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
	def __init__(self, id:str, line:Line) -> None:
_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
		self.id = id
_matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
		self.line = line
_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
_htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
_htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
_htmlTag = re.compile(r'<[^>]*>', re.IGNORECASE)
_matchNoteStart = re.compile(r'^\s*>\s*(note)?\s*[:]?\s*', re.IGNORECASE)


# TODO handle multiple nav levels (left bar) better (make configurable)


def shortHash(value:str, length:int) -> str:
	"""	Generate a short, filename-safe hash of a string value.

		The value is encoded to bytes, hashed with SHA-256 and the digest is
		base64-encoded with the URL- and filename-safe alphabet, i.e. '-' and
		'_' are used instead of '+' and '/'. This matters because the hash is
		used as a clause number when a heading has none, and clause numbers
		are used to build file names; a '/' would be taken as a path separator.

		Args:
			value: The value to hash.
			length: The number of leading characters of the encoded digest to return.

		Returns:
			The first `length` characters of the encoded digest.
	"""
	digest = hashlib.sha256(value.encode()).digest()
	return base64.urlsafe_b64encode(digest).decode()[:length]


def analyseMarkdown(filename:str) -> list[Clause]:
	"""	Analyse the markdown file and split it into clauses.


		Args:
			filename: The name of the markdown file.

		Returns:
			The list of clauses.
	"""

	print(f'[green]Analyzing "{filename}"')

	# Read the file.
	# Note: We use utf-8 and replace errors to avoid problems with special or unknown characters.
	with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file:
		inLines = file.readlines()


	outClauses:list[Clause] = [Clause(0, '', '', [])]
class Document:
	"""	Represents the document object. """	
	clauses:list[Clause] = []
	footnotes:list[Footnote] = []


	# Go through the lines and detect headers and codefences
	def __init__(self, clauses:list[Clause], footnotes:list[Footnote]) -> None:
	inCodefence = False
		self.clauses = clauses
	for line in inLines:
		self.footnotes = footnotes


		# Detect and handle codefences
		# For the moment we support only codefences that start and end
		# with 3 backticks. This is the most common way to define codefences.
		# Note, that longer codefences are allowed by the markdown specification.


		if _matchCodefenceStart.match(line) and not inCodefence:
	def splitMarkdownDocument(self, 
			inCodefence = True
			outClauses[-1].append(Line(line, LineType.CODEFENCESTART))
			continue
		if _matchCodefenceEnd.match(line):
			inCodefence = False
			outClauses[-1].append(Line(line, LineType.CODEFENCEEND))
			continue
		if inCodefence:
			outClauses[-1].append(Line(line, LineType.CODE))
			continue
	
		# Detect notes
  		# Notes are lines that start with a '>'.
		if _matchNote.match(line):
			outClauses[-1].append(Line(line, LineType.NOTE))
			continue
  
		# Detect headers
		_lineType = LineType.TEXT
		if (m := _matchHeader.match(line)):
			# Add a new clause
			clauseTitle = m.groups()[1].strip()
			clauseTitle = re.sub(_htmlTag, '', clauseTitle)
			headerNumber = _matchHeaderNumber.search(clauseTitle)
			outClauses.append(Clause(len(m.groups()[0]), # level
						  		   headerNumber.group() if headerNumber else shortHash(clauseTitle, 6),
								   clauseTitle, 
								   []))
			_lineType = LineType.HEADING

		# Just add the line to the current clause as text
		outClauses[-1].append(Line(line, _lineType))

	return outClauses


def splitMarkdownDocument(clauses:list[Clause], 
							  ignoreTitles:list[str] = [], 
							  ignoreTitles:list[str] = [], 
							  splitLevel:int = 1,
							  splitLevel:int = 1,
						  ignoreUntilFirstHeading:bool = True) -> list[Clause]:
							  ignoreUntilFirstHeading:bool = True) -> None:
		"""	Split the clauses at a certain level. This is used to create the separate
		"""	Split the clauses at a certain level. This is used to create the separate
			markdown files for MkDocs.
			markdown files for MkDocs.


			After the split, the clauses are stored in the document object.

			Args:
			Args:
			clauses: The list of clauses.
				ignoreTitles: A list of titles that should be ignored. They are not included in the output.
				ignoreTitles: A list of titles that should be ignored. They are not included in the output.
				splitLevel: The level at which the clauses should be split.
				splitLevel: The level at which the clauses should be split.
				ignoreUntilFirstHeader: Ignore all clauses until the first heading.
				ignoreUntilFirstHeader: Ignore all clauses until the first heading.
			
			
		Returns:
			The list of clauses.
		"""
		"""
	outClauses:list[Clause] = [Clause(0, '', '', [])]
		result:list[Clause] = [Clause(0, '', '', [])]

		ignoreTitles = [ t.casefold() for t in ignoreTitles ]	# convert to lower case


	for clause in clauses:
		for clause in self.clauses:
			level = clause.level
			level = clause.level


			# Check if the current clause should be ignored
			# Check if the current clause should be ignored
@@ -217,60 +137,51 @@ def splitMarkdownDocument(clauses:list[Clause],
			# Add a new output clause if the current clause's level is 
			# Add a new output clause if the current clause's level is 
			# equal or less than the split level
			# equal or less than the split level
			if clause.level <= splitLevel:
			if clause.level <= splitLevel:
			outClauses.append(Clause(level, clause.clauseNumber, clause.title, []))
				result.append(Clause(level, clause.clauseNumber, clause.title, []))
			
			
			# Add the lines to the output clause
			# Add the lines to the output clause
		outClauses[-1].extend(clause)
			result[-1].extend(clause)
		
		
		# Remove the first clause if it has no title
		# Remove the first clause if it has no title
		if ignoreUntilFirstHeading:
		if ignoreUntilFirstHeading:
		while len(outClauses[0].title) == 0:
			while len(result[0].title) == 0:
			outClauses.pop(0)
				result.pop(0)
		
		
	return outClauses
		self.clauses = result




def prepareForMkdocs(clauses:list[Clause]) -> list[Clause]:
	def insertFootnotes(self) -> None:
	"""	Prepare the clauses for MkDocs. This includes removing the heading
		"""	Insert footnotes into the clauses.
		from the clauses and marking the clauses that are only for navigation.


		Args:
			After the insertion, the clauses are stored in the document object.
			clauses: The list of clauses.
			
			
		Returns:
			The list of clauses.
		"""
		"""
		print(f'[green]Adding footnotes to clauses')


	# Remove the heading from the lines. The heading is the first line
		for clause in self.clauses:
	# in the clause. This is done because MkDocs repeats the heading when
			foundFootnotes:list[Footnote] = []
	# displaying the page.
			for line in clause.lines:
	for clause in clauses:
				# ATTN: Only footnotes in normal text lines are checked
		if clause.linesCount > 0:
			clause.lines.pop(0)
			# Also, remove the first empty lines if they exist
			while clause.linesCount > 0 and clause.lines[0].text.strip() == '':
				clause.lines.pop(0)
				
				
	# Repair wrong markdown for indented lines.
				if line.lineType == LineType.TEXT and (fn := _inlineFootnote.search(line.text)):
	# Add 2 spaces to existing 2-space indentions
					# Find the footnote in the list of footnotes
	for clause in clauses:
					for f in self.footnotes:
		for i, line in enumerate(clause.lines):
						if f.id == fn.groups()[0]:
			if _match2spaceListIndention.match(line.text):
							foundFootnotes.append(f)
				clause.lines[i].text = '  ' + line.text


	return clauses
			# Insert the footnotes at the end of the clause
			if len(foundFootnotes) > 0:
				clause.append(Line('\n', LineType.TEXT))
				for f in foundFootnotes:
					clause.append(f.line)




def updateLinks(clauses:list[Clause]) -> list[Clause]:
	def updateLinks(self) -> None:
		"""	Update the links in the clauses to the new structure. This is done by
		"""	Update the links in the clauses to the new structure. This is done by
			creating a dictionary of all links and their targets and then replacing
			creating a dictionary of all links and their targets and then replacing
			the links in the clauses.
			the links in the clauses.


		Args:
			After the update, the clauses are stored in the document object.
			clauses: The list of clauses.
		
		Returns:
			The list of clauses.
		"""
		"""
		print(f'[green]Updating links in clauses')
		print(f'[green]Updating links in clauses')


@@ -278,7 +189,7 @@ def updateLinks(clauses:list[Clause]) -> list[Clause]:
		linkTargets:dict[str, Clause] = {}
		linkTargets:dict[str, Clause] = {}


		# Find all Markdown headers in the clauses and convert them to anchor format
		# Find all Markdown headers in the clauses and convert them to anchor format
	for i, clause in enumerate(clauses):
		for i, clause in enumerate(self.clauses):
			# Find all headers in the clause
			# Find all headers in the clause
			for line in clause.lines:
			for line in clause.lines:
				if (m := _matchHeader.match(line.text)):
				if (m := _matchHeader.match(line.text)):
@@ -297,7 +208,7 @@ def updateLinks(clauses:list[Clause]) -> list[Clause]:
						print(f'[dim]Added Markdown anchor "{anchor}"')
						print(f'[dim]Added Markdown anchor "{anchor}"')


		# Find all HTML anchors in the clauses and add them to the dictionary
		# Find all HTML anchors in the clauses and add them to the dictionary
	for i, clause in enumerate(clauses):
		for i, clause in enumerate(self.clauses):
			for line in clause.lines:
			for line in clause.lines:
				if (anchors := _htmlAnchorLink.findall(line.text)):
				if (anchors := _htmlAnchorLink.findall(line.text)):
					for a in anchors:
					for a in anchors:
@@ -306,7 +217,7 @@ def updateLinks(clauses:list[Clause]) -> list[Clause]:
							print(f'[dim]Found HTML anchor "{a}" in clause "{clause.title}"')
							print(f'[dim]Found HTML anchor "{a}" in clause "{clause.title}"')


		# Replace the html links
		# Replace the html links
	for clause in clauses:
		for clause in self.clauses:
			for i, line in enumerate(clause.lines):
			for i, line in enumerate(clause.lines):
				if (links := _htmlLink.findall(line.text)):
				if (links := _htmlLink.findall(line.text)):
					for lnk in links:
					for lnk in links:
@@ -316,7 +227,7 @@ def updateLinks(clauses:list[Clause]) -> list[Clause]:
						print(f'[dim]Updated HTML link "{lnk}" in clause "{clause.title}"')
						print(f'[dim]Updated HTML link "{lnk}" in clause "{clause.title}"')


		# Replace the markdown links
		# Replace the markdown links
	for clause in clauses:
		for clause in self.clauses:
			for i, line in enumerate(clause.lines):
			for i, line in enumerate(clause.lines):
				if (links := _markdownLink.findall(line.text)):
				if (links := _markdownLink.findall(line.text)):
					# Replace the old link targets with converted 
					# Replace the old link targets with converted 
@@ -328,21 +239,15 @@ def updateLinks(clauses:list[Clause]) -> list[Clause]:
					if veryVerbose:
					if veryVerbose:
						print(f'[dim]Updated Markdown link "{lnk}" in clause "{clause.title}"')
						print(f'[dim]Updated Markdown link "{lnk}" in clause "{clause.title}"')


	return clauses



def updateNotes(clauses:list[Clause]) -> list[Clause]:
	def updateNotes(self) -> None:
		"""	Update the notes in the clauses to the mkDocs notes version.
		"""	Update the notes in the clauses to the mkDocs notes version.


		Args:
			After the update, the clauses are stored in the document object.
			clauses: The list of clauses.
		
		Returns:
			The list of clauses.
		"""
		"""
		print(f'[green]Updating notes in clauses')
		print(f'[green]Updating notes in clauses')


	for clause in clauses:
		for clause in self.clauses:
			lines:list[Line] = []
			lines:list[Line] = []
			inNote = False
			inNote = False
			for line in clause.lines:
			for line in clause.lines:
@@ -360,14 +265,37 @@ def updateNotes(clauses:list[Clause]) -> list[Clause]:
					inNote = False
					inNote = False
					lines.append(line)
					lines.append(line)
			clause.lines = lines
			clause.lines = lines
	return clauses




def writeClauses(outClauses:list[Clause], filename:str, navTitle:str) -> None:
	def prepareForMkdocs(self) -> None:
		"""	Prepare the clauses for MkDocs.

			Two in-place modifications are applied to the clauses stored in the
			document object:

			- The first line of every non-empty clause (the heading) is removed,
			  together with any blank lines that directly follow it.
			- Lines that start with a 2-space indented list marker get two
			  additional leading spaces.
		"""

		# Drop the heading line of each clause. MkDocs repeats the heading when
		# displaying the page, so keeping it would show it twice.
		for clause in self.clauses:
			if clause.linesCount <= 0:
				continue
			clause.lines.pop(0)
			# Blank lines that now lead the clause are dropped as well
			while clause.linesCount > 0 and not clause.lines[0].text.strip():
				clause.lines.pop(0)

		# Repair markdown list items that are indented by only 2 spaces
		# by prefixing them with 2 more spaces.
		for clause in self.clauses:
			for entry in clause.lines:
				if _match2spaceListIndention.match(entry.text):
					entry.text = '  ' + entry.text


	def writeClausesMkDocs(self, filename:str, navTitle:str) -> None:
		"""	Write the clauses to separate files and create a navigation file.
		"""	Write the clauses to separate files and create a navigation file.


			Args:
			Args:
			outClauses: The list of clauses.
				filename: The name of the original markdown file.
				filename: The name of the original markdown file.
				navTitle: The title of the navigation entry. This is used to determine the directories.
				navTitle: The title of the navigation entry. This is used to determine the directories.
		"""
		"""
@@ -377,7 +305,7 @@ def writeClauses(outClauses:list[Clause], filename:str, navTitle:str) -> None:
		os.makedirs(f'{os.path.dirname(filename)}/{navTitle}', exist_ok = True)
		os.makedirs(f'{os.path.dirname(filename)}/{navTitle}', exist_ok = True)


		# Write the files
		# Write the files
	for i, f in enumerate(outClauses):
		for i, f in enumerate(self.clauses):
			# write to single files, even empty ones
			# write to single files, even empty ones
			if verbose:
			if verbose:
				print(f'[dim]Writing "{f.clauseNumber}.md" - "{f.title}"')
				print(f'[dim]Writing "{f.clauseNumber}.md" - "{f.title}"')
@@ -395,12 +323,12 @@ def writeClauses(outClauses:list[Clause], filename:str, navTitle:str) -> None:
			if veryVerbose:
			if veryVerbose:
				print(f'[dim]Writing navigation file')
				print(f'[dim]Writing navigation file')
			file.write(f'  - {navTitle}:\n')
			file.write(f'  - {navTitle}:\n')
		for i, f in enumerate(outClauses):
			for i, f in enumerate(self.clauses):


				# TODO handle if the next clause is more than one level deeper
				# TODO handle if the next clause is more than one level deeper
	
	
				_title = f.title.replace("'", '"')
				_title = f.title.replace("'", '"')
			nextClause = outClauses[i+1] if i+1 < len(outClauses) else None
				nextClause = self.clauses[i+1] if i+1 < len(self.clauses) else None
				if nextClause is None or nextClause.level <= f.level:
				if nextClause is None or nextClause.level <= f.level:
					file.write(f"  {'  '*f.level}- '{_title}': '{navTitle}/{f.clauseNumber}.md'\n")
					file.write(f"  {'  '*f.level}- '{_title}': '{navTitle}/{f.clauseNumber}.md'\n")
				else:
				else:
@@ -409,6 +337,114 @@ def writeClauses(outClauses:list[Clause], filename:str, navTitle:str) -> None:
						file.write(f"  {'  '*nextClause.level}- 'Introduction': '{navTitle}/{f.clauseNumber}.md'\n")
						file.write(f"  {'  '*nextClause.level}- 'Introduction': '{navTitle}/{f.clauseNumber}.md'\n")





# Regular expressions used to analyse and rewrite the markdown lines.

# Markdown heading: group 1 = the run of '#' characters, group 2 = the rest of the line.
_matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE)
# Clause number inside a heading title: one letter or digit, optional further
# digits, and any number of ".<digits>" parts (e.g. "6.2.1" or "A.1").
_matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE)
# Code fence start: optional leading whitespace, three backticks, then anything.
_matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
# Code fence end: optional leading whitespace and three backticks.
# NOTE: the pattern has no end anchor, so text after the backticks is not rejected.
_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
# Note (block quote) line: optional leading whitespace followed by '>'.
_matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
# List item indented by exactly two whitespace characters.
_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
# Markdown link to an in-document anchor: group 1 = the target starting with '#'.
# The link must be preceded by a character other than '!', which excludes
# images but also means a link at the very start of the text is not matched.
_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
# HTML link <a href="...">text</a>: group 1 = the href value.
_htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
# HTML anchor <a name="...">text</a>: group 1 = the anchor name.
_htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
# Any HTML tag, i.e. '<' up to the next '>'.
_htmlTag = re.compile(r'<[^>]*>', re.IGNORECASE)
# Leading part of a note line: '>' optionally followed by "note" and/or ':'.
_matchNoteStart = re.compile(r'^\s*>\s*(note)?\s*[:]?\s*', re.IGNORECASE)
# Footnote definition "[^id]:": group 1 = the footnote id.
_footnote = re.compile(r'\[\^([^\]]*)\]:', re.IGNORECASE)
# Footnote reference "[^id]": group 1 = the footnote id.
# NOTE: this pattern also matches the "[^id]" part of a footnote definition.
_inlineFootnote = re.compile(r'\[\^([^\]]*)\]', re.IGNORECASE)


# TODO handle multiple nav levels (left bar) better (make configurable)


def shortHash(value:str, length:int) -> str:
	"""	Generate a short, filename-safe hash of a string value.

		The value is encoded to bytes, hashed with SHA-256 and the digest is
		base64-encoded with the URL- and filename-safe alphabet, i.e. '-' and
		'_' are used instead of '+' and '/'. This matters because the hash is
		used as a clause number when a heading has none, and clause numbers
		are used to build file names; a '/' would be taken as a path separator.

		Args:
			value: The value to hash.
			length: The number of leading characters of the encoded digest to return.

		Returns:
			The first `length` characters of the encoded digest.
	"""
	digest = hashlib.sha256(value.encode()).digest()
	return base64.urlsafe_b64encode(digest).decode()[:length]


def analyseMarkdown(filename:str) -> Document:
	"""	Read a markdown file and break it up into clauses and footnotes.

		Every heading line opens a new clause. All other lines are added to
		the most recently opened clause, tagged with a line type (code fence
		start, code fence end, code, note, heading or text). Footnote
		definition lines (`[^id]: ...`) are not added to any clause; they are
		collected separately.

		Args:
			filename: The name of the markdown file.

		Returns:
			The document object holding the clauses and the footnotes.
	"""

	print(f'[green]Analyzing "{filename}"')

	# Read all lines. Decoding errors are replaced instead of raised, so that
	# special or unknown characters do not abort the analysis.
	with open(filename, 'r', encoding = 'utf-8', errors = 'replace') as file:
		sourceLines = list(file)

	# The leading, untitled clause receives everything before the first heading.
	foundClauses:list[Clause] = [Clause(0, '', '', [])]
	foundFootnotes:list[Footnote] = []
	insideFence = False

	for text in sourceLines:

		if not insideFence and _matchCodefenceStart.match(text):
			# Opening code fence. Fences are recognised by three backticks;
			# the markdown specification also allows longer fences.
			insideFence = True
			foundClauses[-1].append(Line(text, LineType.CODEFENCESTART))

		elif _matchCodefenceEnd.match(text):
			# Closing code fence
			insideFence = False
			foundClauses[-1].append(Line(text, LineType.CODEFENCEEND))

		elif insideFence:
			# Anything between the fences is code and not inspected further
			foundClauses[-1].append(Line(text, LineType.CODE))

		elif _matchNote.match(text):
			# Notes are lines that start with a '>'
			foundClauses[-1].append(Line(text, LineType.NOTE))

		elif (definition := _footnote.match(text)):
			# Footnote definitions are lines that start with '[^id]:'.
			# They are kept apart from the clauses.
			foundFootnotes.append(Footnote(definition.group(1), Line(text, LineType.TEXT)))

		elif (heading := _matchHeader.match(text)):
			# A heading starts a new clause. The clause number is taken from
			# the title, or replaced by a short hash of the title if there is none.
			hashes, rawTitle = heading.groups()
			title = _htmlTag.sub('', rawTitle.strip())
			number = _matchHeaderNumber.search(title)
			foundClauses.append(Clause(len(hashes),	# level
									   number.group() if number else shortHash(title, 6),
									   title,
									   []))
			foundClauses[-1].append(Line(text, LineType.HEADING))

		else:
			# Everything else is plain text of the current clause
			foundClauses[-1].append(Line(text, LineType.TEXT))

	return Document(foundClauses, foundFootnotes)


def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') -> None:
def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') -> None:
	"""	Copy media files from the source directory to the target directory.
	"""	Copy media files from the source directory to the target directory.


@@ -429,24 +465,25 @@ def copyMediaFiles(filename:str, navTitle:str, mediaDirectory:str = 'media') ->
	
	
def processDocument(args:argparse.Namespace) -> None:
def processDocument(args:argparse.Namespace) -> None:
	global verbose, veryVerbose
	global verbose, veryVerbose
	document = os.path.abspath(args.document)
	inDocumentFilename = os.path.abspath(args.document)
	veryVerbose = args.very_verbose
	veryVerbose = args.very_verbose
	verbose = args.verbose
	verbose = args.verbose
	if veryVerbose:
	if veryVerbose:
		verbose = True
		verbose = True


	# Analyse the markdown file
	# Analyse the markdown file
	clauses = analyseMarkdown(document)
	document = analyseMarkdown(inDocumentFilename)
	clauses = splitMarkdownDocument(clauses, [ t.casefold() for t in args.ignore_clause ], args.split_level)
	document.splitMarkdownDocument(args.ignore_clause, args.split_level)
	clauses = updateLinks(clauses)
	document.insertFootnotes()
	clauses = updateNotes(clauses)
	document.updateLinks()
	clauses = prepareForMkdocs(clauses)
	document.updateNotes()
	document.prepareForMkdocs()


	# Write the clauses to files
	# Write the clauses to files
	writeClauses(clauses, document, args.title)
	document.writeClausesMkDocs(inDocumentFilename, args.title)


	# Copy the media files
	# Copy the media files
	copyMediaFiles(document, args.title, args.media_directory)
	copyMediaFiles(inDocumentFilename, args.title, args.media_directory)




if __name__ == '__main__':
if __name__ == '__main__':