Added support for footnotes (6f5baea5) · Commits · Centre for Testing and Interoperability / Markdown specifications development / spec2md

config.ini

+19 −6

Original line number	Diff line number	Diff line
		@@ -31,18 +31,31 @@ imageCaptions2AltText = true


		[toc]
		# Add section numbers to the headlines
		addSectionNumbers = false
		excludeFromNumbering =

		# Exclude the following paragraph types from numbering.
		# The default is to exclude the "Content" heading.
		excludeFromNumbering = tt

		# The paragraph type that is used in the original document for the table of contents.
		tocStartParagraph = heading no numbering

		# The level of the table of contents.
		tocHeaderLevel = 1

		# Automatically generate a table of contents.
		generateToc = false

		# Add a macro "[toc]" to the document that can be used to generate a table of contents.
		# Some converters and viewers support this macro.
		addTocMacro = false


		[paragraphs]
		normal = normal
		h1 = heading 1, tt
		h2 = heading 2
		normal = normal, onem2m-normal
		h1 = heading 1, tt, onem2m-heading1
		h2 = heading 2, onem2m-heading2
		h3 = heading 3
		h4 = heading 4
		h5 = heading 5
		@@ -53,7 +66,7 @@ h9 = heading 9
		a1 = heading 1
		a2 = heading 2
		a3 = heading 3
		note = no
		note = no, onem2m-iprtitle, onem2m-ipr
		code = pl
		example = ex, ew
		ul1 = b1, b1+, list paragraph
		@@ -63,7 +76,7 @@ ul4 = b4, b4+
		ul5 = b5, b5+
		ol1 = bn
		ol2 = bl
		tablecaption = caption, th
		tablecaption = caption, th, onem2m-tabletitle
		imagecaption = tf
		image = fl
		empty = fp

spec2md.py

+67 −16

Original line number	Diff line number	Diff line
		@@ -9,10 +9,12 @@


		from enum import IntEnum, auto
		from typing import Callable, Tuple, Dict, Optional
		from typing import Callable, Tuple, Dict, Optional, Any

		from pathlib import Path, PurePath
		from docx.document import Document
		from docx.text.paragraph import Paragraph
		from docx.package import Package
		import docx.opc.exceptions
		from docx.table import _Cell, Table
		from docx.oxml.table import CT_Tbl
		@@ -201,7 +203,7 @@ class DocumentConfiguration(object):


		def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None:
		docs:Dict[str, Tuple[Document, DocumentConfiguration]] = {}
		docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]] = {}
		ptasks = {}
		mediaRelations:Dict[str, str] = {}
		addSectionNumbers = False
		@@ -209,6 +211,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		headers:list[Tuple[int, str]] = []
		emfFiles:list[str] = []
		referencedImages:list[str] = []
		footnotes:dict[str, str] = {}

		global _print

		@@ -292,7 +295,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		return tag


		def getTextFromXML(elem:Paragraph|_Cell) -> str:
		def getTextFromXML(elem:Paragraph|_Cell|ET._Element) -> str:

		# Not-used document tags.
		_ignoredTags = ( 'AlternateContent',
		@@ -310,6 +313,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		'commentRangeStart',
		'commentRangeEnd',
		'commentReference',
		'smartTag',
		'footnoteRef',
		)


		@@ -405,13 +410,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		pass # ignore a soft hyphen character which has no meaning in Markdown and zero-width

		case 'sym':
		def _symError(ch:str) -> str:

		def _symError(ch:int) -> None:
		nonlocal _result
		_symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]} ({ch})"'
		_print(f'[yellow]{_symError}')
		_result += f'<mark>{_symError}</mark>'

		try:
		_ch = '????'
		_ch = 0
		_ch = int(element.attrib["{"+wns+"}char"], 16)
		if _ch in docConfig.characters:
		if (rch := docConfig.characters[_ch]) == chr(0):
		@@ -431,6 +438,18 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		for x in element:
		_result += _parseXML(x)

		case 'footnoteReference':
		id = element.attrib[f'{{{wns}}}id']
		_result += f'[^{id}]'
		footnotes[id] = '<mark>unknown footnote</mark>'

		# The footnote itself is not included in the document but in a separate file.
		# Therefore, we need to extract the footnote from the footnotes.xml file. The format
		# of the footnote is the same as a paragraph.
		case 'footnote':
		for x in element:
		_result += _parseXML(x)

		case _ if tag in _ignoredTags: # ignore
		pass

		@@ -448,6 +467,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		# Create a list of parsed paragraphs and join them with linebreaks
		return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
		for p in elem.paragraphs ])
		case ET._Element():
		return _parseXML(elem)
		case _:
		return ''

		@@ -480,7 +501,12 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		stopProgress(f'[red]Input document "{d}" is not a file')
		return
		try:
		docs[d] = (docx.Document(d), DocumentConfiguration(d))
		# Search for footnotes in the document XML
		footnotesPart = None
		for part in Package.open(d).parts:
		if part.partname.endswith('/footnotes.xml'):
		footnotesPart = part
		docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart)
		ptasks[d] = progress.add_task(f'Processing {d}', total = None)
		progress.update(readTask, advance=1)
		except docx.opc.exceptions.PackageNotFoundError as e:
		@@ -495,7 +521,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		# Processing Documents
		#

		for docFileName, (doc, docConfig) in docs.items():
		for docFileName, (doc, docConfig, footnotesPart) in docs.items():
		processTask = ptasks[docFileName]
		docItems = list(iter_block_items(doc))
		addSectionNumbers = docConfig.addSectionNumbers
		@@ -517,7 +543,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		return

		# Add sub-progress task
		progress.update(processTask, total = len(docItems) + 5) # + relations + image extraction + characters + toc + media convert
		progress.update(processTask, total = len(docItems) + 6) # + relations + image extraction + characters + toc + footnotes + media convert


		# Extract the media relations file, and get the mappings from document IDs to media files
		@@ -769,7 +795,9 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		elif codeblock:
		# Add whole code block to lines
		_lines.append('```')
		_lines.append('')
		_lines.extend(codeblock)
		_lines.append('')
		_lines.append('```')
		codeblock = []
		else:
		@@ -830,6 +858,29 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type]


		#
		# Process footnotes
		#
		progress.update(processTask, advance = 1) # progress update
		if len(footnotes) and footnotesPart is not None:
		_print(f'[yellow]Footnotes found: {len(footnotes)}')
		# Analyze footnotes file
		footnotesXML = ET.fromstring(footnotesPart.blob)
		# Process the footnotes XML here
		for element in footnotesXML:

		# Footnote found
		if strippedTag(element.tag) == 'footnote':
		footnoteID = element.attrib[f'{{{wns}}}id']
		if footnoteID in footnotes:
		t = getTextFromXML(element)
		footnotes[footnoteID] = t

		# Add footnotes to the end of the document
		lines.append('')
		for fid, text in footnotes.items():
		lines.append(f'[^{fid}]: {text}')

		#
		# List unresolved CAPTION markers
		#