Commit 6f5baea5 authored by ankraft's avatar ankraft
Browse files

Added support for footnotes

parent 9328ebdb
Loading
Loading
Loading
Loading
+19 −6
Original line number Diff line number Diff line
@@ -31,18 +31,31 @@ imageCaptions2AltText = true


[toc]
# Add section numbers to the headlines
addSectionNumbers = false
excludeFromNumbering =

# Exclude the following paragraph types from numbering. 
# The default is to exclude the "Content" heading.
excludeFromNumbering = tt

# The paragraph type that is used in the original document for the table of contents.
tocStartParagraph = heading no numbering

# The level of the table of contents.
tocHeaderLevel = 1

# Automatically generate a table of contents.
generateToc = false

# Add a macro "[toc]" to the document that can be used to generate a table of contents.
# Some converters and viewers support this macro.
addTocMacro = false


[paragraphs]
normal = normal
h1 = heading 1, tt
h2 = heading 2
normal = normal, onem2m-normal
h1 = heading 1, tt, onem2m-heading1
h2 = heading 2, onem2m-heading2
h3 = heading 3
h4 = heading 4
h5 = heading 5
@@ -53,7 +66,7 @@ h9 = heading 9
a1 = heading 1
a2 = heading 2
a3 = heading 3
note = no
note = no, onem2m-iprtitle, onem2m-ipr
code = pl
example = ex, ew
ul1 = b1, b1+, list paragraph
@@ -63,7 +76,7 @@ ul4 = b4, b4+
ul5 = b5, b5+
ol1 = bn
ol2 = bl
tablecaption = caption, th
tablecaption = caption, th, onem2m-tabletitle
imagecaption = tf
image = fl
empty = fp
+67 −16
Original line number Diff line number Diff line
@@ -9,10 +9,12 @@


from enum import IntEnum, auto
from typing import Callable, Tuple, Dict, Optional
from typing import Callable, Tuple, Dict, Optional, Any

from pathlib import Path, PurePath
from docx.document import Document
from docx.text.paragraph import Paragraph
from docx.package import Package
import docx.opc.exceptions
from docx.table import _Cell, Table
from docx.oxml.table import CT_Tbl
@@ -201,7 +203,7 @@ class DocumentConfiguration(object):


def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None:
	docs:Dict[str, Tuple[Document, DocumentConfiguration]]		= {}
	docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]]		= {}
	ptasks 															= {}
	mediaRelations:Dict[str, str] 									= {}
	addSectionNumbers 												= False
@@ -209,6 +211,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
	headers:list[Tuple[int, str]]									= []
	emfFiles:list[str]												= []
	referencedImages:list[str]										= []
	footnotes:dict[str, str]										= {}

	global _print
	
@@ -292,7 +295,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
			return tag


		def getTextFromXML(elem:Paragraph|_Cell) -> str:
		def getTextFromXML(elem:Paragraph|_Cell|ET._Element) -> str:

			#	Not-used document tags.
			_ignoredTags = ( 'AlternateContent',
@@ -310,6 +313,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
							 'commentRangeStart',
							 'commentRangeEnd',
							 'commentReference',
							 'smartTag',
							 'footnoteRef',
			)
			

@@ -405,13 +410,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
						pass	# ignore a soft hyphen character which has no meaning in Markdown and zero-width 
					
					case 'sym':
						def _symError(ch:str) -> str:

						def _symError(ch:int) -> None:
							nonlocal _result
							_symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]} ({ch})"'
							_print(f'[yellow]{_symError}')
							_result += f'<mark>{_symError}</mark>'

						try:
							_ch = '????'
							_ch = 0
							_ch = int(element.attrib["{"+wns+"}char"], 16)
							if _ch in docConfig.characters:
								if (rch := docConfig.characters[_ch]) == chr(0):
@@ -431,6 +438,18 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
						for x in element:
							_result += _parseXML(x)
					
					case 'footnoteReference':
						id = element.attrib[f'{{{wns}}}id']
						_result += f'[^{id}]'
						footnotes[id] = '<mark>unknown footnote</mark>'
					
					# The footnote itself is not included in the document but in a separate file.
					# Therefore, we need to extract the footnote from the footnotes.xml file. The format
					# of the footnote is the same as a paragraph.
					case 'footnote':
						for x in element:
							_result += _parseXML(x)
					
					case _ if tag in _ignoredTags:	# ignore
						pass
					
@@ -448,6 +467,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
					# Create a list of parsed paragraphs and join them with linebreaks
					return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() 
										   for p in elem.paragraphs ])
				case ET._Element():
					return _parseXML(elem)
				case _:
					return ''

@@ -480,7 +501,12 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
				stopProgress(f'[red]Input document "{d}" is not a file')
				return
			try:
				docs[d] = (docx.Document(d), DocumentConfiguration(d))
				# Search for footnotes in the document XML
				footnotesPart = None
				for part in Package.open(d).parts:
					if part.partname.endswith('/footnotes.xml'):
						footnotesPart = part
				docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart)
				ptasks[d] = progress.add_task(f'Processing {d}', total = None)
				progress.update(readTask, advance=1)
			except docx.opc.exceptions.PackageNotFoundError as e:
@@ -495,7 +521,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		#	Processing Documents
		#

		for docFileName, (doc, docConfig) in docs.items():
		for docFileName, (doc, docConfig, footnotesPart) in docs.items():
			processTask = ptasks[docFileName]
			docItems = list(iter_block_items(doc))
			addSectionNumbers = docConfig.addSectionNumbers
@@ -517,7 +543,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
				return

			# Add sub-progress task
			progress.update(processTask, total = len(docItems) + 5)	# + relations + image extraction + characters + toc + media convert
			progress.update(processTask, total = len(docItems) + 6)	# + relations + image extraction + characters + toc + footnotes + media convert


			#	Extract the media relations file, and get the mappings from document IDs to media files
@@ -769,7 +795,9 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
					elif codeblock:
						# Add whole code block to lines
						_lines.append('```')
						_lines.append('')
						_lines.extend(codeblock)
						_lines.append('')
						_lines.append('```')
						codeblock = []
					else:
@@ -830,6 +858,29 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
				lines[i] = re.sub(_referenceExpression, _repl, line)	# type:ignore[arg-type]


			#
			#	Process footnotes
			#
			progress.update(processTask, advance = 1)	# progress update
			if len(footnotes) and footnotesPart is not None:
				_print(f'[yellow]Footnotes found: {len(footnotes)}')
				# Analyze footnotes file
				footnotesXML = ET.fromstring(footnotesPart.blob)
				# Process the footnotes XML here
				for element in footnotesXML:

					# Footnote found
					if strippedTag(element.tag) == 'footnote':
						footnoteID = element.attrib[f'{{{wns}}}id']
						if footnoteID in footnotes:
							t = getTextFromXML(element)
							footnotes[footnoteID] = t
				
				# Add footnotes to the end of the document
				lines.append('')
				for fid, text in footnotes.items():
					lines.append(f'[^{fid}]: {text}')

			#
			#	List unresolved CAPTION markers
			#