Commit a7f2709d authored by ankraft's avatar ankraft
Browse files

Support for "w:sdt" tags (structured document tags) and abstracted paragraph numbering added.

parent d2600a99
Loading
Loading
Loading
Loading
+1 −0
Original line number Diff line number Diff line
@@ -95,6 +95,7 @@ This may not be the desired result, but markdown doesn't support colspans. A sol

## Changes

- **2025-12-18** - Added support for "w:sdt" tags (structured document tags) and abstract paragraph numbering. Added more entity conversions.
- **2025-01-15** - Improved handling of tables with colspans (converting them to simple grid tables). Improved error messages (added line numbers). Improved error detection for tables.
- **2024-01-09** - Added support for merging consecutive code paragraphs into a single code block.
- **2023-08-18** - Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.
+25 −0
Original line number Diff line number Diff line
@@ -91,6 +91,8 @@ ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7, toc 8, toc 9
; replacement string must be specified as hex values
; To remove a character from the file set it to 00 (2 zeros)
;
; See https://www.toptal.com/designers/htmlarrows/ for html entities
;
; The following are some common characters that can be replaced as well.

; Registered trademark (®) — (`®`)
@@ -110,8 +112,17 @@ a9 = ©
; "(R)"
; ae = 285229
ae = ®
; "tm"
0x2122 = ™
; space
a0 = 20
; acute accent (´)
b4 = &acute
; a`
0xe0 = à
; e´
0xe9 = é

; double quote
201c = 22
201d = 22
@@ -123,6 +134,10 @@ a0 = 20
b7 = 2a
; Dashes
2013 = 2d
; mdash
0x2014 = —
; degree
0xb0 = °
; Full-size comma
ff0c = 2c20
; Fullwidth colon
@@ -133,6 +148,7 @@ ff08 = 2028
ff09 = 2920
; "<="
2264 = 3c3d

; ">="
2265 = 3e3d
; "..."
@@ -151,10 +167,19 @@ f0df = 3c3d
;f0fd = 3c3d
; "=>"
f0e0 = 3d3e
; "->"
2192 = &rarr;
; "<->"
f0f3 = 266c743b2d3e
; subscript 2
2082 = 32
; minus sign
2212 = &minus;
; section sign (§)
a7 = &sect;
; plus minus
b1 = &plusmn;




+58 −4
Original line number Diff line number Diff line
@@ -240,7 +240,7 @@ def processDocuments(documents:list[str],
					 outDirectory:str, 
					 skipImageConversion:bool,
					 forceMarkdownTables:bool) -> None:
	docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]]		= {}
	docs:Dict[str, Tuple[Document, DocumentConfiguration, Any, Any]]		= {}
	ptasks 															= {}
	mediaRelations:Dict[str, str] 									= {}
	addSectionNumbers 												= False
@@ -341,7 +341,6 @@ def processDocuments(documents:list[str],
							 'instrText',
							 'lastRenderedPageBreak',
							 'noBreakHyphen',
							 'pPr',
							 'proofErr',
							 'rPr',
							 'moveFromRangeEnd',
@@ -488,6 +487,37 @@ def processDocuments(documents:list[str],
						for x in element:
							_result += _parseXML(x)
					
					case 'sdt':	# structured document tag
						for x in element:
							match strippedTag(x.tag):
								case 'sdtContent':
									for y in x:
										_result += _parseXML(y)
								case _:
									pass
					
					case 'pPr':
						numId = element.find(f'{{{wns}}}numPr/{{{wns}}}numId')

						if numId is not None:
							# Numbering ID found, so treat this paragraph either as a numbered or bulleted list item
							_numberingID = numId.attrib[_val]
							_ilvl = element.find(f'{{{wns}}}numPr/{{{wns}}}ilvl').attrib.get(_val)
							_abstractNumbering = numberings.get(_numberingID, None)
							if _abstractNumbering is not None:
								# Determine whether ordered or unordered list for the given level
								_levels = _abstractNumbering.findall(f'{{{wns}}}lvl')
								for _lev in _levels:
									if _ilvl == (numberingLevel := _lev.attrib.get(f'{{{wns}}}ilvl')):
										numberingStyle = _lev.find(f'{{{wns}}}numFmt').attrib.get(_val)
										# This is a numbered or bulleted list item
										match numberingStyle:
											case 'bullet':
												_result += f'{"    " * int(numberingLevel)}- '
											case 'decimal' | 'lowerLetter' | 'upperLetter' | 'lowerRoman' | 'upperRoman':
												_result += f'{"    " * int(numberingLevel)}1. '
										break
						
					case _ if tag in _ignoredTags:	# ignore
						pass
					
@@ -543,10 +573,13 @@ def processDocuments(documents:list[str],
			try:
				# Search for footnotes in the document XML
				footnotesPart = None
				numberintPart = None
				for part in Package.open(d).parts:
					if part.partname.endswith('/footnotes.xml'):
						footnotesPart = part
				docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart)
					elif part.partname.endswith('/numbering.xml'):
						numberintPart = part
				docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart, numberintPart)
				ptasks[d] = progress.add_task(f'Processing {d}', total = None)
				progress.update(readTask, advance=1)
			except docx.opc.exceptions.PackageNotFoundError as e:
@@ -561,12 +594,33 @@ def processDocuments(documents:list[str],
		#	Processing Documents
		#

		for docFileName, (doc, docConfig, footnotesPart) in docs.items():
		for docFileName, (doc, docConfig, footnotesPart, numberingPart) in docs.items():
			processTask = ptasks[docFileName]
			docItems = list(iter_block_items(doc))
			addSectionNumbers = docConfig.addSectionNumbers
			excludeFromNumbering = docConfig.excludeFromNumbering


			# Process the numbering information from the numbering.xml file in the docx package
			# The information is only available through an indirect way: there are numbering IDs assigned
			# to "w:num" elements which map to abstract numbering definitions in "w:abstractNum" elements.
			# After extracting the mapping, we replace the numbering IDs with the abstract numbering definitions
			# for easier access later.

			numberings:dict[str, ET.Element] = {}
			_numberings =  ET.fromstring(numberingPart.blob)
			# First, extract the numbering mappings
			for n in _numberings:
				if strippedTag(n.tag) == 'num':
					numId = n.attrib.get('{'+wns+'}numId')
					abstractNumId = n.find(f'{{{wns}}}abstractNumId').attrib.get(_val)
					numberings[numId] = abstractNumId
			# Next replace the numbering IDs with the abstract numbering definitions
			for numId, abstractNumId in numberings.items():
				for n in _numberings:
					if strippedTag(n.tag) == 'abstractNum' and n.attrib.get('{'+wns+'}abstractNumId') == abstractNumId:
						numberings[numId] = n

			paragraphNr = 0

			# TODO