Support for "w:sdt" tags (structured document tags) and abstracted paragraph numbering added. (a7f2709d) · Commits · Centre for Testing and Interoperability / Markdown specifications development / spec2md

README.md

+1 −0

Original line number	Diff line number	Diff line
		@@ -95,6 +95,7 @@ This may not be the desired result, but markdown doesn't support colspans. A sol

		## Changes

		- 2025-12-18 - Support for "w:sdt" tags (structured document tags) and abstracted paragraph numbering added. Added more entity conversions.
		- 2025-01-15 - Improved handling of tables with colspans (converting them to simple grid tables). Improved error messages (added line numbers). Improved error detection for tables.
		- 2024-01-09 - Added support for merging consecutive code paragraphs into a single code block.
		- 2023-08-18 - Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.

config.ini

+25 −0

Original line number	Diff line number	Diff line
		@@ -91,6 +91,8 @@ ignore = toc 1, toc 2, toc 3, toc 4, toc 5, toc 6, toc 7, toc 8, toc 9
		; replacement string must be specified as hex values
		; To remove a character from the file set it to 00 (2 zeros)
		;
		; See https://www.toptal.com/designers/htmlarrows/ for html entities
		;
		; The following are some common characters that can be replaced as well.

		; Registered trademark (®) — (`&reg;`)
		@@ -110,8 +112,17 @@ a9 = ©
		; "(R)"
		; ae = 285229
		ae = ®
		; "tm"
		0x2122 = ™
		; space
		a0 = 20
		; acute accent
		b4 = &acute
		; a`
		0xe0 = à
		; e´
		0xe9 = é

		; double quote
		201c = 22
		201d = 22
		@@ -123,6 +134,10 @@ a0 = 20
		b7 = 2a
		; Dashes
		2013 = 2d
		; mdash
		0x2014 = —
		; degree
		0xb0 = °
		; Fullwidth comma
		ff0c = 2c20
		; Fullwidth colon
		@@ -133,6 +148,7 @@ ff08 = 2028
		ff09 = 2920
		; "<="
		2264 = 3c3d

		; ">="
		2265 = 3e3d
		; "..."
		@@ -151,10 +167,19 @@ f0df = 3c3d
		;f0fd = 3c3d
		; "=>"
		f0e0 = 3d3e
		; "->"
		2192 = →
		; "<->"
		f0f3 = 266c743b2d3e
		; subscript 2
		2082 = 32
		; minus sign
		2212 = −
		; sect
		a7 = §
		; plus minus
		b1 = ±

spec2md.py

+58 −4

Original line number	Diff line number	Diff line
		@@ -240,7 +240,7 @@ def processDocuments(documents:list[str],
		outDirectory:str,
		skipImageConversion:bool,
		forceMarkdownTables:bool) -> None:
		docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]] = {}
		docs:Dict[str, Tuple[Document, DocumentConfiguration, Any, Any]] = {}
		ptasks = {}
		mediaRelations:Dict[str, str] = {}
		addSectionNumbers = False
		@@ -341,7 +341,6 @@ def processDocuments(documents:list[str],
		'instrText',
		'lastRenderedPageBreak',
		'noBreakHyphen',
		'pPr',
		'proofErr',
		'rPr',
		'moveFromRangeEnd',
		@@ -488,6 +487,37 @@ def processDocuments(documents:list[str],
		for x in element:
		_result += _parseXML(x)

		case 'sdt': # structured document tag
		for x in element:
		match strippedTag(x.tag):
		case 'sdtContent':
		for y in x:
		_result += _parseXML(y)
		case _:
		pass

		case 'pPr':
		numId = element.find(f'{{{wns}}}numPr/{{{wns}}}numId')

		if numId is not None:
		# Numbering ID found, so treat this paragraph as either a numbered or a bulleted list item
		_numberingID = numId.attrib[_val]
		_ilvl = element.find(f'{{{wns}}}numPr/{{{wns}}}ilvl').attrib.get(_val)
		_abstractNumbering = numberings.get(_numberingID, None)
		if _abstractNumbering is not None:
		# Determine whether ordered or unordered list for the given level
		_levels = _abstractNumbering.findall(f'{{{wns}}}lvl')
		for _lev in _levels:
		if _ilvl == (numberingLevel := _lev.attrib.get(f'{{{wns}}}ilvl')):
		numberingStyle = _lev.find(f'{{{wns}}}numFmt').attrib.get(_val)
		# This is a numbered or bulleted list item
		match numberingStyle:
		case 'bullet':
		_result += f'{" " * int(numberingLevel)}- '
		case 'decimal' | 'lowerLetter' | 'upperLetter' | 'lowerRoman' | 'upperRoman':
		_result += f'{" " * int(numberingLevel)}1. '
		break

		case _ if tag in _ignoredTags: # ignore
		pass

		@@ -543,10 +573,13 @@ def processDocuments(documents:list[str],
		try:
		# Search for footnotes in the document XML
		footnotesPart = None
		numberintPart = None
		for part in Package.open(d).parts:
		if part.partname.endswith('/footnotes.xml'):
		footnotesPart = part
		docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart)
		elif part.partname.endswith('/numbering.xml'):
		numberintPart = part
		docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart, numberintPart)
		ptasks[d] = progress.add_task(f'Processing {d}', total = None)
		progress.update(readTask, advance=1)
		except docx.opc.exceptions.PackageNotFoundError as e:
		@@ -561,12 +594,33 @@ def processDocuments(documents:list[str],
		# Processing Documents
		#

		for docFileName, (doc, docConfig, footnotesPart) in docs.items():
		for docFileName, (doc, docConfig, footnotesPart, numberingPart) in docs.items():
		processTask = ptasks[docFileName]
		docItems = list(iter_block_items(doc))
		addSectionNumbers = docConfig.addSectionNumbers
		excludeFromNumbering = docConfig.excludeFromNumbering


		# Process the numbering information from the numbering.xml part of the docx package.
		# The information is only available indirectly: numbering IDs are assigned
		# to "w:num" elements, which map to abstract numbering definitions in "w:abstractNum" elements.
		# After extracting the mapping, we replace each mapped abstract numbering ID with the
		# corresponding abstract numbering definition for easier access later.

		numberings:dict[str, ET.Element] = {}
		_numberings = ET.fromstring(numberingPart.blob)
		# First, extract the numbering mappings
		for n in _numberings:
		if strippedTag(n.tag) == 'num':
		numId = n.attrib.get('{'+wns+'}numId')
		abstractNumId = n.find(f'{{{wns}}}abstractNumId').attrib.get(_val)
		numberings[numId] = abstractNumId
		# Next, replace each mapped abstract numbering ID with its abstract numbering definition
		for numId, abstractNumId in numberings.items():
		for n in _numberings:
		if strippedTag(n.tag) == 'abstractNum' and n.attrib.get('{'+wns+'}abstractNumId') == abstractNumId:
		numberings[numId] = n

		paragraphNr = 0

		# TODO

Admin message