Improved parsing and generation of tables. Corrected various wrong... (f46a2a97) · Commits · Centre for Testing and Interoperability / Markdown specifications development / spec2md

spec2md.py

+37 −19

Original line number	Diff line number	Diff line
		@@ -52,8 +52,8 @@ unreferencedSubDir = 'unreferenced'
		_linebreak = '<br />'
		_entityLt = '&lt;'
		_nbsp = ' '
		_tocInsertPoint = '__t_o_c__'
		_captionMarker = '__CAPTION__'
		_tocInsertPoint = '~~t~o~c~~'
		_captionMarker = '~~CAPTION~~'


		# https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1
		@@ -335,6 +335,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		_bold = '**'
		case 'i' if ep.attrib.get(_val, 'true') == 'true':
		_italics = '_'
		# case _:
		# _print(f'[yellow]unsupported style: {ep.tag}')

		# Strip white spaces if bold or italics
		_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
		@@ -342,11 +344,13 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		_s = _s.replace('_', '\\_')
		_s = _s.replace('', '\\')
		# Add trailing white space when bold or italics
		_prefix = ' ' if _bold or _italics else ''
		_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_prefix}'
		_postfix = ' ' if _bold or _italics else ''
		_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
		# print(_result)

		case 'br':
		_result += _linebreak

		case 'bookmarkStart' | 'bookmarkEnd': # TODO ?
		pass

		@@ -366,17 +370,18 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip',
		namespaces = {
		'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
		'ns3' : wns,
		'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main',
		'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
		})
		if blip and \
		(rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \
		(mediaFile := mediaRelations.get(rId)):
		referencedImages.append(Path(mediaFile).stem) # Add to referenced files
		if docConfig.renameEMFExtension and mediaFile.lower().endswith('.emf'):
		mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}'))
		_print(f'[yellow]Renaming EMF file reference to "{mediaFile}"')
		_result += f'![{_captionMarker}]({mediaFile})'
		mediaFilePath = Path(mediaFile)
		referencedImages.append(mediaFilePath.stem) # Add to referenced files
		if docConfig.renameEMFExtension and mediaFilePath.suffix.lower() == '.emf':
		mediaFilePath = mediaFilePath.with_suffix(f'.{docConfig.renameEMFExtension}')
		_print(f'[yellow]Renaming EMF file reference to "{str(mediaFilePath)}"')
		_result += f'![{_captionMarker}]({mediaFilePath.as_posix()})' # image reference as posix path
		# else:
		# _print(blip)

		@@ -423,10 +428,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		case Paragraph(): # type: ignore[misc]
		return _parseXML(ET.fromstring(elem._p.xml))
		case _Cell(): # type: ignore[misc]
		result = ''
		for p in elem.paragraphs:
		result += _parseXML(ET.fromstring(p._p.xml), True)
		return result
		# Iterate over all paragraphs in the cell and parse them
		# Create a list of parsed paragraphs and join them with linebreaks
		return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
		for p in elem.paragraphs ])
		case _:
		return ''

		@@ -614,7 +619,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		# Table Caption
		elif style in docConfig.tablecaption:
		lines.append('')
		lines.append(f'{replaceNL(text).strip()}')
		caption = replaceNL(text).strip()
		anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
		lines.append(f'{caption}{anchor}')

		@@ -679,12 +684,16 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:

		# Warning if this is a single-row table
		if nrRows == 1:
		_print(f'[red]Single-row table found. Consider replacing it in the original document:\n{rows[0]}')
		_print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)

		lines.append('') # Add an empty line before a table
		for idx, row in enumerate(rows):

		# Check for a table caption and add separator line
		if idx == 1:
		lines.append('-'.join('|' * (len(row) + 1) ))

		# Add table row
		lines.append(f'|{"|".join(row)}|'
		.replace('\n', _linebreak)) # replace line breaks in cells
		lines.append('') # Add another empty line after a table
		@@ -719,7 +728,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		line = lines[i]
		line = line.replace('__', '')
		line = line.replace('****', '')
		line = line.replace(' ', ' ')
		#line = line.replace(' ', ' ')
		lines[i] = line


		@@ -775,6 +784,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		line = lines[i]
		lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type]


		#
		# List unresolved CAPTION markers
		#
		for i in range(len(lines)):
		line = lines[i]
		if _captionMarker in line:
		_print(f'[yellow]Unresolved figure caption : \[{i}] "{line}"')

		#
		# Write produced Markdown file
		#