Commit f46a2a97 authored by ankraft's avatar ankraft
Browse files

Improved parsing and generation of tables. Corrected various wrong...

Improved parsing and generation of tables. Corrected various wrong replacements. Listed unresolved captions.
parent 6401024d
Loading
Loading
Loading
Loading
+37 −19
Original line number Diff line number Diff line
@@ -52,8 +52,8 @@ unreferencedSubDir = 'unreferenced'
_linebreak = '<br />'
_entityLt = '&lt;'
_nbsp = '&nbsp;'
_tocInsertPoint = '__t_o_c__'
_captionMarker = '__CAPTION__'
_tocInsertPoint = '~~t~o~c~~'
_captionMarker = '~~CAPTION~~'


# https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1
@@ -335,6 +335,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
											_bold = '**'
										case 'i' if ep.attrib.get(_val, 'true') == 'true':
											_italics = '_'
										# case _:
										# 	_print(f'[yellow]unsupported style: {ep.tag}')
						
						# Strip white spaces if bold or italics
						_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
@@ -342,11 +344,13 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
						_s = _s.replace('_', '\\_')
						_s = _s.replace('*', '\\*')
						# Add trailing white space when bold or italics
						_prefix = ' ' if _bold or _italics else ''
						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_prefix}'
						_postfix = ' ' if _bold or _italics else ''
						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
						# print(_result)

					case 'br':
						_result += _linebreak
						
					case 'bookmarkStart' | 'bookmarkEnd':		# TODO ?
						pass

@@ -366,17 +370,18 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
						blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', 
												namespaces = { 
													'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
													'ns3' : wns,
													'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main',
													'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
												})
						if blip and \
							(rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \
							(mediaFile := mediaRelations.get(rId)):
							referencedImages.append(Path(mediaFile).stem)	# Add to referenced files
							if docConfig.renameEMFExtension and mediaFile.lower().endswith('.emf'):
								mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}'))
								_print(f'[yellow]Renaming EMF file reference to "{mediaFile}"')
							_result += f'![{_captionMarker}]({mediaFile})'
							mediaFilePath = Path(mediaFile)
							referencedImages.append(mediaFilePath.stem)	# Add to referenced files
							if docConfig.renameEMFExtension and mediaFilePath.suffix.lower() == '.emf':
								mediaFilePath = mediaFilePath.with_suffix(f'.{docConfig.renameEMFExtension}')
								_print(f'[yellow]Renaming EMF file reference to "{str(mediaFilePath)}"')
							_result += f'![{_captionMarker}]({mediaFilePath.as_posix()})'	# image reference as posix path
						# else:
						# 	_print(blip)

@@ -423,10 +428,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
				case Paragraph():	# type: ignore[misc]
					return _parseXML(ET.fromstring(elem._p.xml))
				case _Cell():		# type: ignore[misc]
					result = ''
					for p in elem.paragraphs:
						result += _parseXML(ET.fromstring(p._p.xml), True)
					return result
					# Iterate over all paragraphs in the cell and parse them
					# Create a list of parsed paragraphs and join them with linebreaks
					return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() 
										   for p in elem.paragraphs ])
				case _:
					return ''

@@ -614,7 +619,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
						#	Table Caption
						elif style in docConfig.tablecaption:
							lines.append('')
							lines.append(f'**{replaceNL(text).strip()}**')
							caption = replaceNL(text).strip()
							anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
							lines.append(f'**{caption}**{anchor}')

@@ -679,12 +684,16 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
						
						# Warning if this is a single-row table
						if nrRows == 1:
							_print(f'[red]Single-row table found. Consider replacing it in the original document:\n{rows[0]}')
							_print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)

						lines.append('')	# Add an empty line before a table
						for idx, row in enumerate(rows):

							# After the first (header) row, add the Markdown header separator line
							if idx == 1:
								lines.append('-'.join('|' * (len(row) + 1) ))
							
							# Add table row
							lines.append(f'|{"|".join(row)}|'
										 .replace('\n', _linebreak))	# replace line breaks in cells
						lines.append('')	# Add another empty line after a table
@@ -719,7 +728,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
				line = lines[i]
				line = line.replace('__', '')
				line = line.replace('****', '')
				line = line.replace('  ', ' ')
				#line = line.replace('  ', ' ')
				lines[i] = line


@@ -775,6 +784,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
				line = lines[i]
				lines[i] = re.sub(_referenceExpression, _repl, line)	# type:ignore[arg-type]


			#
			#	List unresolved CAPTION markers
			#
			for i in range(len(lines)):
				line = lines[i]
				if _captionMarker in line:
					_print(f'[yellow]Unresolved figure caption : \[{i}] "{line}"')
			
			#
			#	Write produced Markdown file
			#