Commit f46a2a97 authored by ankraft
Browse files

Improved parsing and generation of tables. Corrected various wrong...

Improved parsing and generation of tables. Corrected various wrong replacements. Listed unresolved captions.
parent 6401024d
Loading
Loading
Loading
Loading
+37 −19
Original line number Original line Diff line number Diff line
@@ -52,8 +52,8 @@ unreferencedSubDir = 'unreferenced'
_linebreak = '<br />'
_linebreak = '<br />'
_entityLt = '&lt;'
_entityLt = '&lt;'
_nbsp = '&nbsp;'
_nbsp = '&nbsp;'
_tocInsertPoint = '__t_o_c__'
_tocInsertPoint = '~~t~o~c~~'
_captionMarker = '__CAPTION__'
_captionMarker = '~~CAPTION~~'




# https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1
# https://learn.microsoft.com/en-us/dotnet/api/documentformat.openxml.wordprocessing?view=openxml-2.8.1
@@ -335,6 +335,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
											_bold = '**'
											_bold = '**'
										case 'i' if ep.attrib.get(_val, 'true') == 'true':
										case 'i' if ep.attrib.get(_val, 'true') == 'true':
											_italics = '_'
											_italics = '_'
										# case _:
										# 	_print(f'[yellow]unsupported style: {ep.tag}')
						
						
						# Strip white spaces if bold or italics
						# Strip white spaces if bold or italics
						_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
						_s = str(toMD(str(element.text))).strip() if _bold or _italics else str(toMD(str(element.text)))
@@ -342,11 +344,13 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
						_s = _s.replace('_', '\\_')
						_s = _s.replace('_', '\\_')
						_s = _s.replace('*', '\\*')
						_s = _s.replace('*', '\\*')
						# Add trailing white space when bold or italics
						# Add trailing white space when bold or italics
						_prefix = ' ' if _bold or _italics else ''
						_postfix = ' ' if _bold or _italics else ''
						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_prefix}'
						_result += f'{_bold}{_italics}{_s}{_italics}{_bold}{_postfix}'
						# print(_result)


					case 'br':
					case 'br':
						_result += _linebreak
						_result += _linebreak
						
					case 'bookmarkStart' | 'bookmarkEnd':		# TODO ?
					case 'bookmarkStart' | 'bookmarkEnd':		# TODO ?
						pass
						pass


@@ -366,17 +370,18 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
						blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', 
						blip = element.findall('ns1:inline/ns3:graphic/ns3:graphicData/ns4:pic/ns4:blipFill/ns3:blip', 
												namespaces = { 
												namespaces = { 
													'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
													'ns1' : 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
													'ns3' : wns,
													'ns3' : 'http://schemas.openxmlformats.org/drawingml/2006/main',
													'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
													'ns4' : 'http://schemas.openxmlformats.org/drawingml/2006/picture',
												})
												})
						if blip and \
						if blip and \
							(rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \
							(rId := blip[0].attrib.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')) and \
							(mediaFile := mediaRelations.get(rId)):
							(mediaFile := mediaRelations.get(rId)):
							referencedImages.append(Path(mediaFile).stem)	# Add to referenced files
							mediaFilePath = Path(mediaFile)
							if docConfig.renameEMFExtension and mediaFile.lower().endswith('.emf'):
							referencedImages.append(mediaFilePath.stem)	# Add to referenced files
								mediaFile = str(Path(mediaFile).with_suffix(f'.{docConfig.renameEMFExtension}'))
							if docConfig.renameEMFExtension and mediaFilePath.suffix.lower() == '.emf':
								_print(f'[yellow]Renaming EMF file reference to "{mediaFile}"')
								mediaFilePath = mediaFilePath.with_suffix(f'.{docConfig.renameEMFExtension}')
							_result += f'![{_captionMarker}]({mediaFile})'
								_print(f'[yellow]Renaming EMF file reference to "{str(mediaFilePath)}"')
							_result += f'![{_captionMarker}]({mediaFilePath.as_posix()})'	# image reference as posix path
						# else:
						# else:
						# 	_print(blip)
						# 	_print(blip)


@@ -423,10 +428,10 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
				case Paragraph():	# type: ignore[misc]
				case Paragraph():	# type: ignore[misc]
					return _parseXML(ET.fromstring(elem._p.xml))
					return _parseXML(ET.fromstring(elem._p.xml))
				case _Cell():		# type: ignore[misc]
				case _Cell():		# type: ignore[misc]
					result = ''
					# Iterate over all paragraphs in the cell and parse them
					for p in elem.paragraphs:
					# Create a list of parsed paragraphs and join them with linebreaks
						result += _parseXML(ET.fromstring(p._p.xml), True)
					return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() 
					return result
										   for p in elem.paragraphs ])
				case _:
				case _:
					return ''
					return ''


@@ -614,7 +619,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
						#	Table Caption
						#	Table Caption
						elif style in docConfig.tablecaption:
						elif style in docConfig.tablecaption:
							lines.append('')
							lines.append('')
							lines.append(f'**{replaceNL(text).strip()}**')
							caption = replaceNL(text).strip()
							anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
							anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
							lines.append(f'**{caption}**{anchor}')
							lines.append(f'**{caption}**{anchor}')


@@ -679,12 +684,16 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
						
						
						# Warning if this is a single-row table
						# Warning if this is a single-row table
						if nrRows == 1:
						if nrRows == 1:
							_print(f'[red]Single-row table found. Consider replacing it in the original document:\n{rows[0]}')
							_print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)


						lines.append('')	# Add an empty line before a table
						lines.append('')	# Add an empty line before a table
						for idx, row in enumerate(rows):
						for idx, row in enumerate(rows):

							# Add the header separator line before the second row (i.e. after the header row)
							if idx == 1:
							if idx == 1:
								lines.append('-'.join('|' * (len(row) + 1) ))
								lines.append('-'.join('|' * (len(row) + 1) ))
							
							# Add table row
							lines.append(f'|{"|".join(row)}|'
							lines.append(f'|{"|".join(row)}|'
										 .replace('\n', _linebreak))	# replace line breaks in cells
										 .replace('\n', _linebreak))	# replace line breaks in cells
						lines.append('')	# Add another empty line after a table
						lines.append('')	# Add another empty line after a table
@@ -719,7 +728,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
				line = lines[i]
				line = lines[i]
				line = line.replace('__', '')
				line = line.replace('__', '')
				line = line.replace('****', '')
				line = line.replace('****', '')
				line = line.replace('  ', ' ')
				#line = line.replace('  ', ' ')
				lines[i] = line
				lines[i] = line




@@ -775,6 +784,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
				line = lines[i]
				line = lines[i]
				lines[i] = re.sub(_referenceExpression, _repl, line)	# type:ignore[arg-type]
				lines[i] = re.sub(_referenceExpression, _repl, line)	# type:ignore[arg-type]



			#
			#	List unresolved CAPTION markers
			#
			for i in range(len(lines)):
				line = lines[i]
				if _captionMarker in line:
					_print(f'[yellow]Unresolved figure caption : \[{i}] "{line}"')
			
			#
			#
			#	Write produced Markdown file
			#	Write produced Markdown file
			#
			#