Loading config.ini +19 −6 Original line number Diff line number Diff line Loading @@ -31,18 +31,31 @@ imageCaptions2AltText = true [toc] # Add section numbers to the headlines addSectionNumbers = false excludeFromNumbering = # Exclude the following paragraph types from numbering. # The default is to exclude the "Content" heading. excludeFromNumbering = tt # The paragraph type that is used in the original document for the table of contents. tocStartParagraph = heading no numbering # The level of the table of contents. tocHeaderLevel = 1 # Automatically generate a table of contents. generateToc = false # Add a macro "[toc]" to the document that can be used to generate a table of contents. # Some converters and viewer support this macro. addTocMacro = false [paragraphs] normal = normal h1 = heading 1, tt h2 = heading 2 normal = normal, onem2m-normal h1 = heading 1, tt, onem2m-heading1 h2 = heading 2, onem2m-heading2 h3 = heading 3 h4 = heading 4 h5 = heading 5 Loading @@ -53,7 +66,7 @@ h9 = heading 9 a1 = heading 1 a2 = heading 2 a3 = heading 3 note = no note = no, onem2m-iprtitle, onem2m-ipr code = pl example = ex, ew ul1 = b1, b1+, list paragraph Loading @@ -63,7 +76,7 @@ ul4 = b4, b4+ ul5 = b5, b5+ ol1 = bn ol2 = bl tablecaption = caption, th tablecaption = caption, th, onem2m-tabletitle imagecaption = tf image = fl empty = fp Loading spec2md.py +67 −16 Original line number Diff line number Diff line Loading @@ -9,10 +9,12 @@ from enum import IntEnum, auto from typing import Callable, Tuple, Dict, Optional from typing import Callable, Tuple, Dict, Optional, Any from pathlib import Path, PurePath from docx.document import Document from docx.text.paragraph import Paragraph from docx.package import Package import docx.opc.exceptions from docx.table import _Cell, Table from docx.oxml.table import CT_Tbl Loading Loading @@ -201,7 +203,7 @@ class DocumentConfiguration(object): def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None: docs:Dict[str, Tuple[Document, DocumentConfiguration]] = {} docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]] = {} ptasks = {} mediaRelations:Dict[str, str] = {} addSectionNumbers = False Loading @@ -209,6 +211,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: headers:list[Tuple[int, str]] = [] emfFiles:list[str] = [] referencedImages:list[str] = [] footnotes:dict[str, str] = {} global _print Loading Loading @@ -292,7 +295,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: return tag def getTextFromXML(elem:Paragraph|_Cell) -> str: def getTextFromXML(elem:Paragraph|_Cell|ET._Element) -> str: # Not-used document tags. _ignoredTags = ( 'AlternateContent', Loading @@ -310,6 +313,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: 'commentRangeStart', 'commentRangeEnd', 'commentReference', 'smartTag', 'footnoteRef', ) Loading Loading @@ -405,13 +410,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: pass # ignore a soft hyphen character which has no meaning in Markdown and zero-width case 'sym': def _symError(ch:str) -> str: def _symError(ch:int) -> None: nonlocal _result _symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]} ({ch})"' _print(f'[yellow]{_symError}') _result += f'<mark>{_symError}</mark>' try: _ch = '????' _ch = 0 _ch = int(element.attrib["{"+wns+"}char"], 16) if _ch in docConfig.characters: if (rch := docConfig.characters[_ch]) == chr(0): Loading @@ -431,6 +438,18 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: for x in element: _result += _parseXML(x) case 'footnoteReference': id = element.attrib[f'{{{wns}}}id'] _result += f'[^{id}]' footnotes[id] = '<mark>unknown footnote</mark>' # The footnote itself is not included in the document but in a separate file. # Therefore, we need to extract the footnote from the footnotes.xml file. The format # of the footnote is the same as a paragraph. case 'footnote': for x in element: _result += _parseXML(x) case _ if tag in _ignoredTags: # ignore pass Loading @@ -448,6 +467,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: # Create a list of parsed paragraphs and join them with linebreaks return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() for p in elem.paragraphs ]) case ET._Element(): return _parseXML(elem) case _: return '' Loading Loading @@ -480,7 +501,12 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: stopProgress(f'[red]Input document "{d}" is not a file') return try: docs[d] = (docx.Document(d), DocumentConfiguration(d)) # Search for footnotes in the document XML footnotesPart = None for part in Package.open(d).parts: if part.partname.endswith('/footnotes.xml'): footnotesPart = part docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart) ptasks[d] = progress.add_task(f'Processing {d}', total = None) progress.update(readTask, advance=1) except docx.opc.exceptions.PackageNotFoundError as e: Loading @@ -495,7 +521,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: # Processing Documents # for docFileName, (doc, docConfig) in docs.items(): for docFileName, (doc, docConfig, footnotesPart) in docs.items(): processTask = ptasks[docFileName] docItems = list(iter_block_items(doc)) addSectionNumbers = docConfig.addSectionNumbers Loading @@ -517,7 +543,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: return # Add sub-progress task progress.update(processTask, total = len(docItems) + 5) # + relations + image extraction + characters + toc + media convert progress.update(processTask, total = len(docItems) + 6) # + relations + image extraction + characters + toc + footnotes + media convert # Extract the media relations file, and get the mappings from document IDs to media files Loading Loading @@ -769,7 +795,9 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: elif codeblock: # Add whole code block to lines _lines.append('```') _lines.append('') _lines.extend(codeblock) _lines.append('') _lines.append('```') codeblock = [] else: Loading Loading @@ -830,6 +858,29 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type] # # Process footnotes # progress.update(processTask, advance = 1) # progress update if len(footnotes) and footnotesPart is not None: _print(f'[yellow]Footnotes found: {len(footnotes)}') # Analyze footnotes file footnotesXML = ET.fromstring(footnotesPart.blob) # Process the footnotes XML here for element in footnotesXML: # Footnote found if strippedTag(element.tag) == 'footnote': footnoteID = element.attrib[f'{{{wns}}}id'] if footnoteID in footnotes: t = getTextFromXML(element) footnotes[footnoteID] = t # Add footnotes to the end of the document lines.append('') for fid, text in footnotes.items(): lines.append(f'[^{fid}]: {text}') # # List unresolved CAPTION markers # Loading Loading
config.ini +19 −6 Original line number Diff line number Diff line Loading @@ -31,18 +31,31 @@ imageCaptions2AltText = true [toc] # Add section numbers to the headlines addSectionNumbers = false excludeFromNumbering = # Exclude the following paragraph types from numbering. # The default is to exclude the "Content" heading. excludeFromNumbering = tt # The paragraph type that is used in the original document for the table of contents. tocStartParagraph = heading no numbering # The level of the table of contents. tocHeaderLevel = 1 # Automatically generate a table of contents. generateToc = false # Add a macro "[toc]" to the document that can be used to generate a table of contents. # Some converters and viewer support this macro. addTocMacro = false [paragraphs] normal = normal h1 = heading 1, tt h2 = heading 2 normal = normal, onem2m-normal h1 = heading 1, tt, onem2m-heading1 h2 = heading 2, onem2m-heading2 h3 = heading 3 h4 = heading 4 h5 = heading 5 Loading @@ -53,7 +66,7 @@ h9 = heading 9 a1 = heading 1 a2 = heading 2 a3 = heading 3 note = no note = no, onem2m-iprtitle, onem2m-ipr code = pl example = ex, ew ul1 = b1, b1+, list paragraph Loading @@ -63,7 +76,7 @@ ul4 = b4, b4+ ul5 = b5, b5+ ol1 = bn ol2 = bl tablecaption = caption, th tablecaption = caption, th, onem2m-tabletitle imagecaption = tf image = fl empty = fp Loading
spec2md.py +67 −16 Original line number Diff line number Diff line Loading @@ -9,10 +9,12 @@ from enum import IntEnum, auto from typing import Callable, Tuple, Dict, Optional from typing import Callable, Tuple, Dict, Optional, Any from pathlib import Path, PurePath from docx.document import Document from docx.text.paragraph import Paragraph from docx.package import Package import docx.opc.exceptions from docx.table import _Cell, Table from docx.oxml.table import CT_Tbl Loading Loading @@ -201,7 +203,7 @@ class DocumentConfiguration(object): def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None: docs:Dict[str, Tuple[Document, DocumentConfiguration]] = {} docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]] = {} ptasks = {} mediaRelations:Dict[str, str] = {} addSectionNumbers = False Loading @@ -209,6 +211,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: headers:list[Tuple[int, str]] = [] emfFiles:list[str] = [] referencedImages:list[str] = [] footnotes:dict[str, str] = {} global _print Loading Loading @@ -292,7 +295,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: return tag def getTextFromXML(elem:Paragraph|_Cell) -> str: def getTextFromXML(elem:Paragraph|_Cell|ET._Element) -> str: # Not-used document tags. _ignoredTags = ( 'AlternateContent', Loading @@ -310,6 +313,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: 'commentRangeStart', 'commentRangeEnd', 'commentReference', 'smartTag', 'footnoteRef', ) Loading Loading @@ -405,13 +410,15 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: pass # ignore a soft hyphen character which has no meaning in Markdown and zero-width case 'sym': def _symError(ch:str) -> str: def _symError(ch:int) -> None: nonlocal _result _symError = f'unknown font+symbol: {element.attrib["{"+wns+"}font"]} - "{element.attrib["{"+wns+"}char"]} ({ch})"' _print(f'[yellow]{_symError}') _result += f'<mark>{_symError}</mark>' try: _ch = '????' _ch = 0 _ch = int(element.attrib["{"+wns+"}char"], 16) if _ch in docConfig.characters: if (rch := docConfig.characters[_ch]) == chr(0): Loading @@ -431,6 +438,18 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: for x in element: _result += _parseXML(x) case 'footnoteReference': id = element.attrib[f'{{{wns}}}id'] _result += f'[^{id}]' footnotes[id] = '<mark>unknown footnote</mark>' # The footnote itself is not included in the document but in a separate file. # Therefore, we need to extract the footnote from the footnotes.xml file. The format # of the footnote is the same as a paragraph. case 'footnote': for x in element: _result += _parseXML(x) case _ if tag in _ignoredTags: # ignore pass Loading @@ -448,6 +467,8 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: # Create a list of parsed paragraphs and join them with linebreaks return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip() for p in elem.paragraphs ]) case ET._Element(): return _parseXML(elem) case _: return '' Loading Loading @@ -480,7 +501,12 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: stopProgress(f'[red]Input document "{d}" is not a file') return try: docs[d] = (docx.Document(d), DocumentConfiguration(d)) # Search for footnotes in the document XML footnotesPart = None for part in Package.open(d).parts: if part.partname.endswith('/footnotes.xml'): footnotesPart = part docs[d] = (docx.Document(d), DocumentConfiguration(d), footnotesPart) ptasks[d] = progress.add_task(f'Processing {d}', total = None) progress.update(readTask, advance=1) except docx.opc.exceptions.PackageNotFoundError as e: Loading @@ -495,7 +521,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: # Processing Documents # for docFileName, (doc, docConfig) in docs.items(): for docFileName, (doc, docConfig, footnotesPart) in docs.items(): processTask = ptasks[docFileName] docItems = list(iter_block_items(doc)) addSectionNumbers = docConfig.addSectionNumbers Loading @@ -517,7 +543,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: return # Add sub-progress task progress.update(processTask, total = len(docItems) + 5) # + relations + image extraction + characters + toc + media convert progress.update(processTask, total = len(docItems) + 6) # + relations + image extraction + characters + toc + footnotes + media convert # Extract the media relations file, and get the mappings from document IDs to media files Loading Loading @@ -769,7 +795,9 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: elif codeblock: # Add whole code block to lines _lines.append('```') _lines.append('') _lines.extend(codeblock) _lines.append('') _lines.append('```') codeblock = [] else: Loading Loading @@ -830,6 +858,29 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion: lines[i] = re.sub(_referenceExpression, _repl, line) # type:ignore[arg-type] # # Process footnotes # progress.update(processTask, advance = 1) # progress update if len(footnotes) and footnotesPart is not None: _print(f'[yellow]Footnotes found: {len(footnotes)}') # Analyze footnotes file footnotesXML = ET.fromstring(footnotesPart.blob) # Process the footnotes XML here for element in footnotesXML: # Footnote found if strippedTag(element.tag) == 'footnote': footnoteID = element.attrib[f'{{{wns}}}id'] if footnoteID in footnotes: t = getTextFromXML(element) footnotes[footnoteID] = t # Add footnotes to the end of the document lines.append('') for fid, text in footnotes.items(): lines.append(f'[^{fid}]: {text}') # # List unresolved CAPTION markers # Loading