Loading generateBaseline/pandocFilter.py +10 −6 Original line number Diff line number Diff line Loading @@ -211,21 +211,25 @@ def correctTableSeparators(progress: Progress, mdLines: list[str]) -> list[str]: return _lines def process(document:str, outDirectory:str) -> None: def process(args) -> None: with Progress(TextColumn('{task.description}'), TimeElapsedColumn()) as progress: mdLines = readMDFile(progress, document) mdLines = readMDFile(progress, args.document) mdLines = correctTOC(progress, mdLines) mdLines = replaceTableCaptions(progress, mdLines) mdLines = replaceFigureCaptions(progress, mdLines) if args.figure_paths: mdLines = replaceFiguresPathSvgToPng(progress, mdLines) mdLines = replaceLineBreaks(progress, mdLines) if args.table_separators: mdLines = correctTableSeparators(progress, mdLines) writeMDFile(progress, mdLines, document, outDirectory) writeMDFile(progress, mdLines, args.document, args.outDirectory) def main(args=None): # Parse command line arguments parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) parser.add_argument('-ts', '--table-separators', action='store_true', required=False, default=False, help="Correct table separators") parser.add_argument('-fp', '--figure-paths', action='store_true', required=False, default=False, help="Replace figure paths") parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory') parser.add_argument('document', help = 'document to parse') args = parser.parse_args() Loading @@ -233,7 +237,7 @@ def main(args=None): # Process documents and print output os.makedirs(args.outDirectory, exist_ok = True) process(args.document, args.outDirectory) process(args) if __name__ == '__main__': sys.exit(main()) No newline at end of file generateBaseline/postprocessing.py +99 −36 Original line number Diff line number Diff line Loading @@ -3,6 +3,10 @@ import os import re import warnings from 
pathlib import Path import zipfile from lxml import etree import tempfile import shutil import win32com.client #pip install pywin32 Loading @@ -11,7 +15,6 @@ from docx.oxml import OxmlElement from docx.oxml.ns import qn from docx.shared import Cm from errors import ErrorHandler, Level, Mode from file_helper import get_all_files_from_dir Loading @@ -37,7 +40,7 @@ def apply_standard_style_to_unformatted_paragraphs(config): p.style = standard_style_name changed += 1 ErrorHandler()(f"Changed style to '{standard_style_name}' for {changed} paragraphs.") print(f'Changed style to {standard_style_name} for {changed} paragraphs.') doc.save(output_path) def rotate_cell_text(cell): Loading Loading @@ -86,7 +89,7 @@ def update_word_fields(config_path: dict|str): relativer_pfad = Path(docx_path) docx_absolute_path = relativer_pfad.resolve() if not os.path.isfile(docx_absolute_path): ErrorHandler()(f"File not found: {docx_absolute_path}", Level.ERROR) print(f'File not found: {docx_absolute_path}') # Word starten word = win32com.client.Dispatch("Word.Application") Loading @@ -105,7 +108,7 @@ def update_word_fields(config_path: dict|str): # Schließen doc.Close() ErrorHandler()(f"Fields in '{docx_absolute_path}' updated and saved") print(f'Fields in {docx_absolute_path} updated and saved') finally: word.Quit() Loading @@ -128,7 +131,7 @@ def insert_page_break_before_long_tables(config): end_page = doc.Range(end - 1, end - 1).Information(3) if end_page > start_page: ErrorHandler()(f"Table {i + 1} is on a page break: {start_page} -> {end_page}", Level.INFO) print(f'Table {i + 1} is on a page break: {start_page} -> {end_page}') # Seitenumbruch einfügen para = doc.Range(start, start) para.InsertBreak(7) # wdPageBreak = 7 Loading @@ -138,39 +141,99 @@ def insert_page_break_before_long_tables(config): doc.Close() word.Quit() def update_toc_level(config): docx_path = config.get("output_docx") word = win32com.client.Dispatch("Word.Application") word.Visible = False def 
format_toc_header(xml_data, ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}): root = etree.fromstring(xml_data) doc = word.Documents.Open(docx_path) counter = 0 # Find <w:pStyle w:val="TOCHeading"> for pstyle in root.xpath('.//w:pStyle[@w:val="TOCHeading"]', namespaces=ns): # Change it to be <w:pStyle w:val="TT"> old_text = pstyle pstyle.set(f"{{{ns['w']}}}val", "TT") counter+=1 print(f'Changed Style "TOCHeading" to "TT" {counter} times') return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes") # Wenn kein TOC vorhanden ist, kannst du eins hinzufügen: if doc.TablesOfContents.Count == 0: # Inhaltsverzeichnis am Anfang des Dokuments einfügen doc.TablesOfContents.Add( Range=doc.Range(0, 0), UseHeadingStyles=True, UpperHeadingLevel=1, LowerHeadingLevel=9, # 👉 bis Heading 9 UseHyperlinks=True, HidePageNumbersInWeb=False, UseOutlineLevels=True ) # Vorhandenes TOC anpassen toc = doc.TablesOfContents(1) #Formating heading -> ToDo: last line not working so skipped for the moment #toc_range = toc.Range #heading_para = toc_range.Paragraphs(1) #heading_para.Style = doc.Styles("Heading 1") #set level range from 1-9 toc.UpperHeadingLevel = 1 toc.LowerHeadingLevel = 9 toc.Update() doc.SaveAs(docx_path) doc.Close() word.Quit() def update_toc_level(xml_data, ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}): root = etree.fromstring(xml_data) new_range = "1-9" # Regex for \o "x-y" with x and y being numbers pattern = re.compile(r'(?<=\\o )"\d+-\d+"\s*') # Loop over all elements to find "TOC" for elem in root.xpath('.//w:instrText', namespaces=ns): if 'TOC' in elem.text: old_text = elem.text elem.text = pattern.sub('', elem.text) print(f'Changed TOC: {old_text} → {elem.text}') return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes") def update_toc(docx_input, docx_output): # read xml with zipfile.ZipFile(docx_input, 'r') as zin: xml_data = zin.read("word/document.xml") 
xml_data = update_toc_level(xml_data) xml_data = format_toc_header(xml_data) # create temp file tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx") os.close(tmp_fd) # Datei wird nur über zipfile geöffnet try: # write new docx to temp file with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout: for item in zin.infolist(): if item.filename != "word/document.xml": data = zin.read(item.filename) zout.writestr(item.filename, data) zout.writestr("word/document.xml", xml_data) # Write to output file shutil.move(tmp_path, docx_output) finally: # delete temp file if still existing if os.path.exists(tmp_path): os.remove(tmp_path) #def update_toc_level(config): # docx_path = config.get("output_docx") # word = win32com.client.Dispatch("Word.Application") # word.Visible = False # # doc = word.Documents.Open(docx_path) # # # Wenn kein TOC vorhanden ist, kannst du eins hinzufügen: # if doc.TablesOfContents.Count == 0: # # Inhaltsverzeichnis am Anfang des Dokuments einfügen # doc.TablesOfContents.Add( # Range=doc.Range(0, 0), # UseHeadingStyles=True, # UpperHeadingLevel=1, # LowerHeadingLevel=9, # 👉 bis Heading 9 # UseHyperlinks=True, # HidePageNumbersInWeb=False, # UseOutlineLevels=True # ) # # Vorhandenes TOC anpassen # toc = doc.TablesOfContents(1) # #Formating heading -> ToDo: last line not working so skipped for the moment # #toc_range = toc.Range # #heading_para = toc_range.Paragraphs(1) # #heading_para.Style = doc.Styles("Heading 1") # #set level range from 1-9 # toc.UpperHeadingLevel = 1 # toc.LowerHeadingLevel = 9 # toc.Update() # doc.SaveAs(docx_path) # doc.Close() # word.Quit() def table_widths_adjustment(config): table_path = config.get("tables_folder") Loading generateBaseline/setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -20,7 +20,7 @@ setup( "table_width_adjustment=postprocessing:table_widths_adjustment", "check_multipage_tables=postprocessing:insert_page_break_before_long_tables", 
#"apply_etsi_styling: postprocessing:postprocess_etsi_styles", "update_toc_level=postprocessing:update_toc_level", "update_toc=postprocessing:update_toc", ] } Loading Loading
# generateBaseline/pandocFilter.py -- excerpt reconstructed from the diff view
# (post-change state). The function preceding this excerpt,
# correctTableSeparators(), ends with `return _lines`.


def process(args: argparse.Namespace) -> None:
    """Run the Markdown clean-up pipeline on one document.

    The steps run in a fixed order. Two of them are optional and are only
    executed when the corresponding command-line flag was given.

    Args:
        args: Parsed command-line arguments. The attributes read here are
            ``document`` (file handed to readMDFile / writeMDFile),
            ``outDirectory`` (handed to writeMDFile), ``figure_paths``
            (enables replaceFiguresPathSvgToPng) and ``table_separators``
            (enables correctTableSeparators).
    """
    with Progress(TextColumn('{task.description}'), TimeElapsedColumn()) as progress:
        mdLines = readMDFile(progress, args.document)
        mdLines = correctTOC(progress, mdLines)
        mdLines = replaceTableCaptions(progress, mdLines)
        mdLines = replaceFigureCaptions(progress, mdLines)
        if args.figure_paths:
            mdLines = replaceFiguresPathSvgToPng(progress, mdLines)
        mdLines = replaceLineBreaks(progress, mdLines)
        if args.table_separators:
            mdLines = correctTableSeparators(progress, mdLines)
        writeMDFile(progress, mdLines, args.document, args.outDirectory)


def main(args=None):
    """Parse the command line and process the requested document.

    Args:
        args: Optional list of argument strings. ``None`` (the default)
            makes argparse fall back to ``sys.argv[1:]``.
    """
    # Parse command line arguments
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-ts', '--table-separators', action='store_true',
                        help="Correct table separators")
    parser.add_argument('-fp', '--figure-paths', action='store_true',
                        help="Replace figure paths")
    parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default='out',
                        metavar='<output directory>', help='specify output directory')
    parser.add_argument('document', help='document to parse')
    # Forward the caller-supplied argument list. Previously `args` was ignored,
    # so calling main([...]) programmatically still parsed sys.argv.
    args = parser.parse_args(args)

    # NOTE(review): the diff view hides one source line at this position
    # (between the two hunks) -- confirm against the real file.

    # Process documents and print output
    os.makedirs(args.outDirectory, exist_ok=True)
    process(args)


if __name__ == '__main__':
    sys.exit(main())
generateBaseline/postprocessing.py +99 −36 Original line number Diff line number Diff line Loading @@ -3,6 +3,10 @@ import os import re import warnings from pathlib import Path import zipfile from lxml import etree import tempfile import shutil import win32com.client #pip install pywin32 Loading @@ -11,7 +15,6 @@ from docx.oxml import OxmlElement from docx.oxml.ns import qn from docx.shared import Cm from errors import ErrorHandler, Level, Mode from file_helper import get_all_files_from_dir Loading @@ -37,7 +40,7 @@ def apply_standard_style_to_unformatted_paragraphs(config): p.style = standard_style_name changed += 1 ErrorHandler()(f"Changed style to '{standard_style_name}' for {changed} paragraphs.") print(f'Changed style to {standard_style_name} for {changed} paragraphs.') doc.save(output_path) def rotate_cell_text(cell): Loading Loading @@ -86,7 +89,7 @@ def update_word_fields(config_path: dict|str): relativer_pfad = Path(docx_path) docx_absolute_path = relativer_pfad.resolve() if not os.path.isfile(docx_absolute_path): ErrorHandler()(f"File not found: {docx_absolute_path}", Level.ERROR) print(f'File not found: {docx_absolute_path}') # Word starten word = win32com.client.Dispatch("Word.Application") Loading @@ -105,7 +108,7 @@ def update_word_fields(config_path: dict|str): # Schließen doc.Close() ErrorHandler()(f"Fields in '{docx_absolute_path}' updated and saved") print(f'Fields in {docx_absolute_path} updated and saved') finally: word.Quit() Loading @@ -128,7 +131,7 @@ def insert_page_break_before_long_tables(config): end_page = doc.Range(end - 1, end - 1).Information(3) if end_page > start_page: ErrorHandler()(f"Table {i + 1} is on a page break: {start_page} -> {end_page}", Level.INFO) print(f'Table {i + 1} is on a page break: {start_page} -> {end_page}') # Seitenumbruch einfügen para = doc.Range(start, start) para.InsertBreak(7) # wdPageBreak = 7 Loading @@ -138,39 +141,99 @@ def insert_page_break_before_long_tables(config): doc.Close() word.Quit() def 
# Namespace map used for all WordprocessingML XPath queries below.
_WORD_NS = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

# Matches the quoted level range (e.g. "1-3") that directly follows the
# \o switch of a field instruction, including trailing whitespace.
_TOC_LEVEL_RANGE_RE = re.compile(r'(?<=\\o )"\d+-\d+"\s*')

# Archive member of a .docx that holds the main document body.
_DOCUMENT_PART = "word/document.xml"


def format_toc_header(xml_data, ns=None):
    """Restyle every paragraph that uses the "TOCHeading" style to "TT".

    Args:
        xml_data: Serialized XML of the main document part.
        ns: Optional prefix-to-URI map; must contain the "w" prefix.
            Defaults to the WordprocessingML main namespace.

    Returns:
        The modified document serialized as UTF-8 bytes with an XML
        declaration.
    """
    ns = _WORD_NS if ns is None else ns
    root = etree.fromstring(xml_data)

    val_attr = f"{{{ns['w']}}}val"
    headings = root.xpath('.//w:pStyle[@w:val="TOCHeading"]', namespaces=ns)
    for pstyle in headings:
        pstyle.set(val_attr, "TT")

    print(f'Changed Style "TOCHeading" to "TT" {len(headings)} times')
    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")


def update_toc_level(xml_data, ns=None):
    """Remove the explicit heading-level range from TOC field instructions.

    Every w:instrText element whose text contains "TOC" has the quoted
    range after its \\o switch stripped (``\\o "1-3"`` becomes ``\\o``).
    NOTE(review): this relies on Word treating a bare \\o switch as
    "all heading levels" -- confirm against the TOC field documentation.

    Only a switch that sits completely inside a single w:instrText element
    is handled; an instruction split over several elements is left as is.

    Args:
        xml_data: Serialized XML of the main document part.
        ns: Optional prefix-to-URI map; must contain the "w" prefix.
            Defaults to the WordprocessingML main namespace.

    Returns:
        The modified document serialized as UTF-8 bytes with an XML
        declaration.
    """
    ns = _WORD_NS if ns is None else ns
    root = etree.fromstring(xml_data)

    for elem in root.xpath('.//w:instrText', namespaces=ns):
        old_text = elem.text
        # An element without content has text None; skip it instead of
        # failing on the substring test.
        if not old_text or 'TOC' not in old_text:
            continue
        new_text = _TOC_LEVEL_RANGE_RE.sub('', old_text)
        if new_text != old_text:
            elem.text = new_text
            # ASCII arrow: a non-ASCII arrow can raise UnicodeEncodeError when
            # stdout uses a legacy code page, and the other messages in this
            # file already use "->".
            print(f'Changed TOC: {old_text} -> {new_text}')

    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")


def update_toc(docx_input, docx_output):
    """Rewrite the TOC field and TOC heading style of a .docx file.

    The main document part is passed through update_toc_level() and
    format_toc_header(); every other archive member is copied unchanged.
    The result is assembled in a temporary file and only then moved to
    ``docx_output``, so ``docx_input`` and ``docx_output`` may be the same
    path.

    NOTE(review): setup.py registers this function as an entry point while
    it takes two positional paths rather than a ``config`` like its
    siblings -- confirm how the entry point supplies them.

    Args:
        docx_input: Path of the .docx file to read.
        docx_output: Path the updated .docx file is written to.

    Raises:
        KeyError: If the archive has no "word/document.xml" member.
    """
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)  # the file is only accessed through zipfile
    try:
        with zipfile.ZipFile(docx_input, 'r') as zin, \
                zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            # Read up front so a missing document part fails before copying.
            xml_data = zin.read(_DOCUMENT_PART)
            xml_data = update_toc_level(xml_data)
            xml_data = format_toc_header(xml_data)

            for item in zin.infolist():
                payload = xml_data if item.filename == _DOCUMENT_PART else zin.read(item)
                # Passing the ZipInfo (not just the name) keeps each entry's
                # metadata and the original member order.
                zout.writestr(item, payload)

        # Both archives are closed here, so the input may be overwritten.
        shutil.move(tmp_path, docx_output)
    finally:
        # Remove the temp file if the move did not happen.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# The previous implementation, update_toc_level(config), drove Word through
# win32com (TablesOfContents, heading levels 1-9); it was replaced by the
# direct XML editing above.


def table_widths_adjustment(config):
    table_path = config.get("tables_folder")
    # ... the remainder of this function is collapsed in the diff view
generateBaseline/setup.py +1 −1 Original line number Diff line number Diff line Loading @@ -20,7 +20,7 @@ setup( "table_width_adjustment=postprocessing:table_widths_adjustment", "check_multipage_tables=postprocessing:insert_page_break_before_long_tables", #"apply_etsi_styling: postprocessing:postprocess_etsi_styles", "update_toc_level=postprocessing:update_toc_level", "update_toc=postprocessing:update_toc", ] } Loading