Loading generateBaseline/postprocessing.py +139 −85 Original line number Diff line number Diff line import json import os import re import sys import warnings from pathlib import Path import zipfile from lxml import etree import tempfile import shutil from typing import Union import argparse import subprocess Loading @@ -16,13 +16,108 @@ from docx.oxml import OxmlElement from docx.oxml.ns import qn from docx.shared import Cm #from file_helper import get_all_files_from_dir def parse_input(args_to_parse=None, description = None): """ args_to_parse: list of arguments, e.g. ["input", "output"] """ if args_to_parse is None: args_to_parse = ["input", "output"] parser = argparse.ArgumentParser(description=description) if args_to_parse is None: args_to_parse = [] if "input" in args_to_parse: parser.add_argument( "input", "-i", "--input", metavar="INPUT_FILE", type=str, required=True, help="Path to input DOCX file" ) if "output" in args_to_parse: parser.add_argument( "output", "-o", "--output", metavar="OUTPUT_FILE", type=str, required=False, help="Path to output DOCX file" ) if "lib_office_image" in args_to_parse: parser.add_argument("lib_office_image", "--loi", default="docx-field-refresh", help="Docker image name (default: docx-field-refresh)") if "default_style" in args_to_parse: parser.add_argument("default_style", "--dstyle", default="Normal", help="Docker image name (default: docx-field-refresh)") if "tables_folder" in args_to_parse: parser.add_argument("tables_folder", "--tf", default="input/document_content/tables", help="folder containing the json files for tables") args = parser.parse_args() # check input file input_file_path = Path.cwd() / args.input file_suffix = input_file_path.suffix print("Input file:\t", input_file_path) if not os.path.exists(input_file_path): print("Error: Input file doesn't exist") sys.exit() # exit if input file does't exist if file_suffix != '.docx': print('Error: Input file with wrong file suffix! Expect file suffix .docx') sys.exit() # exit if input file is not a markdown file def apply_standard_style_to_unformatted_paragraphs(config): docx_path = config.get("output_docx") output_path = config.get("output_docx") standard_style_name = config.get("standard_style_name", "etsi_standard") # check output file # create output file if output file does't exist if args.output is None: print("Using input file as output file") args.output = args.input output_file_path = Path.cwd() / args.output file_suffix = output_file_path.suffix file_path = output_file_path.parent print("Output file:\t", output_file_path) if file_suffix != '.docx': print('Error: Output file with wrong file suffix! Expect file suffix .docx') sys.exit() # exit if output file is not a markdown file if not os.path.exists(output_file_path): print("Output file doesn't exist. Create empty output file.") Path(file_path).mkdir(parents=True, exist_ok=True) # make directory Path(output_file_path).touch() # touch empty file # Return requested args return tuple(getattr(args, name) for name in args_to_parse) def update_toc_cli(): docx_input, docx_output = parse_input(description="Update a DOCX table of contents.") update_toc(docx_input, docx_output) def turn_table_contents_cli(): docx_input, docx_output = parse_input(description="Searches for cells beginning with [rotate] and turns the content 90 degree counterclockwise") postprocess_table_content(docx_input, docx_output) def refresh_docx_fields_cli(): docx_input, image = parse_input(["input", "lib_office_image"],description="Refresh DOCX fields using LibreOffice in Docker (in-place).") refresh_docx_fields(docx_input, image) def update_formats_cli(): docx_input, docx_output, style = parse_input(["input", "output", "default_style"], description="Set unformated paragraphs to standard styling.") apply_standard_style_to_unformatted_paragraphs(docx_input, docx_output, style) def table_width_adjustment_cli(): docx_input, docx_output, tables_folder = parse_input(["input", "output", "tables_folder"], description="Set the width of table columns according to values in json or if non there to be equal.") table_widths_adjustment(docx_input, docx_output, tables_folder) def apply_standard_style_to_unformatted_paragraphs(docx_input, docx_output, standard_style_name = "Normal"): # Filter warning warnings.filterwarnings( "ignore", Loading @@ -31,7 +126,7 @@ def apply_standard_style_to_unformatted_paragraphs(config): ) doc = Document(docx_path) doc = Document(docx_input) changed = 0 for p in doc.paragraphs: Loading @@ -42,7 +137,7 @@ def apply_standard_style_to_unformatted_paragraphs(config): changed += 1 print(f'Changed style to {standard_style_name} for {changed} paragraphs.') doc.save(output_path) doc.save(docx_output) def rotate_cell_text(cell): # Hole oder erstelle <w:tcPr> Loading Loading @@ -78,14 +173,6 @@ def postprocess_table_content(docx_path, output_path): rotate_cell_text(cell) doc.save(output_path) def turn_table_contents_cli(): parser = argparse.ArgumentParser(description="Searches for cells beginning with [rotate] and turns the content 90 degree counterclockwise") parser.add_argument("docx_input", help="Path to input DOCX file") parser.add_argument("docx_output", help="Path to output DOCX file") args = parser.parse_args() postprocess_table_content(args.docx_input, args.docx_output) def refresh_docx_fields(input_path: str, image: str = "docx-field-refresh") -> str: """ Refreshes fields in a DOCX file using LibreOffice inside a Docker container. Loading Loading @@ -199,44 +286,6 @@ def refresh_docx_fields(input_path: str, image: str = "docx-field-refresh") -> s return str(input_path) def refresh_docx_fields_cli(): parser = argparse.ArgumentParser(description="Refresh DOCX fields using LibreOffice in Docker (in-place).") parser.add_argument("input", help="Path to input DOCX file.") parser.add_argument("--image", default="docx-field-refresh", help="Docker image name (default: docx-field-refresh)") args = parser.parse_args() refresh_docx_fields(args.input, args.image) def insert_page_break_before_long_tables(config): docx_path = config.get("output_docx") output_path = config.get("output_docx") word = win32com.client.Dispatch("Word.Application") word.Visible = False doc = word.Documents.Open(docx_path) for i, table in enumerate(doc.Tables): # Tabellenbereich abrufen start = table.Range.Start end = table.Range.End # Seitenzahl berechnen start_page = doc.Range(start, start).Information(3) # wdActiveEndPageNumber = 3 end_page = doc.Range(end - 1, end - 1).Information(3) if end_page > start_page: print(f'Table {i + 1} is on a page break: {start_page} -> {end_page}') # Seitenumbruch einfügen para = doc.Range(start, start) para.InsertBreak(7) # wdPageBreak = 7 # Speichern unter neuem Namen doc.SaveAs(output_path) doc.Close() word.Quit() def format_toc_header(xml_data, ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}): root = etree.fromstring(xml_data) Loading Loading @@ -301,35 +350,40 @@ def update_toc(docx_input, docx_output): if os.path.exists(tmp_path): os.remove(tmp_path) def update_toc_cli(): docx_input, docx_output = parse_input() def get_all_files_from_dir(relative_path, ending=None, subfolder=True): """ Gibt eine Liste aller Dateien im angegebenen Verzeichnis (und optional in Unterordnern) zurück, die eine bestimmte Endung haben. update_toc(docx_input, docx_output) Parameter: relative_path (str | Path): Pfad zum Verzeichnis (relativ oder absolut) ending (str | None): Dateiendung (z. B. '.txt' oder 'txt'). Wenn None, werden alle Dateien berücksichtigt. subfolder (bool): Wenn True, werden Unterordner durchsucht. def parse_input(): parser = argparse.ArgumentParser(description="Update a DOCX table of contents.") parser.add_argument('docx_input', '-i', '--input', metavar='INPUT_FILE', required=True, type=str, default=None, help='Path to input DOCX file') Rückgabe: list[Path]: Liste von pathlib.Path-Objekten """ base_path = Path(relative_path) if not base_path.exists(): print(f"Error: Folder not found: {base_path}") parser.add_argument('docx_output', '-o', '--output', metavar='OUTPUT_FILE', required=False, type=str, default=None, help='Path to output DOCX file') args = parser.parse_args() # Endung normalisieren (z. B. 'txt' → '.txt') if ending is not None: if not ending.startswith('.'): ending = '.' + ending return args.docx_input, args.docx_output # Auswahl der passenden Dateien pattern = "**/*" if subfolder else "*" files = [p for p in base_path.glob(pattern) if p.is_file()] if ending is None: return files else: return [f for f in files if f.suffix.lower() == ending.lower()] def table_widths_adjustment(config): table_path = config.get("tables_folder") docx_path = config.get("output_docx") doc = Document(docx_path) def table_widths_adjustment(docx_input, docx_output, tables_folder): doc = Document(docx_input) def get_table_caption(table): """Liest den Alternativtext-Titel (Caption) aus einer Tabelle.""" tbl = table._tbl Loading Loading @@ -364,7 +418,7 @@ def table_widths_adjustment(config): # Falls Prozente als Ganzzahlen angegeben sind, normalisieren wir sie total_percent = sum(width_percentages) if total_percent == 0: ErrorHandler()("sum of percent-values is 0.") print("Error: sum of percent-values is 0.") # Umrechnen auf relative cm-Werte widths_cm = [Cm((p / total_percent) * total_width_cm) for p in width_percentages] Loading Loading @@ -394,21 +448,21 @@ def table_widths_adjustment(config): # preparing json_table list -> getting caption and width from table-json files table_list = get_all_files_from_dir(table_path, "json") table_list = get_all_files_from_dir(tables_folder, "json") json_tables = [] for table_path in table_list: with open(table_path, "r", encoding="utf-8") as f: for tables_folder in table_list: with open(tables_folder, "r", encoding="utf-8") as f: try: data = json.load(f) except json.JSONDecodeError as e: ErrorHandler()(f"Skipped {table_path}. Error in reading file: {e}") print(f"Skipped {tables_folder}. Error in reading file: {e}") continue caption = data.get("caption") widths = data.get("column_width") if not caption or not widths: ErrorHandler()(f"Skipping {table_path} – as caption and/or column_width are not set as expected") print(f"Skipping {tables_folder} – as caption and/or column_width are not set as expected") continue json_tables.append(data) Loading @@ -431,4 +485,4 @@ def table_widths_adjustment(config): for i, width in enumerate(col_widths): cell = row.cells[i] cell.width = width doc.save(docx_path) No newline at end of file doc.save(docx_output) No newline at end of file generateBaseline/setup.py +4 −4 Original line number Diff line number Diff line Loading @@ -14,11 +14,11 @@ setup( 'console_scripts' : ['pandocFilter=pandocFilter:main', 'generateTOC=generateTOC:main', 'svg2png=svg2png:main', "update_references=postprocessing:update_word_fields", "update_formats=postprocessing:apply_standard_style_to_unformatted_paragraphs", #"update_references=postprocessing:update_word_fields", "update_formats=postprocessing:update_formats_cli", "turn_table_contents=postprocessing:turn_table_contents_cli", "table_width_adjustment=postprocessing:table_widths_adjustment", "check_multipage_tables=postprocessing:insert_page_break_before_long_tables", "table_width_adjustment=postprocessing:table_width_adjustment_cli", #"check_multipage_tables=postprocessing:insert_page_break_before_long_tables", #"apply_etsi_styling: postprocessing:postprocess_etsi_styles", "update_toc=postprocessing:update_toc_cli", "refresh_docx_fields=postprocessing:refresh_docx_fields_cli", Loading Loading
generateBaseline/postprocessing.py +139 −85 Original line number Diff line number Diff line import json import os import re import sys import warnings from pathlib import Path import zipfile from lxml import etree import tempfile import shutil from typing import Union import argparse import subprocess Loading @@ -16,13 +16,108 @@ from docx.oxml import OxmlElement from docx.oxml.ns import qn from docx.shared import Cm #from file_helper import get_all_files_from_dir def parse_input(args_to_parse=None, description = None): """ args_to_parse: list of arguments, e.g. ["input", "output"] """ if args_to_parse is None: args_to_parse = ["input", "output"] parser = argparse.ArgumentParser(description=description) if args_to_parse is None: args_to_parse = [] if "input" in args_to_parse: parser.add_argument( "input", "-i", "--input", metavar="INPUT_FILE", type=str, required=True, help="Path to input DOCX file" ) if "output" in args_to_parse: parser.add_argument( "output", "-o", "--output", metavar="OUTPUT_FILE", type=str, required=False, help="Path to output DOCX file" ) if "lib_office_image" in args_to_parse: parser.add_argument("lib_office_image", "--loi", default="docx-field-refresh", help="Docker image name (default: docx-field-refresh)") if "default_style" in args_to_parse: parser.add_argument("default_style", "--dstyle", default="Normal", help="Docker image name (default: docx-field-refresh)") if "tables_folder" in args_to_parse: parser.add_argument("tables_folder", "--tf", default="input/document_content/tables", help="folder containing the json files for tables") args = parser.parse_args() # check input file input_file_path = Path.cwd() / args.input file_suffix = input_file_path.suffix print("Input file:\t", input_file_path) if not os.path.exists(input_file_path): print("Error: Input file doesn't exist") sys.exit() # exit if input file does't exist if file_suffix != '.docx': print('Error: Input file with wrong file suffix! Expect file suffix .docx') sys.exit() # exit if input file is not a markdown file def apply_standard_style_to_unformatted_paragraphs(config): docx_path = config.get("output_docx") output_path = config.get("output_docx") standard_style_name = config.get("standard_style_name", "etsi_standard") # check output file # create output file if output file does't exist if args.output is None: print("Using input file as output file") args.output = args.input output_file_path = Path.cwd() / args.output file_suffix = output_file_path.suffix file_path = output_file_path.parent print("Output file:\t", output_file_path) if file_suffix != '.docx': print('Error: Output file with wrong file suffix! Expect file suffix .docx') sys.exit() # exit if output file is not a markdown file if not os.path.exists(output_file_path): print("Output file doesn't exist. Create empty output file.") Path(file_path).mkdir(parents=True, exist_ok=True) # make directory Path(output_file_path).touch() # touch empty file # Return requested args return tuple(getattr(args, name) for name in args_to_parse) def update_toc_cli(): docx_input, docx_output = parse_input(description="Update a DOCX table of contents.") update_toc(docx_input, docx_output) def turn_table_contents_cli(): docx_input, docx_output = parse_input(description="Searches for cells beginning with [rotate] and turns the content 90 degree counterclockwise") postprocess_table_content(docx_input, docx_output) def refresh_docx_fields_cli(): docx_input, image = parse_input(["input", "lib_office_image"],description="Refresh DOCX fields using LibreOffice in Docker (in-place).") refresh_docx_fields(docx_input, image) def update_formats_cli(): docx_input, docx_output, style = parse_input(["input", "output", "default_style"], description="Set unformated paragraphs to standard styling.") apply_standard_style_to_unformatted_paragraphs(docx_input, docx_output, style) def table_width_adjustment_cli(): docx_input, docx_output, tables_folder = parse_input(["input", "output", "tables_folder"], description="Set the width of table columns according to values in json or if non there to be equal.") table_widths_adjustment(docx_input, docx_output, tables_folder) def apply_standard_style_to_unformatted_paragraphs(docx_input, docx_output, standard_style_name = "Normal"): # Filter warning warnings.filterwarnings( "ignore", Loading @@ -31,7 +126,7 @@ def apply_standard_style_to_unformatted_paragraphs(config): ) doc = Document(docx_path) doc = Document(docx_input) changed = 0 for p in doc.paragraphs: Loading @@ -42,7 +137,7 @@ def apply_standard_style_to_unformatted_paragraphs(config): changed += 1 print(f'Changed style to {standard_style_name} for {changed} paragraphs.') doc.save(output_path) doc.save(docx_output) def rotate_cell_text(cell): # Hole oder erstelle <w:tcPr> Loading Loading @@ -78,14 +173,6 @@ def postprocess_table_content(docx_path, output_path): rotate_cell_text(cell) doc.save(output_path) def turn_table_contents_cli(): parser = argparse.ArgumentParser(description="Searches for cells beginning with [rotate] and turns the content 90 degree counterclockwise") parser.add_argument("docx_input", help="Path to input DOCX file") parser.add_argument("docx_output", help="Path to output DOCX file") args = parser.parse_args() postprocess_table_content(args.docx_input, args.docx_output) def refresh_docx_fields(input_path: str, image: str = "docx-field-refresh") -> str: """ Refreshes fields in a DOCX file using LibreOffice inside a Docker container. Loading Loading @@ -199,44 +286,6 @@ def refresh_docx_fields(input_path: str, image: str = "docx-field-refresh") -> s return str(input_path) def refresh_docx_fields_cli(): parser = argparse.ArgumentParser(description="Refresh DOCX fields using LibreOffice in Docker (in-place).") parser.add_argument("input", help="Path to input DOCX file.") parser.add_argument("--image", default="docx-field-refresh", help="Docker image name (default: docx-field-refresh)") args = parser.parse_args() refresh_docx_fields(args.input, args.image) def insert_page_break_before_long_tables(config): docx_path = config.get("output_docx") output_path = config.get("output_docx") word = win32com.client.Dispatch("Word.Application") word.Visible = False doc = word.Documents.Open(docx_path) for i, table in enumerate(doc.Tables): # Tabellenbereich abrufen start = table.Range.Start end = table.Range.End # Seitenzahl berechnen start_page = doc.Range(start, start).Information(3) # wdActiveEndPageNumber = 3 end_page = doc.Range(end - 1, end - 1).Information(3) if end_page > start_page: print(f'Table {i + 1} is on a page break: {start_page} -> {end_page}') # Seitenumbruch einfügen para = doc.Range(start, start) para.InsertBreak(7) # wdPageBreak = 7 # Speichern unter neuem Namen doc.SaveAs(output_path) doc.Close() word.Quit() def format_toc_header(xml_data, ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}): root = etree.fromstring(xml_data) Loading Loading @@ -301,35 +350,40 @@ def update_toc(docx_input, docx_output): if os.path.exists(tmp_path): os.remove(tmp_path) def update_toc_cli(): docx_input, docx_output = parse_input() def get_all_files_from_dir(relative_path, ending=None, subfolder=True): """ Gibt eine Liste aller Dateien im angegebenen Verzeichnis (und optional in Unterordnern) zurück, die eine bestimmte Endung haben. update_toc(docx_input, docx_output) Parameter: relative_path (str | Path): Pfad zum Verzeichnis (relativ oder absolut) ending (str | None): Dateiendung (z. B. '.txt' oder 'txt'). Wenn None, werden alle Dateien berücksichtigt. subfolder (bool): Wenn True, werden Unterordner durchsucht. def parse_input(): parser = argparse.ArgumentParser(description="Update a DOCX table of contents.") parser.add_argument('docx_input', '-i', '--input', metavar='INPUT_FILE', required=True, type=str, default=None, help='Path to input DOCX file') Rückgabe: list[Path]: Liste von pathlib.Path-Objekten """ base_path = Path(relative_path) if not base_path.exists(): print(f"Error: Folder not found: {base_path}") parser.add_argument('docx_output', '-o', '--output', metavar='OUTPUT_FILE', required=False, type=str, default=None, help='Path to output DOCX file') args = parser.parse_args() # Endung normalisieren (z. B. 'txt' → '.txt') if ending is not None: if not ending.startswith('.'): ending = '.' + ending return args.docx_input, args.docx_output # Auswahl der passenden Dateien pattern = "**/*" if subfolder else "*" files = [p for p in base_path.glob(pattern) if p.is_file()] if ending is None: return files else: return [f for f in files if f.suffix.lower() == ending.lower()] def table_widths_adjustment(config): table_path = config.get("tables_folder") docx_path = config.get("output_docx") doc = Document(docx_path) def table_widths_adjustment(docx_input, docx_output, tables_folder): doc = Document(docx_input) def get_table_caption(table): """Liest den Alternativtext-Titel (Caption) aus einer Tabelle.""" tbl = table._tbl Loading Loading @@ -364,7 +418,7 @@ def table_widths_adjustment(config): # Falls Prozente als Ganzzahlen angegeben sind, normalisieren wir sie total_percent = sum(width_percentages) if total_percent == 0: ErrorHandler()("sum of percent-values is 0.") print("Error: sum of percent-values is 0.") # Umrechnen auf relative cm-Werte widths_cm = [Cm((p / total_percent) * total_width_cm) for p in width_percentages] Loading Loading @@ -394,21 +448,21 @@ def table_widths_adjustment(config): # preparing json_table list -> getting caption and width from table-json files table_list = get_all_files_from_dir(table_path, "json") table_list = get_all_files_from_dir(tables_folder, "json") json_tables = [] for table_path in table_list: with open(table_path, "r", encoding="utf-8") as f: for tables_folder in table_list: with open(tables_folder, "r", encoding="utf-8") as f: try: data = json.load(f) except json.JSONDecodeError as e: ErrorHandler()(f"Skipped {table_path}. Error in reading file: {e}") print(f"Skipped {tables_folder}. Error in reading file: {e}") continue caption = data.get("caption") widths = data.get("column_width") if not caption or not widths: ErrorHandler()(f"Skipping {table_path} – as caption and/or column_width are not set as expected") print(f"Skipping {tables_folder} – as caption and/or column_width are not set as expected") continue json_tables.append(data) Loading @@ -431,4 +485,4 @@ def table_widths_adjustment(config): for i, width in enumerate(col_widths): cell = row.cells[i] cell.width = width doc.save(docx_path) No newline at end of file doc.save(docx_output) No newline at end of file
generateBaseline/setup.py +4 −4 Original line number Diff line number Diff line Loading @@ -14,11 +14,11 @@ setup( 'console_scripts' : ['pandocFilter=pandocFilter:main', 'generateTOC=generateTOC:main', 'svg2png=svg2png:main', "update_references=postprocessing:update_word_fields", "update_formats=postprocessing:apply_standard_style_to_unformatted_paragraphs", #"update_references=postprocessing:update_word_fields", "update_formats=postprocessing:update_formats_cli", "turn_table_contents=postprocessing:turn_table_contents_cli", "table_width_adjustment=postprocessing:table_widths_adjustment", "check_multipage_tables=postprocessing:insert_page_break_before_long_tables", "table_width_adjustment=postprocessing:table_width_adjustment_cli", #"check_multipage_tables=postprocessing:insert_page_break_before_long_tables", #"apply_etsi_styling: postprocessing:postprocess_etsi_styles", "update_toc=postprocessing:update_toc_cli", "refresh_docx_fields=postprocessing:refresh_docx_fields_cli", Loading