Commit 9c968275 authored by Jonas Schüppen's avatar Jonas Schüppen
Browse files

Added cli for all possibly running postprocessing steps

parent 918aa82c
Loading
Loading
Loading
Loading
Loading
+139 −85
Original line number Diff line number Diff line
import json
import os
import re
import sys
import warnings
from pathlib import Path
import zipfile
from lxml import etree
import tempfile
import shutil
from typing import Union
import argparse
import subprocess

@@ -16,13 +16,108 @@ from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm

#from file_helper import get_all_files_from_dir
def parse_input(args_to_parse=None, description=None, argv=None):
    """Parse and validate the command-line arguments shared by the CLI wrappers.

    Args:
        args_to_parse: Names of the arguments to register, e.g.
            ["input", "output"]. The parsed values are returned in this
            order. Supported names: "input", "output", "lib_office_image",
            "default_style", "tables_folder". Defaults to
            ["input", "output"].
        description: Text shown at the top of the ``--help`` output.
        argv: Argument list to parse. ``None`` lets argparse read
            ``sys.argv[1:]``.

    Returns:
        tuple: One parsed value per entry of ``args_to_parse``, same order.

    Raises:
        SystemExit: With status 1 when the input file is missing or the
            input/output file does not end in ``.docx``. argparse itself
            exits with status 2 on usage errors.
    """
    if args_to_parse is None:
        args_to_parse = ["input", "output"]

    parser = argparse.ArgumentParser(description=description)

    # argparse rejects a positional name mixed with option strings in one
    # add_argument() call, so every argument is a flag with an explicit dest.
    if "input" in args_to_parse:
        parser.add_argument(
            "-i", "--input",
            dest="input",
            metavar="INPUT_FILE",
            type=str,
            required=True,
            help="Path to input DOCX file",
        )

    if "output" in args_to_parse:
        parser.add_argument(
            "-o", "--output",
            dest="output",
            metavar="OUTPUT_FILE",
            type=str,
            required=False,
            help="Path to output DOCX file",
        )

    if "lib_office_image" in args_to_parse:
        parser.add_argument(
            "--loi",
            dest="lib_office_image",
            default="docx-field-refresh",
            help="Docker image name (default: docx-field-refresh)",
        )

    if "default_style" in args_to_parse:
        parser.add_argument(
            "--dstyle",
            dest="default_style",
            default="Normal",
            help="Style name applied to unformatted paragraphs (default: Normal)",
        )

    if "tables_folder" in args_to_parse:
        parser.add_argument(
            "--tf",
            dest="tables_folder",
            default="input/document_content/tables",
            help="folder containing the json files for tables",
        )

    args = parser.parse_args(argv)

    # Check the input file (only if the caller asked for one).
    if "input" in args_to_parse:
        input_file_path = Path.cwd() / args.input
        print("Input file:\t", input_file_path)

        if not input_file_path.exists():
            print("Error: Input file doesn't exist")
            sys.exit(1)  # non-zero status so calling scripts notice the failure

        if input_file_path.suffix != '.docx':
            print('Error: Input file with wrong file suffix! Expect file suffix .docx')
            sys.exit(1)

    # Check the output file; fall back to the input file when none is given.
    if "output" in args_to_parse:
        if args.output is None:
            if "input" not in args_to_parse:
                parser.error("an output file is required when no input file is parsed")
            print("Using input file as output file")
            args.output = args.input

        output_file_path = Path.cwd() / args.output
        print("Output file:\t", output_file_path)

        if output_file_path.suffix != '.docx':
            print('Error: Output file with wrong file suffix! Expect file suffix .docx')
            sys.exit(1)

        if not output_file_path.exists():
            print("Output file doesn't exist. Create empty output file.")
            output_file_path.parent.mkdir(parents=True, exist_ok=True)  # make directory
            output_file_path.touch()  # placeholder, overwritten when the document is saved

    # Return requested args
    return tuple(getattr(args, name) for name in args_to_parse)

def update_toc_cli():
    """Console entry point: parse input/output paths and run ``update_toc``."""
    source_docx, target_docx = parse_input(
        description="Update a DOCX table of contents.",
    )
    update_toc(source_docx, target_docx)

def turn_table_contents_cli():
    """Console entry point: parse input/output paths and run ``postprocess_table_content``."""
    help_text = "Searches for cells beginning with [rotate] and turns the content 90 degree counterclockwise"
    source_docx, target_docx = parse_input(description=help_text)
    postprocess_table_content(source_docx, target_docx)

def refresh_docx_fields_cli():
    """Console entry point: parse the input path and Docker image name, then run ``refresh_docx_fields``."""
    requested = ["input", "lib_office_image"]
    help_text = "Refresh DOCX fields using LibreOffice in Docker (in-place)."
    source_docx, docker_image = parse_input(requested, description=help_text)
    refresh_docx_fields(source_docx, docker_image)

def update_formats_cli():
    """Console entry point: parse paths and a style name, then restyle unformatted paragraphs."""
    requested = ["input", "output", "default_style"]
    help_text = "Set unformated paragraphs to standard styling."
    source_docx, target_docx, style_name = parse_input(requested, description=help_text)
    apply_standard_style_to_unformatted_paragraphs(source_docx, target_docx, style_name)

def table_width_adjustment_cli():
    """Console entry point: parse paths and the tables folder, then run ``table_widths_adjustment``."""
    requested = ["input", "output", "tables_folder"]
    help_text = "Set the width of table columns according to values in json or if non there to be equal."
    source_docx, target_docx, json_folder = parse_input(requested, description=help_text)
    table_widths_adjustment(source_docx, target_docx, json_folder)

def apply_standard_style_to_unformatted_paragraphs(docx_input, docx_output, standard_style_name = "Normal"):
    # Filter warning
    warnings.filterwarnings(
        "ignore",
@@ -31,7 +126,7 @@ def apply_standard_style_to_unformatted_paragraphs(config):
    )


    doc = Document(docx_path)
    doc = Document(docx_input)
    changed = 0

    for p in doc.paragraphs:
@@ -42,7 +137,7 @@ def apply_standard_style_to_unformatted_paragraphs(config):
            changed += 1

    print(f'Changed style to {standard_style_name} for {changed} paragraphs.')
    doc.save(output_path)
    doc.save(docx_output)

def rotate_cell_text(cell):
    # Hole oder erstelle <w:tcPr>
@@ -78,14 +173,6 @@ def postprocess_table_content(docx_path, output_path):
                    rotate_cell_text(cell)
    doc.save(output_path)

def turn_table_contents_cli():
    """Parse two positional DOCX paths and pass them to ``postprocess_table_content``."""
    cli = argparse.ArgumentParser(
        description="Searches for cells beginning with [rotate] and turns the content 90 degree counterclockwise",
    )
    # Both arguments are positional: input path first, output path second.
    positional_args = (
        ("docx_input", "Path to input DOCX file"),
        ("docx_output", "Path to output DOCX file"),
    )
    for arg_name, arg_help in positional_args:
        cli.add_argument(arg_name, help=arg_help)

    parsed = cli.parse_args()
    postprocess_table_content(parsed.docx_input, parsed.docx_output)

def refresh_docx_fields(input_path: str, image: str = "docx-field-refresh") -> str:
    """
    Refreshes fields in a DOCX file using LibreOffice inside a Docker container.
@@ -199,44 +286,6 @@ def refresh_docx_fields(input_path: str, image: str = "docx-field-refresh") -> s
    return str(input_path)


def refresh_docx_fields_cli():
    """Parse a positional DOCX path and an optional ``--image`` name, then call ``refresh_docx_fields``."""
    cli = argparse.ArgumentParser(
        description="Refresh DOCX fields using LibreOffice in Docker (in-place).",
    )
    cli.add_argument("input", help="Path to input DOCX file.")
    cli.add_argument(
        "--image",
        default="docx-field-refresh",
        help="Docker image name (default: docx-field-refresh)",
    )

    parsed = cli.parse_args()
    refresh_docx_fields(parsed.input, parsed.image)

def insert_page_break_before_long_tables(config):
    """Insert a page break before every table that spans a page boundary.

    Opens the document in Word through COM (``win32com``), compares the page
    number at the start and at the end of each table, and inserts a page
    break in front of any table whose end lies on a later page. The document
    is saved back to the path it was opened from.

    Args:
        config: Mapping whose "output_docx" entry is used both as the file
            to open and as the path to save to.
    """
    docx_path = config.get("output_docx")
    output_path = config.get("output_docx")

    word = win32com.client.Dispatch("Word.Application")
    doc = None
    try:
        word.Visible = False
        doc = word.Documents.Open(docx_path)

        for i, table in enumerate(doc.Tables):
            # Character range covered by the table
            start = table.Range.Start
            end = table.Range.End

            # Page numbers of the first and last position of the table
            start_page = doc.Range(start, start).Information(3)  # wdActiveEndPageNumber = 3
            end_page = doc.Range(end - 1, end - 1).Information(3)

            if end_page > start_page:
                print(f'Table {i + 1} is on a page break: {start_page} -> {end_page}')
                # Insert the page break directly in front of the table
                para = doc.Range(start, start)
                para.InsertBreak(7)  # wdPageBreak = 7

        # Save the result (output path equals the input path here)
        doc.SaveAs(output_path)
    finally:
        # Always release the hidden Word instance, even if a step above fails;
        # otherwise an invisible Word process is left running.
        if doc is not None:
            doc.Close(0)  # wdDoNotSaveChanges = 0; changes were saved above
        word.Quit()

def format_toc_header(xml_data, ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}):
    root = etree.fromstring(xml_data)

@@ -301,35 +350,40 @@ def update_toc(docx_input, docx_output):
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def update_toc_cli():
    docx_input, docx_output = parse_input()
def get_all_files_from_dir(relative_path, ending=None, subfolder=True):
    """Return all files in a directory, optionally filtered by file ending.

    Args:
        relative_path (str | Path): Directory to search (relative or absolute).
        ending (str | None): File ending, e.g. '.txt' or 'txt'. The
            comparison is case-insensitive. ``None`` returns every file.
        subfolder (bool): If True, subfolders are searched as well.

    Returns:
        list[Path]: Matching files as ``pathlib.Path`` objects, sorted by
        path. Empty if the directory does not exist.
    """
    base_path = Path(relative_path)
    if not base_path.exists():
        print(f"Error: Folder not found: {base_path}")
        return []

    # Collect candidate files; sorting makes the order deterministic.
    pattern = "**/*" if subfolder else "*"
    files = sorted(p for p in base_path.glob(pattern) if p.is_file())

    if ending is None:
        return files

    # Normalise the ending (e.g. 'txt' -> '.txt')
    if not ending.startswith('.'):
        ending = '.' + ending
    wanted = ending.lower()

    return [f for f in files if f.suffix.lower() == wanted]

def table_widths_adjustment(config):
    table_path = config.get("tables_folder")
    docx_path = config.get("output_docx")
    doc = Document(docx_path)

def table_widths_adjustment(docx_input, docx_output, tables_folder):
    doc = Document(docx_input)
    def get_table_caption(table):
        """Liest den Alternativtext-Titel (Caption) aus einer Tabelle."""
        tbl = table._tbl
@@ -364,7 +418,7 @@ def table_widths_adjustment(config):
        # Falls Prozente als Ganzzahlen angegeben sind, normalisieren wir sie
        total_percent = sum(width_percentages)
        if total_percent == 0:
            ErrorHandler()("sum of percent-values is 0.")
            print("Error: sum of percent-values is 0.")

        # Umrechnen auf relative cm-Werte
        widths_cm = [Cm((p / total_percent) * total_width_cm) for p in width_percentages]
@@ -394,21 +448,21 @@ def table_widths_adjustment(config):


    # preparing json_table list -> getting caption and width from table-json files
    table_list = get_all_files_from_dir(table_path, "json")
    table_list = get_all_files_from_dir(tables_folder, "json")
    json_tables = []
    for table_path in table_list:
        with open(table_path, "r", encoding="utf-8") as f:
    for tables_folder in table_list:
        with open(tables_folder, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                ErrorHandler()(f"Skipped {table_path}. Error in reading file: {e}")
                print(f"Skipped {tables_folder}. Error in reading file: {e}")
                continue

        caption = data.get("caption")
        widths = data.get("column_width")

        if not caption or not widths:
            ErrorHandler()(f"Skipping {table_path} – as caption and/or column_width are not set as expected")
            print(f"Skipping {tables_folder} – as caption and/or column_width are not set as expected")
            continue

        json_tables.append(data)
@@ -431,4 +485,4 @@ def table_widths_adjustment(config):
            for i, width in enumerate(col_widths):
                cell = row.cells[i]
                cell.width = width
    doc.save(docx_path)
 No newline at end of file
    doc.save(docx_output)
 No newline at end of file
+4 −4
Original line number Diff line number Diff line
@@ -14,11 +14,11 @@ setup(
            'console_scripts' : ['pandocFilter=pandocFilter:main',
								'generateTOC=generateTOC:main',
								'svg2png=svg2png:main',
								"update_references=postprocessing:update_word_fields",
        						"update_formats=postprocessing:apply_standard_style_to_unformatted_paragraphs",
								#"update_references=postprocessing:update_word_fields",
        						"update_formats=postprocessing:update_formats_cli",
        						"turn_table_contents=postprocessing:turn_table_contents_cli",
        						"table_width_adjustment=postprocessing:table_widths_adjustment",
        						"check_multipage_tables=postprocessing:insert_page_break_before_long_tables",
        						"table_width_adjustment=postprocessing:table_width_adjustment_cli",
        						#"check_multipage_tables=postprocessing:insert_page_break_before_long_tables",
        						#"apply_etsi_styling: postprocessing:postprocess_etsi_styles",
        						"update_toc=postprocessing:update_toc_cli",
								"refresh_docx_fields=postprocessing:refresh_docx_fields_cli",