Commit 45c2312a authored by Miguel Angel Reina Ortega
Browse files

Adding some new features:

- Make the table-separator correction and the figure-path replacement optional
- Enable several postprocessing tools (TO BE COMPLETED); only update_toc is available so far
parent 898feabc
Loading
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -42,11 +42,13 @@ Build generateBaseline docker image:
    - if: $CI_COMMIT_BRANCH && $CI_PROJECT_NAME == "tools"
      changes:
        - generateBaseline/dockerfile
        - generateBaseline/dockerfile.pandoc
        - generateBaseline/setup.py
        - generateBaseline/requirements.txt
        - generateBaseline/pandocFilter.py
        - generateBaseline/generateTOC.py
        - generateBaseline/svg2png.py
        - generateBaseline/postprocessing.py
        
Build generateSpecWebSite docker image:
  stage: build
+10 −6
Original line number Diff line number Diff line
@@ -211,21 +211,25 @@ def correctTableSeparators(progress: Progress, mdLines: list[str]) -> list[str]:
	return _lines


def process(document:str, outDirectory:str) -> None:
def process(args) -> None:
	with Progress(TextColumn('{task.description}'),  TimeElapsedColumn()) as progress:
		mdLines = readMDFile(progress, document)
		mdLines = readMDFile(progress, args.document)
		mdLines = correctTOC(progress, mdLines)
		mdLines = replaceTableCaptions(progress, mdLines)
		mdLines = replaceFigureCaptions(progress, mdLines)
		if args.figure_paths:
			mdLines = replaceFiguresPathSvgToPng(progress, mdLines)
		mdLines = replaceLineBreaks(progress, mdLines)
		if args.table_separators:
			mdLines = correctTableSeparators(progress, mdLines)
		writeMDFile(progress, mdLines, document, outDirectory)
		writeMDFile(progress, mdLines, args.document, args.outDirectory)


def main(args=None):
	# Parse command line arguments
	parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('-ts', '--table-separators',        action='store_true', required=False, default=False, help="Correct table separators")
	parser.add_argument('-fp', '--figure-paths',        action='store_true', required=False, default=False, help="Replace figure paths")
	parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>',  help = 'specify output directory')
	parser.add_argument('document',  help = 'document to parse')
	args = parser.parse_args()
@@ -233,7 +237,7 @@ def main(args=None):
	# Process documents and print output
	os.makedirs(args.outDirectory, exist_ok = True)

	process(args.document, args.outDirectory)
	process(args)

if __name__ == '__main__':
    sys.exit(main())
 No newline at end of file
+478 −0
Original line number Diff line number Diff line
import json
import os
import re
import warnings
from pathlib import Path
import zipfile
from lxml import etree
import tempfile
import shutil
from typing import Union
import argparse
import subprocess
#import win32com.client #pip install pywin32

from docx import Document  #pip install python-docx
from docx.oxml import OxmlElement
from docx.oxml.ns import qn
from docx.shared import Cm

#from file_helper import get_all_files_from_dir


def apply_standard_style_to_unformatted_paragraphs(config):
    """Re-style every "Body Text" paragraph of the output DOCX, in place.

    Reads the document named by ``config["output_docx"]``, assigns the style
    named by ``config["standard_style_name"]`` (default ``"etsi_standard"``)
    to each paragraph whose current style name is exactly ``"Body Text"``,
    prints how many paragraphs were changed and saves the document back to
    the same path.
    """
    target_path = config.get("output_docx")
    replacement_style = config.get("standard_style_name", "etsi_standard")

    # Silence the deprecation warning emitted when a style is looked up by id.
    warnings.filterwarnings(
        "ignore",
        category=UserWarning,
        message=re.escape("style lookup by style_id is deprecated. Use style name as key instead.")
    )

    document = Document(target_path)

    # Collect first, then assign: the check itself has no side effects.
    body_text_paragraphs = [
        paragraph
        for paragraph in document.paragraphs
        if (paragraph.style.name if paragraph.style else None) == "Body Text"
    ]
    for paragraph in body_text_paragraphs:
        paragraph.style = replacement_style
    changed = len(body_text_paragraphs)

    print(f'Changed style to {replacement_style} for {changed} paragraphs.')
    document.save(target_path)

def rotate_cell_text(cell):
    """Set the text direction of a table cell to ``btLr``.

    Makes sure the cell element has a ``<w:tcPr>`` child (inserted as first
    child when missing) containing a ``<w:textDirection>`` element, and sets
    that element's ``w:val`` attribute to ``btLr``.
    """

    def _find_or_create(parent, tag, attach):
        # Return the existing child named *tag*, or create one and hand it to *attach*.
        node = parent.find(qn(tag))
        if node is None:
            node = OxmlElement(tag)
            attach(node)
        return node

    cell_element = cell._element
    cell_properties = _find_or_create(
        cell_element, 'w:tcPr', lambda node: cell_element.insert(0, node)
    )
    direction = _find_or_create(cell_properties, 'w:textDirection', cell_properties.append)
    direction.set(qn('w:val'), 'btLr')  # bottom-to-top, left-to-right

def clean_and_set_text(cell, new_text):
    """Replace the content of *cell* with one new paragraph holding *new_text*.

    Every paragraph currently listed in ``cell.paragraphs`` is detached from
    its parent element, then ``cell.add_paragraph(new_text)`` is called.
    """
    stale_elements = [paragraph._element for paragraph in cell.paragraphs]
    for element in stale_elements:
        element.getparent().remove(element)

    cell.add_paragraph(new_text)

def postprocess_table_content(config):
    """Rotate the text of every table cell marked with ``[rotate]``.

    Opens the document named by ``config["output_docx"]``. For each cell whose
    stripped text starts with ``[rotate]``, all ``[rotate]`` markers are removed
    from the text, the cell content is replaced by the cleaned text and the
    cell's text direction is rotated. The document is saved to the same path.
    """
    target_path = config.get("output_docx")
    document = Document(target_path)

    every_cell = (
        cell
        for table in document.tables
        for row in table.rows
        for cell in row.cells
    )
    for cell in every_cell:
        content = cell.text
        if not content.strip().startswith("[rotate]"):
            continue
        # Drop the marker before writing the text back.
        clean_and_set_text(cell, content.replace("[rotate]", "").strip())
        rotate_cell_text(cell)

    document.save(target_path)


def update_word_fields(config_path: Union[dict, str]):
    """Update all fields of a DOCX file through Microsoft Word (COM automation).

    Args:
        config_path: Either the path of the DOCX file, or a configuration
            dict whose ``"output_docx"`` entry holds that path.

    Raises:
        ValueError: If no path was supplied (e.g. the dict has no
            ``"output_docx"`` entry).
        FileNotFoundError: If the resolved path is not an existing file.
        RuntimeError: If ``win32com`` (pywin32) cannot be imported.
    """
    # Decide by type: os.path.isfile() raises TypeError when handed a dict,
    # and a str has no .get(), so the two forms must not be probed blindly.
    if isinstance(config_path, dict):
        docx_path = config_path.get("output_docx")
    else:
        docx_path = config_path
    if not docx_path:
        raise ValueError('No DOCX path given (is the "output_docx" entry missing?)')

    # Word is a separate process, so it needs an absolute path.
    docx_absolute_path = Path(docx_path).resolve()
    if not docx_absolute_path.is_file():
        # Stop here instead of starting Word for a file that does not exist.
        raise FileNotFoundError(f'File not found: {docx_absolute_path}')

    # The module-level import of win32com is commented out, so import it here.
    try:
        import win32com.client  # pip install pywin32
    except ImportError as exc:
        raise RuntimeError(
            "update_word_fields requires pywin32 (win32com) and Microsoft Word"
        ) from exc

    # Start Word invisibly in the background.
    word = win32com.client.Dispatch("Word.Application")
    word.Visible = False

    try:
        doc = word.Documents.Open(str(docx_absolute_path))

        # Update every field in the document.
        for field in doc.Fields:
            field.Update()

        doc.Save()
        doc.Close()
        print(f'Fields in {docx_absolute_path} updated and saved')
    finally:
        # Always shut Word down, even when opening or updating failed.
        word.Quit()

def refresh_docx_fields(input_path: str, image: str = "docx-field-refresh") -> str:
    """
    Refresh the fields of a DOCX file using LibreOffice inside a Docker container.

    The directory holding the file is bind-mounted at ``/data`` and the file is
    converted docx -> docx with ``soffice --headless``. The converted file is
    written to ``/data/<file name>`` and receives the owner/group of the input
    file. If the input lives in a directory named ``baseline``, the parent of
    that directory is mounted instead, so the converted file is written next
    to ``baseline/`` rather than over the input file.

    Every path placed into a shell command line is quoted with
    ``shlex.quote`` so that spaces or shell metacharacters in file names
    cannot break (or inject into) the commands.

    Parameters
    ----------
    input_path : str
        Path to the input .docx file.
    image : str, optional
        Name of the Docker image (default: 'docx-field-refresh').

    Returns
    -------
    str
        Absolute path of the input file.

    Raises
    ------
    FileNotFoundError
        If the input is not an existing regular file with a ``.docx`` suffix,
        or the directory to mount does not exist.
    ValueError
        If the directory to mount is not a directory.
    subprocess.CalledProcessError
        If the conversion container exits with a non-zero status.
    """
    import shlex  # function-scope import: the module header does not import shlex

    input_path = Path(input_path).resolve()
    # is_file() rather than exists(): a directory named "x.docx" is not valid input.
    if not input_path.is_file() or input_path.suffix.lower() != ".docx":
        raise FileNotFoundError(f"Invalid DOCX path: {input_path}")

    # Log input information
    print(f"📄 Input file path: {input_path}")
    print(f"📄 Input file name: {input_path.name}")
    print(f"📁 Input file parent: {input_path.parent}")
    print(f"📁 Input file parent name: {input_path.parent.name}")

    # Determine mount point (working folder) and file path in container.
    # If the file is in baseline/, mount the parent directory (working folder);
    # otherwise mount the file's own directory.
    if input_path.parent.name == "baseline":
        mount_point_host = input_path.parent.parent.resolve()
    else:
        mount_point_host = input_path.parent.resolve()

    mount_point_container = "/data"
    # as_posix(): the path is used inside the (Linux) container, so always use "/".
    relative_file = input_path.relative_to(mount_point_host).as_posix()
    file_path_in_container = f"{mount_point_container}/{relative_file}"

    # Convert Path to string for Docker commands
    mount_point_host_str = str(mount_point_host)

    # Verify host path exists
    if not mount_point_host.exists():
        raise FileNotFoundError(f"Mount point does not exist on host: {mount_point_host_str}")
    if not mount_point_host.is_dir():
        raise ValueError(f"Mount point is not a directory: {mount_point_host_str}")

    print(f"📂 Mount point (host): {mount_point_host_str}")
    print(f"📂 Mount point (host absolute): {mount_point_host.absolute()}")
    print(f"📂 Mount point (container): {mount_point_container}")
    print(f"📂 File path in container: {file_path_in_container}")
    print(f"📂 Mount syntax: -v {mount_point_host_str}:{mount_point_container}")

    # LibreOffice writes its output into the mount point root under the bare
    # file name, which differs from the input location when the input is in baseline/.
    original_file = file_path_in_container  # e.g., /data/baseline/file.docx
    created_file = f"{mount_point_container}/{input_path.name}"  # e.g., /data/file.docx

    print(f"📄 Original file (in container): {original_file}")
    print(f"📄 Created file (in container): {created_file}")

    quoted_original = shlex.quote(original_file)
    quoted_created = shlex.quote(created_file)
    quoted_outdir = shlex.quote(mount_point_container)

    # Remember the input file's owner:group, convert, then apply it to the result.
    save_perms_cmd = f'ORIG_PERMS=$(stat -c "%u:%g" {quoted_original})'
    soffice_cmd = f"soffice --headless --convert-to docx --outdir {quoted_outdir} {quoted_original}"
    post_cmd = f'chown "$ORIG_PERMS" {quoted_created}'
    combined_cmd = f"{save_perms_cmd} && {soffice_cmd} && {post_cmd}"

    print(f"🔧 save_perms_cmd: {save_perms_cmd}")
    print(f"🔧 soffice_cmd: {soffice_cmd}")
    print(f"🔧 post_cmd: {post_cmd}")
    print(f"🔧 combined_cmd: {combined_cmd}")

    cmd = [
        "docker", "run", "--rm",
        "-v", f'{mount_point_host_str}:{mount_point_container}',
        "-e", f'HOME={mount_point_container}',
        "--entrypoint", "/bin/bash",
        image,
        "-c", combined_cmd,
    ]

    # shlex.join produces a command line that can be pasted into a shell as is.
    print(f"🐳 Docker command: {shlex.join(cmd)}")

    # Run diagnostic commands on the host
    print(f"🔍 Running diagnostic commands on HOST...")
    quoted_host = shlex.quote(mount_point_host_str)
    quoted_host_baseline = shlex.quote(f"{mount_point_host_str}/baseline/")
    host_diag_cmd = (
        f"ls -la {quoted_host} && pwd && whoami"
        f" && echo 'Mount point contents:'"
        f" && ls -la {quoted_host_baseline} 2>/dev/null || echo 'No baseline directory'"
    )
    subprocess.run(host_diag_cmd, shell=True, check=False)  # Don't fail if diagnostic fails

    # Run diagnostic commands in the container
    print(f"🔍 Running diagnostic commands in CONTAINER...")
    print(f"🔍 Mount: {mount_point_host_str} -> {mount_point_container}")

    # First, test if we can see a known file from host in container
    test_file = mount_point_host / "baseline" / input_path.name
    print(f"🔍 Expected file on host: {test_file}")
    print(f"🔍 File exists on host: {test_file.exists()}")

    mount_note = shlex.quote(f"Mount: {mount_point_host_str} -> {mount_point_container}")
    file_exists_note = shlex.quote(f"FILE EXISTS: {file_path_in_container}")
    file_missing_note = shlex.quote(f"FILE NOT FOUND: {file_path_in_container}")
    container_diag_script = (
        "echo '=== Container Diagnostics ==='"
        f" && echo {mount_note}"
        " && echo 'Current directory:' && pwd"
        " && echo 'User:' && whoami"
        " && echo ''"
        " && echo '=== Testing mount ==='"
        f" && echo 'Checking if {mount_point_container} is a directory:'"
        f" && test -d {mount_point_container} && echo 'YES' || echo 'NO'"
        f" && echo 'Checking if {mount_point_container} is mounted:'"
        f" && mountpoint -q {mount_point_container} && echo 'YES (mountpoint)' || echo 'NO (mountpoint)'"
        " && echo ''"
        f" && echo '=== {mount_point_container} contents ==='"
        f" && ls -la {mount_point_container}"
        " && echo ''"
        f" && echo '=== {mount_point_container}/baseline contents ==='"
        f" && ls -la {mount_point_container}/baseline/ 2>/dev/null || echo 'No baseline directory'"
        " && echo ''"
        " && echo '=== Checking if file exists ==='"
        f" && test -f {quoted_original} && echo {file_exists_note} || echo {file_missing_note}"
        f" && ls -la {quoted_original} 2>/dev/null || true"
    )
    diag_cmd = [
        "docker", "run", "--rm",
        "-v", f"{mount_point_host_str}:{mount_point_container}",
        "--entrypoint", "/bin/bash",
        image,
        "-c", container_diag_script,
    ]
    subprocess.run(diag_cmd, check=False)  # Don't fail if diagnostic fails

    # The actual conversion: a failure here must surface to the caller.
    subprocess.run(cmd, check=True)

    return str(input_path)


def refresh_docx_fields_cli():
    """Command-line entry point for ``refresh_docx_fields``.

    Takes the positional ``input`` path and the optional ``--image`` name from
    the command line and passes them on.
    """
    cli = argparse.ArgumentParser(
        description="Refresh DOCX fields using LibreOffice in Docker (in-place)."
    )
    cli.add_argument("input", help="Path to input DOCX file.")
    cli.add_argument(
        "--image",
        default="docx-field-refresh",
        help="Docker image name (default: docx-field-refresh)",
    )

    options = cli.parse_args()
    refresh_docx_fields(options.input, options.image)

def insert_page_break_before_long_tables(config):
    """Insert a page break before every table that spans more than one page.

    Opens the document named by ``config["output_docx"]`` in Microsoft Word
    (COM automation), compares the page number at the start and at the end of
    each table, inserts a page break in front of tables whose end is on a
    later page than their start, and saves the document to the same path.

    Args:
        config: Mapping with an ``"output_docx"`` entry (path of the DOCX file).

    Raises:
        ValueError: If ``"output_docx"`` is missing or empty.
        RuntimeError: If ``win32com`` (pywin32) cannot be imported.
    """
    docx_path = config.get("output_docx")
    if not docx_path:
        raise ValueError('No DOCX path given (is the "output_docx" entry missing?)')

    # The module-level import of win32com is commented out, so import it here.
    try:
        import win32com.client  # pip install pywin32
    except ImportError as exc:
        raise RuntimeError(
            "insert_page_break_before_long_tables requires pywin32 (win32com) and Microsoft Word"
        ) from exc

    # Word enumeration values used below.
    wd_active_end_page_number = 3  # WdInformation.wdActiveEndPageNumber
    wd_page_break = 7  # WdBreakType.wdPageBreak

    # Word is a separate process, so give it an absolute path
    # (update_word_fields resolves its path the same way).
    document_path = str(Path(docx_path).resolve())

    word = win32com.client.Dispatch("Word.Application")
    word.Visible = False

    try:
        doc = word.Documents.Open(document_path)

        for i, table in enumerate(doc.Tables):
            # Character range covered by the table.
            start = table.Range.Start
            end = table.Range.End

            # Page numbers of the first and the last position of the table.
            start_page = doc.Range(start, start).Information(wd_active_end_page_number)
            end_page = doc.Range(end - 1, end - 1).Information(wd_active_end_page_number)

            if end_page > start_page:
                print(f'Table {i + 1} is on a page break: {start_page} -> {end_page}')
                # Insert the page break right in front of the table.
                doc.Range(start, start).InsertBreak(wd_page_break)

        # Input and output path are the same configuration entry.
        doc.SaveAs(document_path)
        doc.Close()
    finally:
        # Do not leave an invisible Word instance running when something failed.
        word.Quit()

def format_toc_header(xml_data, ns=None):
    """Re-style the table-of-contents heading from ``TOCHeading`` to ``TT``.

    Args:
        xml_data: Serialized XML (the caller passes ``word/document.xml``).
        ns: Prefix-to-URI namespace mapping that must contain the ``"w"``
            prefix. Defaults to the WordprocessingML main namespace.

    Returns:
        The modified XML serialized as UTF-8 bytes with an XML declaration.
    """
    # Build the default per call instead of sharing one mutable dict object.
    if ns is None:
        ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

    root = etree.fromstring(xml_data)

    # Fully qualified name of the w:val attribute, e.g. "{http://...}val".
    val_attribute = f"{{{ns['w']}}}val"

    # Every <w:pStyle w:val="TOCHeading"> becomes <w:pStyle w:val="TT">.
    toc_heading_styles = root.xpath('.//w:pStyle[@w:val="TOCHeading"]', namespaces=ns)
    for pstyle in toc_heading_styles:
        pstyle.set(val_attribute, "TT")

    print(f'Changed Style "TOCHeading" to "TT" {len(toc_heading_styles)} times')
    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

def update_toc_level(xml_data, ns=None):
    """Remove the explicit heading-level range from TOC field instructions.

    In every ``<w:instrText>`` element whose text contains ``TOC``, a quoted
    range that follows the ``\\o`` switch (for example ``"1-3"``) is deleted,
    leaving a bare ``\\o`` switch.
    NOTE(review): per the Word TOC field documentation a ``\\o`` switch without
    a range lists all heading levels - confirm this is the intended effect.

    Args:
        xml_data: Serialized XML (the caller passes ``word/document.xml``).
        ns: Prefix-to-URI namespace mapping that must contain the ``"w"``
            prefix. Defaults to the WordprocessingML main namespace.

    Returns:
        The modified XML serialized as UTF-8 bytes with an XML declaration.
    """
    # Build the default per call instead of sharing one mutable dict object.
    if ns is None:
        ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

    root = etree.fromstring(xml_data)
    # Matches the quoted "x-y" (x, y numbers) directly after '\o ', plus trailing whitespace.
    pattern = re.compile(r'(?<=\\o )"\d+-\d+"\s*')

    for elem in root.xpath('.//w:instrText', namespaces=ns):
        old_text = elem.text
        # An empty <w:instrText/> has text None; the membership test would raise on it.
        if not old_text or 'TOC' not in old_text:
            continue

        new_text = pattern.sub('', old_text)
        if new_text != old_text:
            elem.text = new_text
            print(f'Changed TOC: {old_text} -> {new_text}')

    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

def update_toc(docx_input, docx_output):
    """Rewrite ``word/document.xml`` of a DOCX file with updated TOC settings.

    The main document part is passed through ``update_toc_level`` and
    ``format_toc_header``; every other archive member is copied unchanged.
    The result is assembled in a temporary file and then moved to
    *docx_output*, which may be the same path as *docx_input*.

    Args:
        docx_input: Path of the DOCX file to read.
        docx_output: Path the updated DOCX file is written to.

    Raises:
        KeyError: If the archive has no ``word/document.xml`` member.
    """
    document_part = "word/document.xml"

    # Read and transform the main document part.
    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read(document_part)

    xml_data = update_toc_level(xml_data)
    xml_data = format_toc_header(xml_data)

    # Build the new archive in a temp file so the input can also be the output.
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)  # the file is only accessed through zipfile from here on

    try:
        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                payload = xml_data if item.filename == document_part else zin.read(item)
                # Passing the ZipInfo (not just the name) keeps each member's
                # metadata and keeps the members in their original order.
                zout.writestr(item, payload)

        # Write to output file
        shutil.move(tmp_path, docx_output)
        # mkstemp creates the file owner-only; open it up to rw-r--r--.
        os.chmod(docx_output, 0o644)

    finally:
        # Remove the temp file if it was not moved (i.e. something failed).
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

#def update_toc_level(config):
#    docx_path = config.get("output_docx")
#    word = win32com.client.Dispatch("Word.Application")
#    word.Visible = False
#
#    doc = word.Documents.Open(docx_path)
#
#    # If no TOC exists yet, one can be added:
#    if doc.TablesOfContents.Count == 0:
#        # Insert the table of contents at the beginning of the document
#        doc.TablesOfContents.Add(
#            Range=doc.Range(0, 0),
#            UseHeadingStyles=True,
#            UpperHeadingLevel=1,
#            LowerHeadingLevel=9,  # 👉 up to Heading 9
#            UseHyperlinks=True,
#            HidePageNumbersInWeb=False,
#            UseOutlineLevels=True
#        )
#    # Adjust the existing TOC
#    toc = doc.TablesOfContents(1)
#    #Formating heading -> ToDo: last line not working so skipped for the moment
#    #toc_range = toc.Range
#    #heading_para = toc_range.Paragraphs(1)
#    #heading_para.Style = doc.Styles("Heading 1")
#    #set level range from 1-9
#    toc.UpperHeadingLevel = 1
#    toc.LowerHeadingLevel = 9
#    toc.Update()

#    doc.SaveAs(docx_path)
#    doc.Close()
#    word.Quit()

def update_toc_cli():
    """Command-line entry point for ``update_toc``.

    Expects two positional arguments, the input and the output DOCX path, and
    passes them on.
    """
    cli = argparse.ArgumentParser(description="Update a DOCX table of contents.")
    for argument_name, help_text in (
        ("docx_input", "Path to input DOCX file"),
        ("docx_output", "Path to output DOCX file"),
    ):
        cli.add_argument(argument_name, help=help_text)

    options = cli.parse_args()
    update_toc(options.docx_input, options.docx_output)


def table_widths_adjustment(config):
    """Set the column widths of every table in the output DOCX, in place.

    For each JSON file found in ``config["tables_folder"]`` the entries
    ``caption`` and ``column_width`` (relative widths) and the optional
    ``total_width`` (cm) are read. Each table of ``config["output_docx"]`` is
    matched to a JSON entry through its ``w:tblCaption`` value; matched tables
    get the configured widths, all other tables get equally wide columns with
    a total width of 16.88 cm.

    NOTE(review): ``get_all_files_from_dir`` and ``ErrorHandler`` are used here
    but are not imported in the visible part of this module (the
    ``file_helper`` import is commented out) - they must be made available
    before this function can run.

    Args:
        config: Mapping with the entries ``"tables_folder"`` and ``"output_docx"``.

    Raises:
        ValueError: If the relative widths of a table sum to 0.
    """
    tables_folder = config.get("tables_folder")
    docx_path = config.get("output_docx")
    default_total_width_cm = 16.88
    doc = Document(docx_path)

    def get_table_caption(table):
        """Return the caption stored in the table's ``w:tblCaption``, or None."""
        tblPr = table._tbl.tblPr
        if tblPr is None:
            return None  # table has no properties element

        caption_el = tblPr.find(qn("w:tblCaption"))
        if caption_el is None:
            return None

        # Prefer the w:val attribute; fall back to the element text for
        # documents that store the caption without w:val.
        return caption_el.get(qn("w:val")) or caption_el.text or None

    def percent_to_cm(width_percentages, total_width_cm):
        """
        Convert relative widths (e.g. [30, 40, 30]) into absolute lengths.

        Args:
            width_percentages (list[float]): Relative widths; their sum may be
                any value other than 0.
            total_width_cm (float): Total table width in cm.

        Returns:
            list: One ``Cm`` length per column.
        """
        total_percent = sum(width_percentages)
        if total_percent == 0:
            message = "sum of percent-values is 0."
            ErrorHandler()(message)
            # Stop with a clear error instead of running into ZeroDivisionError.
            raise ValueError(message)

        # Normalize by the sum, then scale to the total width.
        return [Cm((p / total_percent) * total_width_cm) for p in width_percentages]

    def get_json_data(json_data_array, value, matching_json_field):
        """
        Find the first JSON object whose *matching_json_field* matches *value*.

        For the field ``"caption"`` a match is also accepted when *value* ends
        with the JSON value; for every field an exact match is accepted.

        Args:
            json_data_array (list[dict]): JSON objects to search.
            value (str | None): Value to look for.
            matching_json_field (str): Name of the JSON field to compare.

        Returns:
            dict | None: The matching JSON object, or None if there is none.
        """
        for item in json_data_array:
            if not isinstance(item, dict):
                continue  # skip invalid entries
            candidate = item.get(matching_json_field)
            # Compare against the field NAME "caption", and only call
            # endswith() when both sides are strings (tables may have no caption).
            if matching_json_field == "caption":
                if isinstance(value, str) and isinstance(candidate, str) and value.endswith(candidate):
                    return item
            if candidate == value:
                return item
        return None

    # Collect caption and widths from the table JSON files.
    json_tables = []
    for json_path in get_all_files_from_dir(tables_folder, "json"):
        with open(json_path, "r", encoding="utf-8") as f:
            try:
                data = json.load(f)
            except json.JSONDecodeError as e:
                ErrorHandler()(f"Skipped {json_path}. Error in reading file: {e}")
                continue

        # A usable entry is a JSON object with both a caption and column widths.
        if not isinstance(data, dict) or not data.get("caption") or not data.get("column_width"):
            ErrorHandler()(f"Skipping {json_path} – as caption and/or column_width are not set as expected")
            continue

        json_tables.append(data)

    for table in doc.tables:
        # Match the DOCX table to its JSON description via the caption.
        data = get_json_data(json_tables, get_table_caption(table), "caption")
        if data is not None:
            total_width = data.get("total_width", default_total_width_cm)
            col_widths_percent = data.get("column_width")
        else:
            # No description found: equally wide columns.
            total_width = default_total_width_cm
            col_widths_percent = [1] * len(table.columns)

        col_widths = percent_to_cm(col_widths_percent, total_width)

        # Apply the width to each cell, column by column; zip() stops at the
        # shorter side, so a row with fewer cells than widths is handled.
        for row in table.rows:
            for cell, width in zip(row.cells, col_widths):
                cell.width = width
    doc.save(docx_path)
 No newline at end of file
+7 −1
Original line number Diff line number Diff line
@@ -13,3 +13,9 @@ pygments==2.15.1
rich==13.4.2
    # via setup.py
cairosvg==2.7.1

lxml==4.9.3

python-docx==0.8.11

#pywin32
 No newline at end of file
+8 −0
Original line number Diff line number Diff line
@@ -14,6 +14,14 @@ setup(
            'console_scripts' : ['pandocFilter=pandocFilter:main',
								'generateTOC=generateTOC:main',
								'svg2png=svg2png:main',
								"update_references=postprocessing:update_word_fields",
        						"update_formats=postprocessing:apply_standard_style_to_unformatted_paragraphs",
        						"turn_table_contents=postprocessing:postprocess_table_content",
        						"table_width_adjustment=postprocessing:table_widths_adjustment",
        						"check_multipage_tables=postprocessing:insert_page_break_before_long_tables",
        						#"apply_etsi_styling: postprocessing:postprocess_etsi_styles",
        						"update_toc=postprocessing:update_toc_cli",
								"refresh_docx_fields=postprocessing:refresh_docx_fields_cli",
			]
            }