Added cli for all possibly running postprocessing steps (9c968275) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

generateBaseline/postprocessing.py

+139 −85

Original line number	Diff line number	Diff line
		import json
		import os
		import re
		import sys
		import warnings
		from pathlib import Path
		import zipfile
		from lxml import etree
		import tempfile
		import shutil
		from typing import Union
		import argparse
		import subprocess

		@@ -16,13 +16,108 @@ from docx.oxml import OxmlElement
		from docx.oxml.ns import qn
		from docx.shared import Cm

		#from file_helper import get_all_files_from_dir
		def parse_input(args_to_parse=None, description=None):
		    """Parse and validate the command-line arguments of a postprocessing CLI.

		    Only the arguments named in ``args_to_parse`` are registered on the
		    parser, so each CLI entry point exposes just the options it needs.
		    The values are read from ``sys.argv`` via ``argparse``.

		    Args:
		        args_to_parse: Names of the arguments to register and return.
		            Supported names are ``"input"`` (``-i/--input``, required),
		            ``"output"`` (``-o/--output``), ``"lib_office_image"``
		            (``--loi``), ``"default_style"`` (``--dstyle``) and
		            ``"tables_folder"`` (``--tf``).
		            Defaults to ``["input", "output"]``.
		        description: Text shown at the top of the ``--help`` output.

		    Returns:
		        A tuple with the parsed values, in the order of ``args_to_parse``.
		        When no output file is given, the input path is returned as the
		        output path as well.

		    The process exits with status 1 (after printing an error message) when
		    the input file does not exist or when the input or output file does not
		    have the suffix ``.docx``. A missing output file is created empty,
		    including its parent directories.
		    """
		    if args_to_parse is None:
		        args_to_parse = ["input", "output"]

		    parser = argparse.ArgumentParser(description=description)

		    # Every argument is registered as an option flag with an explicit
		    # ``dest``: argparse rejects add_argument() calls that mix a positional
		    # name with option strings.
		    if "input" in args_to_parse:
		        parser.add_argument(
		            "-i", "--input",
		            dest="input",
		            metavar="INPUT_FILE",
		            type=str,
		            required=True,
		            help="Path to input DOCX file",
		        )

		    if "output" in args_to_parse:
		        parser.add_argument(
		            "-o", "--output",
		            dest="output",
		            metavar="OUTPUT_FILE",
		            type=str,
		            required=False,
		            default=None,
		            help="Path to output DOCX file",
		        )

		    if "lib_office_image" in args_to_parse:
		        parser.add_argument(
		            "--loi",
		            dest="lib_office_image",
		            default="docx-field-refresh",
		            help="Docker image name (default: docx-field-refresh)",
		        )

		    if "default_style" in args_to_parse:
		        parser.add_argument(
		            "--dstyle",
		            dest="default_style",
		            default="Normal",
		            help="Style applied to unformatted paragraphs (default: Normal)",
		        )

		    if "tables_folder" in args_to_parse:
		        parser.add_argument(
		            "--tf",
		            dest="tables_folder",
		            default="input/document_content/tables",
		            help="folder containing the json files for tables",
		        )

		    args = parser.parse_args()

		    if "input" in args_to_parse:
		        input_file_path = Path.cwd() / args.input
		        print("Input file:\t", input_file_path)

		        # A non-zero exit status lets calling scripts detect the failure.
		        if not input_file_path.exists():
		            print("Error: Input file doesn't exist")
		            sys.exit(1)

		        if input_file_path.suffix != '.docx':
		            print('Error: Input file with wrong file suffix! Expect file suffix .docx')
		            sys.exit(1)

		    # Output handling only applies to CLIs that registered the output
		    # option; the others have no ``args.output`` attribute at all.
		    if "output" in args_to_parse:
		        if args.output is None:
		            if "input" not in args_to_parse:
		                parser.error("an output file is required (-o/--output)")
		            print("Using input file as output file")
		            args.output = args.input

		        output_file_path = Path.cwd() / args.output
		        print("Output file:\t", output_file_path)

		        if output_file_path.suffix != '.docx':
		            print('Error: Output file with wrong file suffix! Expect file suffix .docx')
		            sys.exit(1)

		        if not output_file_path.exists():
		            print("Output file doesn't exist. Create empty output file.")
		            output_file_path.parent.mkdir(parents=True, exist_ok=True)
		            output_file_path.touch()

		    # Return the requested values in the order the caller asked for them.
		    return tuple(getattr(args, name) for name in args_to_parse)

		def update_toc_cli():
		    """Console entry point: update the table of contents of a DOCX file.

		    Reads the input and output paths through ``parse_input`` and hands
		    them to ``update_toc``.
		    """
		    cli_values = parse_input(description="Update a DOCX table of contents.")
		    source_docx, target_docx = cli_values
		    update_toc(source_docx, target_docx)

		def turn_table_contents_cli():
		    """Console entry point: rotate table cells marked with ``[rotate]``.

		    Reads the input and output paths through ``parse_input`` and hands
		    them to ``postprocess_table_content``.
		    """
		    cli_values = parse_input(
		        description="Searches for cells beginning with [rotate] and turns the content 90 degree counterclockwise"
		    )
		    source_docx, target_docx = cli_values
		    postprocess_table_content(source_docx, target_docx)

		def refresh_docx_fields_cli():
		    """Console entry point: refresh the fields of a DOCX file.

		    Requests the input path and the Docker image name from ``parse_input``
		    and forwards both to ``refresh_docx_fields``.
		    """
		    requested = ["input", "lib_office_image"]
		    source_docx, docker_image = parse_input(
		        requested,
		        description="Refresh DOCX fields using LibreOffice in Docker (in-place).",
		    )
		    refresh_docx_fields(source_docx, docker_image)

		def update_formats_cli():
		    """Console entry point: give unformatted paragraphs a default style.

		    Requests the input path, output path and style name from
		    ``parse_input`` and forwards them to
		    ``apply_standard_style_to_unformatted_paragraphs``.
		    """
		    requested = ["input", "output", "default_style"]
		    source_docx, target_docx, style_name = parse_input(
		        requested,
		        description="Set unformated paragraphs to standard styling.",
		    )
		    apply_standard_style_to_unformatted_paragraphs(source_docx, target_docx, style_name)

		def table_width_adjustment_cli():
		    """Console entry point: adjust the column widths of the tables in a DOCX file.

		    Requests the input path, output path and the folder with the table
		    JSON files from ``parse_input`` and forwards them to
		    ``table_widths_adjustment``.
		    """
		    requested = ["input", "output", "tables_folder"]
		    help_text = "Set the width of table columns according to values in json or if non there to be equal."
		    source_docx, target_docx, json_folder = parse_input(requested, description=help_text)
		    table_widths_adjustment(source_docx, target_docx, json_folder)

		def apply_standard_style_to_unformatted_paragraphs(docx_input, docx_output, standard_style_name = "Normal"):
		# Filter warning
		warnings.filterwarnings(
		"ignore",
		@@ -31,7 +126,7 @@ def apply_standard_style_to_unformatted_paragraphs(config):
		)


		doc = Document(docx_path)
		doc = Document(docx_input)
		changed = 0

		for p in doc.paragraphs:
		@@ -42,7 +137,7 @@ def apply_standard_style_to_unformatted_paragraphs(config):
		changed += 1

		print(f'Changed style to {standard_style_name} for {changed} paragraphs.')
		doc.save(output_path)
		doc.save(docx_output)

		def rotate_cell_text(cell):
		# Get or create <w:tcPr>
		@@ -78,14 +173,6 @@ def postprocess_table_content(docx_path, output_path):
		rotate_cell_text(cell)
		doc.save(output_path)

		def turn_table_contents_cli():
		    """Console entry point that rotates table cells flagged with ``[rotate]``.

		    Takes two positional command-line arguments, the input and the output
		    DOCX path, and passes them to ``postprocess_table_content``.
		    """
		    cli = argparse.ArgumentParser(
		        description="Searches for cells beginning with [rotate] and turns the content 90 degree counterclockwise"
		    )
		    for arg_name, arg_help in (
		        ("docx_input", "Path to input DOCX file"),
		        ("docx_output", "Path to output DOCX file"),
		    ):
		        cli.add_argument(arg_name, help=arg_help)
		    parsed = cli.parse_args()

		    postprocess_table_content(parsed.docx_input, parsed.docx_output)

		def refresh_docx_fields(input_path: str, image: str = "docx-field-refresh") -> str:
		"""
		Refreshes fields in a DOCX file using LibreOffice inside a Docker container.
		@@ -199,44 +286,6 @@ def refresh_docx_fields(input_path: str, image: str = "docx-field-refresh") -> s
		return str(input_path)


		def refresh_docx_fields_cli():
		    """Console entry point: refresh the fields of a DOCX file in place.

		    Reads the positional input path and the optional ``--image`` Docker
		    image name from the command line and forwards both to
		    ``refresh_docx_fields``.
		    """
		    cli = argparse.ArgumentParser(
		        description="Refresh DOCX fields using LibreOffice in Docker (in-place)."
		    )
		    cli.add_argument("input", help="Path to input DOCX file.")
		    cli.add_argument(
		        "--image",
		        default="docx-field-refresh",
		        help="Docker image name (default: docx-field-refresh)",
		    )
		    options = cli.parse_args()

		    refresh_docx_fields(options.input, options.image)

		def insert_page_break_before_long_tables(config):
		    """Insert a page break before every table that spans a page boundary.

		    The document is opened through Word COM automation
		    (``win32com.client``), each table's first and last page number are
		    compared, and a page break is inserted in front of tables that start
		    on one page and end on a later one. The result is saved back to the
		    same path it was read from.

		    Args:
		        config: Mapping providing the key ``"output_docx"``, the path of
		            the document to read and overwrite.
		    """
		    # Word enumeration values used below.
		    wd_active_end_page_number = 3  # wdActiveEndPageNumber
		    wd_page_break = 7              # wdPageBreak
		    wd_do_not_save_changes = 0     # wdDoNotSaveChanges

		    docx_path = config.get("output_docx")
		    output_path = config.get("output_docx")

		    word = win32com.client.Dispatch("Word.Application")
		    try:
		        word.Visible = False
		        doc = word.Documents.Open(docx_path)
		        try:
		            for i, table in enumerate(doc.Tables):
		                # Character range covered by the table.
		                start = table.Range.Start
		                end = table.Range.End

		                # Page numbers of the table's first and last position.
		                start_page = doc.Range(start, start).Information(wd_active_end_page_number)
		                end_page = doc.Range(end - 1, end - 1).Information(wd_active_end_page_number)

		                if end_page > start_page:
		                    print(f'Table {i + 1} is on a page break: {start_page} -> {end_page}')
		                    # Insert the page break directly in front of the table.
		                    para = doc.Range(start, start)
		                    para.InsertBreak(wd_page_break)

		            doc.SaveAs(output_path)
		        finally:
		            # Everything worth keeping was written by SaveAs; closing without
		            # saving avoids a save prompt in the invisible Word instance when
		            # an earlier step failed.
		            doc.Close(wd_do_not_save_changes)
		    finally:
		        # Always shut Word down so no background process is left behind.
		        word.Quit()

		def format_toc_header(xml_data, ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}):
		root = etree.fromstring(xml_data)

		@@ -301,35 +350,40 @@ def update_toc(docx_input, docx_output):
		if os.path.exists(tmp_path):
		os.remove(tmp_path)

		def get_all_files_from_dir(relative_path, ending=None, subfolder=True):
		    """Return the files found in a directory, optionally filtered by suffix.

		    Args:
		        relative_path (str | Path): Directory to search (relative or absolute).
		        ending (str | None): File suffix to keep, with or without the
		            leading dot (e.g. ``'.txt'`` or ``'txt'``); compared
		            case-insensitively. If None, every file is returned.
		        subfolder (bool): If True, subdirectories are searched recursively
		            as well; otherwise only the directory itself is searched.

		    Returns:
		        list[Path]: The matching files as ``pathlib.Path`` objects, sorted
		        by path. Empty when the directory does not exist (an error message
		        is printed in that case).
		    """
		    base_path = Path(relative_path)
		    if not base_path.exists():
		        print(f"Error: Folder not found: {base_path}")
		        return []

		    # "**/*" descends into subdirectories, "*" stays in the directory itself.
		    pattern = "**/*" if subfolder else "*"
		    # Sorting makes the processing order independent of the file system.
		    files = sorted(p for p in base_path.glob(pattern) if p.is_file())

		    if ending is None:
		        return files

		    # Normalize the suffix (e.g. 'txt' -> '.txt').
		    if not ending.startswith('.'):
		        ending = '.' + ending
		    wanted_suffix = ending.lower()

		    return [f for f in files if f.suffix.lower() == wanted_suffix]

		def table_widths_adjustment(config):
		table_path = config.get("tables_folder")
		docx_path = config.get("output_docx")
		doc = Document(docx_path)

		def table_widths_adjustment(docx_input, docx_output, tables_folder):
		doc = Document(docx_input)
		def get_table_caption(table):
		"""Liest den Alternativtext-Titel (Caption) aus einer Tabelle."""
		tbl = table._tbl
		@@ -364,7 +418,7 @@ def table_widths_adjustment(config):
		# If the percentages are given as whole numbers, normalize them
		total_percent = sum(width_percentages)
		if total_percent == 0:
		ErrorHandler()("sum of percent-values is 0.")
		print("Error: sum of percent-values is 0.")

		# Convert to relative widths in cm
		widths_cm = [Cm((p / total_percent) * total_width_cm) for p in width_percentages]
		@@ -394,21 +448,21 @@ def table_widths_adjustment(config):


		# preparing json_table list -> getting caption and width from table-json files
		table_list = get_all_files_from_dir(table_path, "json")
		table_list = get_all_files_from_dir(tables_folder, "json")
		json_tables = []
		for table_path in table_list:
		with open(table_path, "r", encoding="utf-8") as f:
		for tables_folder in table_list:
		with open(tables_folder, "r", encoding="utf-8") as f:
		try:
		data = json.load(f)
		except json.JSONDecodeError as e:
		ErrorHandler()(f"Skipped {table_path}. Error in reading file: {e}")
		print(f"Skipped {tables_folder}. Error in reading file: {e}")
		continue

		caption = data.get("caption")
		widths = data.get("column_width")

		if not caption or not widths:
		ErrorHandler()(f"Skipping {table_path} – as caption and/or column_width are not set as expected")
		print(f"Skipping {tables_folder} – as caption and/or column_width are not set as expected")
		continue

		json_tables.append(data)
		@@ -431,4 +485,4 @@ def table_widths_adjustment(config):
		for i, width in enumerate(col_widths):
		cell = row.cells[i]
		cell.width = width
		doc.save(docx_path)
		No newline at end of file
		doc.save(docx_output)
		No newline at end of file

generateBaseline/setup.py

+4 −4

Original line number	Diff line number	Diff line
		@@ -14,11 +14,11 @@ setup(
		'console_scripts' : ['pandocFilter=pandocFilter:main',
		'generateTOC=generateTOC:main',
		'svg2png=svg2png:main',
		"update_references=postprocessing:update_word_fields",
		"update_formats=postprocessing:apply_standard_style_to_unformatted_paragraphs",
		#"update_references=postprocessing:update_word_fields",
		"update_formats=postprocessing:update_formats_cli",
		"turn_table_contents=postprocessing:turn_table_contents_cli",
		"table_width_adjustment=postprocessing:table_widths_adjustment",
		"check_multipage_tables=postprocessing:insert_page_break_before_long_tables",
		"table_width_adjustment=postprocessing:table_width_adjustment_cli",
		#"check_multipage_tables=postprocessing:insert_page_break_before_long_tables",
		#"apply_etsi_styling: postprocessing:postprocess_etsi_styles",
		"update_toc=postprocessing:update_toc_cli",
		"refresh_docx_fields=postprocessing:refresh_docx_fields_cli",