Adding some new features: (45c2312a) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

.gitlab-ci.yml

+2 −0

Original line number	Diff line number	Diff line
		@@ -42,11 +42,13 @@ Build generateBaseline docker image:
		- if: $CI_COMMIT_BRANCH && $CI_PROJECT_NAME == "tools"
		changes:
		- generateBaseline/dockerfile
		- generateBaseline/dockerfile.pandoc
		- generateBaseline/setup.py
		- generateBaseline/requirements.txt
		- generateBaseline/pandocFilter.py
		- generateBaseline/generateTOC.py
		- generateBaseline/svg2png.py
		- generateBaseline/postprocessing.py

		Build generateSpecWebSite docker image:
		stage: build

generateBaseline/pandocFilter.py

+10 −6

Original line number	Diff line number	Diff line
		@@ -211,21 +211,25 @@ def correctTableSeparators(progress: Progress, mdLines: list[str]) -> list[str]:
		return _lines


		def process(document:str, outDirectory:str) -> None:
		def process(args) -> None:
		with Progress(TextColumn('{task.description}'), TimeElapsedColumn()) as progress:
		mdLines = readMDFile(progress, document)
		mdLines = readMDFile(progress, args.document)
		mdLines = correctTOC(progress, mdLines)
		mdLines = replaceTableCaptions(progress, mdLines)
		mdLines = replaceFigureCaptions(progress, mdLines)
		if args.figure_paths:
		mdLines = replaceFiguresPathSvgToPng(progress, mdLines)
		mdLines = replaceLineBreaks(progress, mdLines)
		if args.table_separators:
		mdLines = correctTableSeparators(progress, mdLines)
		writeMDFile(progress, mdLines, document, outDirectory)
		writeMDFile(progress, mdLines, args.document, args.outDirectory)


		def main(args=None):
		# Parse command line arguments
		parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
		parser.add_argument('-ts', '--table-separators', action='store_true', required=False, default=False, help="Correct table separators")
		parser.add_argument('-fp', '--figure-paths', action='store_true', required=False, default=False, help="Replace figure paths")
		parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory')
		parser.add_argument('document', help = 'document to parse')
		args = parser.parse_args()
		@@ -233,7 +237,7 @@ def main(args=None):
		# Process documents and print output
		os.makedirs(args.outDirectory, exist_ok = True)

		process(args.document, args.outDirectory)
		process(args)

		if __name__ == '__main__':
		sys.exit(main())
		No newline at end of file

generateBaseline/postprocessing.py

0 → 100644

+478 −0

Original line number	Diff line number	Diff line
		import json
		import os
		import re
		import warnings
		from pathlib import Path
		import zipfile
		from lxml import etree
		import tempfile
		import shutil
		from typing import Union
		import argparse
		import subprocess
		#import win32com.client #pip install pywin32

		from docx import Document #pip install python-docx
		from docx.oxml import OxmlElement
		from docx.oxml.ns import qn
		from docx.shared import Cm

		#from file_helper import get_all_files_from_dir


		def apply_standard_style_to_unformatted_paragraphs(config):
		    """Re-style every "Body Text" paragraph of the output DOCX.

		    The document named by ``config["output_docx"]`` is opened, every paragraph
		    whose style name is exactly "Body Text" gets the style named by
		    ``config["standard_style_name"]`` (default "etsi_standard"), and the
		    document is saved back to the same path.

		    Args:
		        config: Mapping providing "output_docx" and, optionally,
		            "standard_style_name".
		    """
		    target_file = config.get("output_docx")
		    replacement_style = config.get("standard_style_name", "etsi_standard")

		    # Silence the UserWarning with this message; the filter is installed
		    # process-wide and stays active after the function returns.
		    deprecation_text = "style lookup by style_id is deprecated. Use style name as key instead."
		    warnings.filterwarnings(
		        "ignore",
		        message=re.escape(deprecation_text),
		        category=UserWarning,
		    )

		    document = Document(target_file)
		    restyled = 0

		    for paragraph in document.paragraphs:
		        style = paragraph.style
		        # Only paragraphs that carry the "Body Text" style are touched.
		        if not style or style.name != "Body Text":
		            continue
		        paragraph.style = replacement_style
		        restyled += 1

		    print(f'Changed style to {replacement_style} for {restyled} paragraphs.')
		    document.save(target_file)

		def rotate_cell_text(cell):
		    """Set the text direction of a table cell to ``btLr``.

		    Ensures the cell element has a ``<w:tcPr>`` child (inserted as first
		    child when missing) containing a ``<w:textDirection>`` element, and sets
		    that element's ``w:val`` attribute to "btLr".

		    Args:
		        cell: Table cell object exposing the underlying XML as ``_element``.
		    """
		    cell_xml = cell._element

		    # Fetch the cell properties element, creating it when absent.
		    properties = cell_xml.find(qn('w:tcPr'))
		    if properties is None:
		        properties = OxmlElement('w:tcPr')
		        cell_xml.insert(0, properties)

		    # Fetch the text direction element, creating it when absent.
		    direction = properties.find(qn('w:textDirection'))
		    if direction is None:
		        direction = OxmlElement('w:textDirection')
		        properties.append(direction)

		    # "btLr": bottom-to-top, left-to-right
		    direction.set(qn('w:val'), 'btLr')

		def clean_and_set_text(cell, new_text):
		    """Replace the whole content of a table cell by one new paragraph.

		    Args:
		        cell: Table cell whose existing paragraphs are removed.
		        new_text: Text of the single paragraph added afterwards.
		    """
		    # Detach every existing paragraph element from its parent element.
		    for paragraph in cell.paragraphs:
		        node = paragraph._element
		        node.getparent().remove(node)

		    # Add the replacement paragraph carrying the cleaned text.
		    cell.add_paragraph(new_text)

		def postprocess_table_content(config):
		    """Rotate the text of all table cells marked with "[rotate]".

		    Opens the document named by ``config["output_docx"]``. For every table
		    cell whose stripped text starts with "[rotate]", all occurrences of the
		    marker are removed from the text, the cell content is replaced by the
		    stripped remainder and the cell text direction is rotated. The document
		    is saved back to the same path.

		    Args:
		        config: Mapping providing "output_docx".
		    """
		    target_file = config.get("output_docx")
		    document = Document(target_file)
		    marker = "[rotate]"

		    # Lazily walk tables -> rows -> cells in document order.
		    every_cell = (
		        cell
		        for table in document.tables
		        for row in table.rows
		        for cell in row.cells
		    )
		    for cell in every_cell:
		        if not cell.text.strip().startswith(marker):
		            continue
		        # Drop the marker, then rewrite and rotate the cell.
		        remaining_text = cell.text.replace(marker, "").strip()
		        clean_and_set_text(cell, remaining_text)
		        rotate_cell_text(cell)

		    document.save(target_file)


		def update_word_fields(config_path: Union[dict, str]):
		    """Update all fields of a DOCX file through Word COM automation and save it.

		    Args:
		        config_path: Either the path of the DOCX file, or a configuration
		            dict whose "output_docx" entry holds that path.

		    Raises:
		        ValueError: If no document path is given.
		        FileNotFoundError: If the document does not exist.
		        RuntimeError: If the ``win32com`` package (pywin32) is not installed.
		    """
		    # Decide by type: os.path.isfile() raises TypeError for a dict, so the
		    # configuration variant has to be detected before touching the file system.
		    if isinstance(config_path, dict):
		        docx_path = config_path.get("output_docx")
		    else:
		        docx_path = config_path
		    if not docx_path:
		        raise ValueError('No DOCX path given (expected a path or a config with "output_docx")')

		    # Word is handed an absolute path; stop early if the file is missing.
		    docx_absolute_path = Path(docx_path).resolve()
		    if not docx_absolute_path.is_file():
		        raise FileNotFoundError(f'File not found: {docx_absolute_path}')

		    # Imported lazily so the module stays importable where pywin32 is absent.
		    try:
		        import win32com.client
		    except ImportError as err:
		        raise RuntimeError("update_word_fields requires pywin32 (pip install pywin32)") from err

		    # Start Word invisibly in the background.
		    word = win32com.client.Dispatch("Word.Application")
		    word.Visible = False

		    try:
		        doc = word.Documents.Open(str(docx_absolute_path))
		        try:
		            # Update every field of the document, then save it.
		            for field in doc.Fields:
		                field.Update()
		            doc.Save()
		        finally:
		            # Changes were saved explicitly above; close without saving so a
		            # failed run cannot leave the invisible Word waiting on a prompt.
		            doc.Close(False)
		        print(f'Fields in {docx_absolute_path} updated and saved')
		    finally:
		        word.Quit()

		def refresh_docx_fields(input_path: str, image: str = "docx-field-refresh") -> str:
		    """
		    Refresh the fields of a DOCX file using LibreOffice inside a Docker container.

		    A headless ``soffice`` converts the file from docx to docx. The converted
		    file is written to the root of the mounted directory under the input
		    file's name and is chown'ed to the owner (uid:gid) of the input file.

		    If the input file lives in a directory named ``baseline``, the parent of
		    that directory is mounted, so the converted file is created next to
		    ``baseline/``. Otherwise the input file's own directory is mounted and
		    the converted file is written to the same path as the input file.

		    Parameters
		    ----------
		    input_path : str
		        Path to the input .docx file.
		    image : str, optional
		        Name of the Docker image (default: 'docx-field-refresh').

		    Returns
		    -------
		    str
		        Resolved path of the input file.

		    Raises
		    ------
		    FileNotFoundError
		        If the input file does not exist or is not a ``.docx`` file.
		    ValueError
		        If the directory to mount is not a directory.
		    subprocess.CalledProcessError
		        If the conversion container exits with a non-zero status.
		    """
		    import shlex  # only needed here, to quote paths placed into shell command lines

		    input_path = Path(input_path).resolve()
		    if not input_path.exists() or input_path.suffix.lower() != ".docx":
		        raise FileNotFoundError(f"Invalid DOCX path: {input_path}")

		    # Log input information
		    print(f"📄 Input file path: {input_path}")
		    print(f"📄 Input file name: {input_path.name}")
		    print(f"📁 Input file parent: {input_path.parent}")
		    print(f"📁 Input file parent name: {input_path.parent.name}")

		    # Mount the working folder that contains baseline/ when the file is in
		    # baseline/, otherwise the file's own directory.
		    if input_path.parent.name == "baseline":
		        mount_point_host = input_path.parent.parent
		    else:
		        mount_point_host = input_path.parent

		    mount_point_container = "/data"
		    # as_posix(): the path is used inside the container's shell, so it needs
		    # forward slashes whatever separator the host uses.
		    relative_path = input_path.relative_to(mount_point_host).as_posix()
		    file_path_in_container = f"{mount_point_container}/{relative_path}"

		    mount_point_host_str = str(mount_point_host)

		    if not mount_point_host.exists():
		        raise FileNotFoundError(f"Mount point does not exist on host: {mount_point_host_str}")
		    if not mount_point_host.is_dir():
		        raise ValueError(f"Mount point is not a directory: {mount_point_host_str}")

		    print(f"📂 Mount point (host): {mount_point_host_str}")
		    print(f"📂 Mount point (host absolute): {mount_point_host.absolute()}")
		    print(f"📂 Mount point (container): {mount_point_container}")
		    print(f"📂 File path in container: {file_path_in_container}")
		    print(f"📂 Mount syntax: -v {mount_point_host_str}:{mount_point_container}")

		    # soffice writes its output to --outdir using only the file name.
		    original_file = file_path_in_container  # e.g., /data/baseline/file.docx
		    created_file = f"{mount_point_container}/{input_path.name}"  # e.g., /data/file.docx

		    print(f"📄 Original file (in container): {original_file}")
		    print(f"📄 Created file (in container): {created_file}")

		    # Quote every path that ends up in a shell command line, so file names
		    # with spaces or shell metacharacters are passed through unchanged.
		    quoted_original = shlex.quote(original_file)
		    quoted_created = shlex.quote(created_file)

		    # Remember the owner (uid:gid) of the original file, convert, then give
		    # the converted file the same owner.
		    save_perms_cmd = f'ORIG_PERMS=$(stat -c "%u:%g" {quoted_original})'
		    soffice_cmd = f"soffice --headless --convert-to docx --outdir {mount_point_container} {quoted_original}"
		    post_cmd = f'chown "$ORIG_PERMS" {quoted_created}'
		    combined_cmd = f"{save_perms_cmd} && {soffice_cmd} && {post_cmd}"

		    print(f"🔧 save_perms_cmd: {save_perms_cmd}")
		    print(f"🔧 soffice_cmd: {soffice_cmd}")
		    print(f"🔧 post_cmd: {post_cmd}")
		    print(f"🔧 combined_cmd: {combined_cmd}")

		    cmd = [
		        "docker", "run", "--rm",
		        "-v", f"{mount_point_host_str}:{mount_point_container}",
		        "-e", f"HOME={mount_point_container}",
		        "--entrypoint", "/bin/bash",
		        image,
		        "-c", combined_cmd,
		    ]

		    print(f"🐳 Docker command: {shlex.join(cmd)}")

		    # Diagnostics on the host; best effort, failures are ignored.
		    print("🔍 Running diagnostic commands on HOST...")
		    quoted_host = shlex.quote(mount_point_host_str)
		    quoted_host_baseline = shlex.quote(f"{mount_point_host_str}/baseline/")
		    host_diag_cmd = (
		        f"ls -la {quoted_host} && pwd && whoami && echo 'Mount point contents:' "
		        f"&& ls -la {quoted_host_baseline} 2>/dev/null || echo 'No baseline directory'"
		    )
		    subprocess.run(host_diag_cmd, shell=True, check=False)

		    # Diagnostics in the container; best effort, failures are ignored.
		    print("🔍 Running diagnostic commands in CONTAINER...")
		    print(f"🔍 Mount: {mount_point_host_str} -> {mount_point_container}")

		    test_file = mount_point_host / "baseline" / input_path.name
		    print(f"🔍 Expected file on host: {test_file}")
		    print(f"🔍 File exists on host: {test_file.exists()}")

		    mount_message = shlex.quote(f"Mount: {mount_point_host_str} -> {mount_point_container}")
		    exists_message = shlex.quote(f"FILE EXISTS: {file_path_in_container}")
		    missing_message = shlex.quote(f"FILE NOT FOUND: {file_path_in_container}")
		    diag_steps = [
		        "echo '=== Container Diagnostics ==='",
		        f"echo {mount_message}",
		        "echo 'Current directory:'",
		        "pwd",
		        "echo 'User:'",
		        "whoami",
		        "echo ''",
		        "echo '=== Testing mount ==='",
		        f"echo 'Checking if {mount_point_container} is a directory:'",
		        f"test -d {mount_point_container}",
		        "echo 'YES' || echo 'NO'",
		        f"echo 'Checking if {mount_point_container} is mounted:'",
		        f"mountpoint -q {mount_point_container}",
		        "echo 'YES (mountpoint)' || echo 'NO (mountpoint)'",
		        "echo ''",
		        f"echo '=== {mount_point_container} contents ==='",
		        f"ls -la {mount_point_container}",
		        "echo ''",
		        f"echo '=== {mount_point_container}/baseline contents ==='",
		        f"ls -la {mount_point_container}/baseline/ 2>/dev/null || echo 'No baseline directory'",
		        "echo ''",
		        "echo '=== Checking if file exists ==='",
		        f"test -f {quoted_original}",
		        f"echo {exists_message} || echo {missing_message}",
		        f"ls -la {quoted_original} 2>/dev/null || true",
		    ]
		    diag_cmd = [
		        "docker", "run", "--rm",
		        "-v", f"{mount_point_host_str}:{mount_point_container}",
		        "--entrypoint", "/bin/bash",
		        image,
		        "-c", " && ".join(diag_steps),
		    ]
		    subprocess.run(diag_cmd, check=False)

		    # The actual conversion; a failing container raises CalledProcessError.
		    subprocess.run(cmd, check=True)

		    return str(input_path)


		def refresh_docx_fields_cli():
		    """Command line entry point: parse the arguments and refresh the DOCX fields."""
		    cli = argparse.ArgumentParser(
		        description="Refresh DOCX fields using LibreOffice in Docker (in-place).",
		    )
		    cli.add_argument("input", help="Path to input DOCX file.")
		    cli.add_argument(
		        "--image",
		        default="docx-field-refresh",
		        help="Docker image name (default: docx-field-refresh)",
		    )

		    options = cli.parse_args()
		    refresh_docx_fields(options.input, options.image)

		def insert_page_break_before_long_tables(config):
		    """Insert a page break before every table that spans a page boundary.

		    Uses Word COM automation: for each table the page numbers of its start
		    and of its end are compared, and when they differ a page break is
		    inserted at the start of the table. The document is saved to the path
		    it was opened from.

		    Args:
		        config: Mapping providing "output_docx".

		    Raises:
		        ValueError: If the configuration contains no "output_docx" path.
		        FileNotFoundError: If the document does not exist.
		        RuntimeError: If the ``win32com`` package (pywin32) is not installed.
		    """
		    wd_active_end_page_number = 3  # value passed to Range.Information
		    wd_page_break = 7              # value passed to Range.InsertBreak

		    docx_path = config.get("output_docx")
		    if not docx_path:
		        raise ValueError('Configuration does not contain "output_docx"')

		    # Word is handed an absolute path, as in update_word_fields.
		    docx_absolute_path = Path(docx_path).resolve()
		    if not docx_absolute_path.is_file():
		        raise FileNotFoundError(f'File not found: {docx_absolute_path}')

		    # Imported lazily so the module stays importable where pywin32 is absent.
		    try:
		        import win32com.client
		    except ImportError as err:
		        raise RuntimeError(
		            "insert_page_break_before_long_tables requires pywin32 (pip install pywin32)"
		        ) from err

		    word = win32com.client.Dispatch("Word.Application")
		    word.Visible = False

		    try:
		        doc = word.Documents.Open(str(docx_absolute_path))
		        try:
		            for number, table in enumerate(doc.Tables, start=1):
		                # Character range covered by the table.
		                start = table.Range.Start
		                end = table.Range.End

		                # Page numbers of the first and the last position of the table.
		                start_page = doc.Range(start, start).Information(wd_active_end_page_number)
		                end_page = doc.Range(end - 1, end - 1).Information(wd_active_end_page_number)

		                if end_page > start_page:
		                    print(f'Table {number} is on a page break: {start_page} -> {end_page}')
		                    doc.Range(start, start).InsertBreak(wd_page_break)

		            doc.SaveAs(str(docx_absolute_path))
		        finally:
		            # Everything that should persist was saved above; close without
		            # saving so a failed run cannot leave Word waiting on a prompt.
		            doc.Close(False)
		    finally:
		        # Always shut Word down, also when processing failed.
		        word.Quit()

		def format_toc_header(xml_data, ns=None):
		    """Change the paragraph style "TOCHeading" to "TT" in a document XML.

		    Args:
		        xml_data: Serialized XML (content of ``word/document.xml``).
		        ns: Prefix-to-namespace mapping that must contain the prefix "w".
		            Defaults to the WordprocessingML main namespace.

		    Returns:
		        bytes: The modified XML, serialized as UTF-8 with an XML declaration.
		    """
		    if ns is None:
		        ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

		    root = etree.fromstring(xml_data)

		    # All <w:pStyle w:val="TOCHeading"> elements become <w:pStyle w:val="TT">.
		    headings = root.xpath('.//w:pStyle[@w:val="TOCHeading"]', namespaces=ns)
		    val_attribute = f"{{{ns['w']}}}val"
		    for pstyle in headings:
		        pstyle.set(val_attribute, "TT")

		    print(f'Changed Style "TOCHeading" to "TT" {len(headings)} times')
		    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		def update_toc_level(xml_data, ns=None):
		    """Remove the heading level range from the TOC field instructions.

		    In every ``<w:instrText>`` element whose text contains "TOC", a quoted
		    range of the form ``"x-y"`` that directly follows ``\\o `` is removed
		    (together with trailing whitespace); the ``\\o`` switch itself is kept.

		    Args:
		        xml_data: Serialized XML (content of ``word/document.xml``).
		        ns: Prefix-to-namespace mapping that must contain the prefix "w".
		            Defaults to the WordprocessingML main namespace.

		    Returns:
		        bytes: The modified XML, serialized as UTF-8 with an XML declaration.
		    """
		    if ns is None:
		        ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

		    root = etree.fromstring(xml_data)
		    # Matches "x-y" (x and y being numbers) directly after '\o '.
		    # NOTE(review): presumably a TOC '\o' switch without a range makes Word
		    # list all heading levels - confirm against the Word field documentation.
		    pattern = re.compile(r'(?<=\\o )"\d+-\d+"\s*')

		    for elem in root.xpath('.//w:instrText', namespaces=ns):
		        old_text = elem.text
		        # Empty instruction elements have text None and cannot hold a TOC field.
		        if not old_text or 'TOC' not in old_text:
		            continue
		        elem.text = pattern.sub('', old_text)
		        print(f'Changed TOC: {old_text} → {elem.text}')

		    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		def update_toc(docx_input, docx_output):
		    """Rewrite the table of contents settings of a DOCX file.

		    ``word/document.xml`` is read from the input archive, passed through
		    ``update_toc_level`` and ``format_toc_header``, and a new archive with
		    the modified document part is written to ``docx_output``. All other
		    archive entries are copied unchanged. Input and output may be the same
		    path, because the new archive is built in a temporary file first.

		    Args:
		        docx_input: Path to the input DOCX file.
		        docx_output: Path to the output DOCX file.
		    """
		    document_part = "word/document.xml"

		    with zipfile.ZipFile(docx_input, 'r') as zin:
		        xml_data = zin.read(document_part)

		    xml_data = update_toc_level(xml_data)
		    xml_data = format_toc_header(xml_data)

		    # The temporary file is only ever opened through zipfile, so the
		    # descriptor returned by mkstemp is closed right away.
		    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
		    os.close(tmp_fd)

		    try:
		        with zipfile.ZipFile(docx_input, 'r') as zin, \
		                zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
		            for item in zin.infolist():
		                # Passing the ZipInfo (not just the name) keeps each entry's
		                # metadata and compression type, and the document part keeps
		                # its original position in the archive.
		                if item.filename == document_part:
		                    zout.writestr(item, xml_data)
		                else:
		                    zout.writestr(item, zin.read(item.filename))

		        shutil.move(tmp_path, docx_output)
		        # Read/write for owner, read for group and others.
		        os.chmod(docx_output, 0o644)

		    finally:
		        # Remove the temporary file if it was not moved to the output path.
		        if os.path.exists(tmp_path):
		            os.remove(tmp_path)

		#def update_toc_level(config):
		# docx_path = config.get("output_docx")
		# word = win32com.client.Dispatch("Word.Application")
		# word.Visible = False
		#
		# doc = word.Documents.Open(docx_path)
		#
		# # Wenn kein TOC vorhanden ist, kannst du eins hinzufügen:
		# if doc.TablesOfContents.Count == 0:
		# # Inhaltsverzeichnis am Anfang des Dokuments einfügen
		# doc.TablesOfContents.Add(
		# Range=doc.Range(0, 0),
		# UseHeadingStyles=True,
		# UpperHeadingLevel=1,
		# LowerHeadingLevel=9, # 👉 bis Heading 9
		# UseHyperlinks=True,
		# HidePageNumbersInWeb=False,
		# UseOutlineLevels=True
		# )
		# # Vorhandenes TOC anpassen
		# toc = doc.TablesOfContents(1)
		# #Formating heading -> ToDo: last line not working so skipped for the moment
		# #toc_range = toc.Range
		# #heading_para = toc_range.Paragraphs(1)
		# #heading_para.Style = doc.Styles("Heading 1")
		# #set level range from 1-9
		# toc.UpperHeadingLevel = 1
		# toc.LowerHeadingLevel = 9
		# toc.Update()

		# doc.SaveAs(docx_path)
		# doc.Close()
		# word.Quit()

		def update_toc_cli():
		    """Command line entry point: parse input/output paths and update the TOC."""
		    cli = argparse.ArgumentParser(description="Update a DOCX table of contents.")
		    for argument, description in (
		        ("docx_input", "Path to input DOCX file"),
		        ("docx_output", "Path to output DOCX file"),
		    ):
		        cli.add_argument(argument, help=description)
		    options = cli.parse_args()

		    update_toc(options.docx_input, options.docx_output)


		def table_widths_adjustment(config):
		    """Set the column widths of all tables of the output DOCX.

		    Table descriptions are read from the JSON files found in
		    ``config["tables_folder"]``; each usable description has a "caption" and
		    a "column_width" list and may have a "total_width" (in cm, default
		    16.88). A table of the document is matched to a description when its
		    caption ends with, or equals, the description's caption. Matched tables
		    get the described relative column widths; all other tables get equally
		    wide columns over the default total width. The document is saved back
		    to ``config["output_docx"]``.

		    NOTE(review): ``get_all_files_from_dir`` and ``ErrorHandler`` are not
		    imported in the visible part of this file (the ``file_helper`` import is
		    commented out). They are used here exactly as before - confirm they are
		    available at run time.

		    Args:
		        config: Mapping providing "tables_folder" and "output_docx".

		    Raises:
		        ValueError: If the relative widths of a table sum up to 0.
		    """
		    tables_folder = config.get("tables_folder")
		    docx_path = config.get("output_docx")
		    doc = Document(docx_path)
		    default_total_width_cm = 16.88

		    def get_table_caption(table):
		        """Return the alternative-text title (caption) of a table, or None."""
		        tblPr = table._tbl.tblPr
		        if tblPr is None:
		            return None  # table has no properties

		        # <w:tblCaption w:val="..."> holds the caption; fall back to the
		        # element text when the attribute is absent.
		        caption_el = tblPr.find(qn("w:tblCaption"))
		        if caption_el is None:
		            return None
		        return caption_el.get(qn("w:val")) or caption_el.text or None

		    def percent_to_cm(width_percentages, total_width_cm):
		        """Convert relative widths (e.g. [30, 40, 30]) into absolute widths.

		        Args:
		            width_percentages (list[float]): Relative widths; their sum may
		                be any positive value.
		            total_width_cm (float): Total width of the table in cm.

		        Returns:
		            list: One ``Cm`` length per column.
		        """
		        total_percent = sum(width_percentages)
		        if total_percent == 0:
		            ErrorHandler()("sum of percent-values is 0.")
		            # Without this the division below fails with ZeroDivisionError.
		            raise ValueError("sum of percent-values is 0.")

		        return [Cm((p / total_percent) * total_width_cm) for p in width_percentages]

		    def get_json_data(json_data_array, value, matching_json_field):
		        """Find the first JSON object whose field matches the given value.

		        For the field "caption" a suffix match is accepted as well, i.e.
		        ``value`` may end with the caption stored in the JSON object.

		        Args:
		            json_data_array (list[dict]): JSON objects to search.
		            value (str | None): Value to look for.
		            matching_json_field (str): Name of the JSON field to compare.

		        Returns:
		            dict | None: The matching JSON object, or None without a match.
		        """
		        for item in json_data_array:
		            if not isinstance(item, dict):
		                continue  # skip invalid entries
		            candidate = item.get(matching_json_field)
		            if (
		                matching_json_field == "caption"
		                and isinstance(value, str)
		                and isinstance(candidate, str)
		                and value.endswith(candidate)
		            ):
		                return item
		            if candidate == value:
		                return item
		        return None

		    # Collect caption and widths from the table JSON files.
		    json_tables = []
		    for json_path in get_all_files_from_dir(tables_folder, "json"):
		        with open(json_path, "r", encoding="utf-8") as f:
		            try:
		                data = json.load(f)
		            except json.JSONDecodeError as e:
		                ErrorHandler()(f"Skipped {json_path}. Error in reading file: {e}")
		                continue

		        # get_json_data only considers dicts, so anything else is unusable.
		        if not isinstance(data, dict) or not data.get("caption") or not data.get("column_width"):
		            ErrorHandler()(f"Skipping {json_path} – as caption and/or column_width are not set as expected")
		            continue

		        json_tables.append(data)

		    for table in doc.tables:
		        rows = table.rows
		        if len(rows) == 0:
		            continue  # nothing to size in a table without rows

		        # Match the table of the document with its JSON description.
		        data = get_json_data(json_tables, get_table_caption(table), "caption")
		        if data is not None:
		            total_width = data.get("total_width", default_total_width_cm)
		            col_widths_percent = data.get("column_width")
		        else:
		            total_width = default_total_width_cm
		            col_widths_percent = [1] * len(rows[0].cells)

		        col_widths = percent_to_cm(col_widths_percent, total_width)

		        # Set the width on every cell of each column; zip() stops at the
		        # shorter side, so rows with fewer cells than widths are tolerated.
		        for row in rows:
		            for cell, width in zip(row.cells, col_widths):
		                cell.width = width
		    doc.save(docx_path)
		No newline at end of file

generateBaseline/requirements.txt

+7 −1

Original line number	Diff line number	Diff line
		@@ -13,3 +13,9 @@ pygments==2.15.1
		rich==13.4.2
		# via setup.py
		cairosvg==2.7.1

		lxml==4.9.3

		python-docx==0.8.11

		#pywin32
		No newline at end of file

generateBaseline/setup.py

+8 −0

Original line number	Diff line number	Diff line
		@@ -14,6 +14,14 @@ setup(
		'console_scripts' : ['pandocFilter=pandocFilter:main',
		'generateTOC=generateTOC:main',
		'svg2png=svg2png:main',
		"update_references=postprocessing:update_word_fields",
		"update_formats=postprocessing:apply_standard_style_to_unformatted_paragraphs",
		"turn_table_contents=postprocessing:postprocess_table_content",
		"table_width_adjustment=postprocessing:table_widths_adjustment",
		"check_multipage_tables=postprocessing:insert_page_break_before_long_tables",
		#"apply_etsi_styling: postprocessing:postprocess_etsi_styles",
		"update_toc=postprocessing:update_toc_cli",
		"refresh_docx_fields=postprocessing:refresh_docx_fields_cli",
		]
		}