Add several format styles update: (a3abe79b) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

generateBaseline/postprocessing.py

+619 −1

Original line number	Diff line number	Diff line
		@@ -284,6 +284,8 @@ def update_toc_level(xml_data, ns = {"w": "http://schemas.openxmlformats.org/wor
		# Regex for \o "x-y" with x and y being numbers
		pattern = re.compile(r'(?<=\\o )"\d+-\d+"\s*')

		# Track TOC paragraphs to insert page break after
		toc_paragraphs = []

		# Loop over all elements to find "TOC"
		for elem in root.xpath('.//w:instrText', namespaces=ns):
		@@ -293,6 +295,35 @@ def update_toc_level(xml_data, ns = {"w": "http://schemas.openxmlformats.org/wor

		print(f'Changed TOC: {old_text} → {elem.text}')

		# Find the paragraph containing this TOC field
		toc_para = elem
		while toc_para is not None and toc_para.tag != f"{{{ns['w']}}}p":
		toc_para = toc_para.getparent()

		if toc_para is not None and toc_para not in toc_paragraphs:
		toc_paragraphs.append(toc_para)

		# Insert page break after each TOC paragraph
		for toc_para in toc_paragraphs:
		parent = toc_para.getparent()
		if parent is None:
		continue

		# Find the position of the TOC paragraph
		para_index = list(parent).index(toc_para)

		# Create a new paragraph with a page break
		page_break_para = OxmlElement('w:p')
		page_break_run = OxmlElement('w:r')
		page_break = OxmlElement('w:br')
		page_break.set(f"{{{ns['w']}}}type", "page")
		page_break_run.append(page_break)
		page_break_para.append(page_break_run)

		# Insert the page break paragraph after the TOC paragraph
		parent.insert(para_index + 1, page_break_para)
		print(f'Inserted page break after TOC')

		return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		def update_toc(docx_input, docx_output):
		@@ -476,3 +507,590 @@ def table_widths_adjustment(config):
		cell = row.cells[i]
		cell.width = width
		doc.save(docx_path)



		def update_figure_captions(docx_input, docx_output):
		    """
		    Updates figure caption styles from 'ImageCaption' to 'TF' in a DOCX file.

		    Every ``w:pStyle`` element in ``word/document.xml`` whose ``w:val`` is
		    ``ImageCaption`` is rewritten to ``TF``. All other package parts are
		    copied unchanged. The input is only read; the new package is built in a
		    temporary file and then moved to ``docx_output``.

		    Parameters
		    ----------
		    docx_input : str
		        Path to the input DOCX file.
		    docx_output : str
		        Path to the output DOCX file.
		    """
		    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
		    val_attr = f"{{{ns['w']}}}val"
		    new_style = "TF"

		    # Read the main document part
		    with zipfile.ZipFile(docx_input, 'r') as zin:
		        xml_data = zin.read("word/document.xml")

		    root = etree.fromstring(xml_data)

		    # Restyle every paragraph style reference that points at "ImageCaption"
		    matches = root.xpath('.//w:pStyle[@w:val="ImageCaption"]', namespaces=ns)
		    for elem in matches:
		        elem.set(val_attr, new_style)
		    counter = len(matches)

		    print(f'Changed style "ImageCaption" to "TF" {counter} times')

		    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		    # Create the temp file; the descriptor is closed right away because the
		    # file is only ever opened through zipfile.
		    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
		    os.close(tmp_fd)

		    try:
		        # Copy every part except the document part, then add the new one
		        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
		            for item in zin.infolist():
		                if item.filename != "word/document.xml":
		                    data = zin.read(item.filename)
		                    zout.writestr(item.filename, data)
		            zout.writestr("word/document.xml", xml_data)

		        # Write to output file
		        shutil.move(tmp_path, docx_output)
		        # mkstemp creates the file owner-only; widen to rw-r--r--
		        os.chmod(docx_output, 0o644)

		    finally:
		        # Delete the temp file if it still exists (i.e. the move did not happen)
		        if os.path.exists(tmp_path):
		            os.remove(tmp_path)

		def update_figure_style(docx_input, docx_output):
		    """
		    Updates figure paragraph style from 'CaptionedFigure' to 'FL' in a DOCX file.

		    Every ``w:pStyle`` element in ``word/document.xml`` whose ``w:val`` is
		    ``CaptionedFigure`` is rewritten to ``FL``. All other package parts are
		    copied unchanged. The input is only read; the new package is built in a
		    temporary file and then moved to ``docx_output``.

		    Parameters
		    ----------
		    docx_input : str
		        Path to the input DOCX file.
		    docx_output : str
		        Path to the output DOCX file.
		    """
		    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
		    val_attr = f"{{{ns['w']}}}val"
		    new_style = "FL"

		    # Read the main document part
		    with zipfile.ZipFile(docx_input, 'r') as zin:
		        xml_data = zin.read("word/document.xml")

		    root = etree.fromstring(xml_data)

		    # Restyle every paragraph style reference that points at "CaptionedFigure"
		    matches = root.xpath('.//w:pStyle[@w:val="CaptionedFigure"]', namespaces=ns)
		    for elem in matches:
		        elem.set(val_attr, new_style)
		    counter = len(matches)

		    print(f'Changed style "CaptionedFigure" to "FL" {counter} times')

		    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		    # Create the temp file; the descriptor is closed right away because the
		    # file is only ever opened through zipfile.
		    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
		    os.close(tmp_fd)

		    try:
		        # Copy every part except the document part, then add the new one
		        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
		            for item in zin.infolist():
		                if item.filename != "word/document.xml":
		                    data = zin.read(item.filename)
		                    zout.writestr(item.filename, data)
		            zout.writestr("word/document.xml", xml_data)

		        # Write to output file
		        shutil.move(tmp_path, docx_output)
		        # mkstemp creates the file owner-only; widen to rw-r--r--
		        os.chmod(docx_output, 0o644)

		    finally:
		        # Delete the temp file if it still exists (i.e. the move did not happen)
		        if os.path.exists(tmp_path):
		            os.remove(tmp_path)

		def update_heading_styles(docx_input, docx_output):
		    """
		    Updates heading runs to split number from text with a tab.
		    Transforms: "6 Architecture model..." to "6" + tab + "Architecture model..."

		    Only paragraphs whose ``w:pStyle`` value starts with ``Heading`` and whose
		    combined run text begins with a dotted clause number followed by
		    whitespace are rewritten. For those paragraphs all existing runs are
		    replaced by a single new run, so run-level formatting of the old runs
		    is discarded.

		    Parameters
		    ----------
		    docx_input : str
		        Path to the input DOCX file.
		    docx_output : str
		        Path to the output DOCX file.
		    """
		    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

		    # Clause number ("6", "6.1", "6.1.2", ...), whitespace, then the title.
		    # Compiled once instead of on every paragraph.
		    heading_re = re.compile(r'^(\d+(?:\.\d+)*)\s+(.+)$')

		    # Read the main document part
		    with zipfile.ZipFile(docx_input, 'r') as zin:
		        xml_data = zin.read("word/document.xml")

		    root = etree.fromstring(xml_data)
		    counter = 0

		    # Find all paragraphs with heading styles (Heading1, Heading2, etc.)
		    for para in root.xpath('.//w:p[w:pPr/w:pStyle[starts-with(@w:val, "Heading")]]', namespaces=ns):
		        # All runs of the paragraph, including runs wrapped in other
		        # elements such as hyperlinks
		        runs = para.xpath('.//w:r', namespaces=ns)
		        if not runs:
		            continue

		        # Collect the text of every w:t of every run. Reading only the
		        # first w:t per run would silently drop the rest once the runs
		        # are deleted below.
		        full_text = "".join(
		            text_elem.text
		            for run in runs
		            for text_elem in run.xpath('./w:t', namespaces=ns)
		            if text_elem.text
		        )

		        # Check if text starts with a clause number followed by space
		        match = heading_re.match(full_text)
		        if not match:
		            continue
		        number, rest_text = match.groups()

		        # Remove the existing runs from wherever they sit. A run found via
		        # './/w:r' is not necessarily a direct child of the paragraph, and
		        # para.remove() raises ValueError for non-children.
		        for run in runs:
		            owner = run.getparent()
		            if owner is not None:
		                owner.remove(run)

		        # Create new run with number + tab + rest
		        new_run = OxmlElement('w:r')

		        # Number text
		        num_t = OxmlElement('w:t')
		        num_t.text = number
		        new_run.append(num_t)

		        # Tab
		        new_run.append(OxmlElement('w:tab'))

		        # Rest of text; xml:space="preserve" keeps inner/trailing blanks
		        rest_t = OxmlElement('w:t')
		        rest_t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
		        rest_t.text = rest_text
		        new_run.append(rest_t)

		        # Add the new run to the paragraph
		        para.append(new_run)
		        counter += 1

		    print(f'Updated {counter} heading runs')

		    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		    # Create the temp file; the descriptor is closed right away because the
		    # file is only ever opened through zipfile.
		    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
		    os.close(tmp_fd)

		    try:
		        # Copy every part except the document part, then add the new one
		        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
		            for item in zin.infolist():
		                if item.filename != "word/document.xml":
		                    data = zin.read(item.filename)
		                    zout.writestr(item.filename, data)
		            zout.writestr("word/document.xml", xml_data)

		        # Write to output file
		        shutil.move(tmp_path, docx_output)
		        # mkstemp creates the file owner-only; widen to rw-r--r--
		        os.chmod(docx_output, 0o644)

		    finally:
		        # Delete the temp file if it still exists (i.e. the move did not happen)
		        if os.path.exists(tmp_path):
		            os.remove(tmp_path)


		def update_unnumbered_lists(docx_input, docx_output):
		"""
		Updates unnumbered list items (starting with "- ") in tables to appear as bulleted lists.
		For list items in tables: removes "- " prefix and creates separate paragraphs with FP style and numPr.
		For list items outside tables: removes "- " prefix and adds B1 style.

		Parameters
		----------
		docx_input : str
		Path to the input DOCX file.
		docx_output : str
		Path to the output DOCX file.
		"""
		# Overview: a "list item" is a direct child run of a paragraph whose first
		# w:t text starts with "- ". Each such run (plus the children that follow it
		# up to the next list-item run) is moved into a newly created paragraph that
		# is inserted after the original paragraph. Paragraphs both inside and
		# outside tables are handled; only the paragraph properties differ.
		ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

		# Read XML
		with zipfile.ZipFile(docx_input, 'r') as zin:
		xml_data = zin.read("word/document.xml")

		root = etree.fromstring(xml_data)
		counter_table = 0
		counter_regular = 0

		# Track processed paragraphs to avoid reprocessing
		# (entries are Python object ids of paragraphs that were merged away or removed)
		processed_paras = set()

		# Find all paragraphs - need to collect them first since we'll be modifying the tree
		paragraphs = root.xpath('.//w:p', namespaces=ns)

		def is_list_item_para(para):
		"""Check if paragraph contains a list item (starts with '- ')"""
		# Only direct child runs are inspected, and only the first w:t of each run.
		runs = para.xpath('./w:r', namespaces=ns)
		for run in runs:
		text_elem = run.find('.//w:t', namespaces=ns)
		if text_elem is not None and text_elem.text and text_elem.text.startswith('- '):
		return True
		return False

		def is_blank_para(para):
		"""Check if paragraph is blank (empty or only whitespace)"""
		# An element without direct child runs counts as blank, whatever else it contains.
		runs = para.xpath('./w:r', namespaces=ns)
		if not runs:
		return True
		all_text = ''
		for run in runs:
		text_elems = run.xpath('.//w:t', namespaces=ns)
		for text_elem in text_elems:
		if text_elem.text:
		all_text += text_elem.text
		return not all_text.strip()

		# NOTE(review): get_para_text is not called anywhere in this function.
		def get_para_text(para):
		"""Get all text from a paragraph"""
		runs = para.xpath('./w:r', namespaces=ns)
		text = ''
		for run in runs:
		text_elems = run.xpath('.//w:t', namespaces=ns)
		for text_elem in text_elems:
		if text_elem.text:
		text += text_elem.text
		return text

		for para in paragraphs:
		# Skip if already processed
		if id(para) in processed_paras:
		continue

		# Get all direct child runs (not nested runs)
		runs = para.xpath('./w:r', namespaces=ns)
		if not runs:
		continue

		# Find ALL list item runs (runs starting with "- ")
		# Each entry is (index among ALL children of the paragraph, run, first w:t of the run).
		list_item_runs = []
		all_children = list(para)
		for idx, child in enumerate(all_children):
		if child.tag == f"{{{ns['w']}}}r":
		text_elem = child.find('.//w:t', namespaces=ns)
		if text_elem is not None and text_elem.text and text_elem.text.startswith('- '):
		list_item_runs.append((idx, child, text_elem))

		# If we found list items, process each one separately
		if list_item_runs:
		# Get the parent element (usually the document body or table cell)
		parent = para.getparent()
		if parent is None:
		continue

		# Find the position of this paragraph
		para_index = list(parent).index(para)

		# Check if paragraph is inside a table
		is_in_table = bool(para.xpath('ancestor::w:tbl', namespaces=ns))

		# If in table and there are runs before the first list item, update original para pStyle to FP
		# NOTE(review): the index compared here counts every child of the paragraph,
		# so an existing w:pPr in front of the first run also makes it > 0.
		if is_in_table and list_item_runs[0][0] > 0:
		# Get or create pPr for the original paragraph
		# NOTE(review): './/w:pPr' searches all descendants, not only direct children.
		orig_pPr = para.find('.//w:pPr', namespaces=ns)
		if orig_pPr is None:
		orig_pPr = OxmlElement('w:pPr')
		para.insert(0, orig_pPr)
		else:
		# Remove existing pStyle if any
		existing_pStyle = orig_pPr.find('.//w:pStyle', namespaces=ns)
		if existing_pStyle is not None:
		orig_pPr.remove(existing_pStyle)

		# Add FP style
		pStyle = OxmlElement('w:pStyle')
		pStyle.set(f"{{{ns['w']}}}val", "FP")
		orig_pPr.insert(0, pStyle) # Insert at beginning

		# Process each list item run separately
		insert_offset = 0 # Track where to insert new paragraphs
		for list_idx, (run_idx, list_item_run, list_item_text_elem) in enumerate(list_item_runs):
		# Remove the "- " prefix
		list_item_text_elem.text = list_item_text_elem.text[2:]

		# Create a new paragraph for this list item
		new_para = OxmlElement('w:p')

		# Create pPr
		pPr = OxmlElement('w:pPr')

		if is_in_table:
		# Bulleted list structure for table list items
		# pStyle
		pStyle = OxmlElement('w:pStyle')
		pStyle.set(f"{{{ns['w']}}}val", "FP")
		pPr.append(pStyle)

		# keepNext
		keepNext = OxmlElement('w:keepNext')
		pPr.append(keepNext)

		# numPr (for bulleted list)
		# NOTE(review): numId "14" is hard-coded; presumably it refers to a bullet
		# numbering definition in the template's numbering part - TODO confirm.
		numPr = OxmlElement('w:numPr')
		ilvl = OxmlElement('w:ilvl')
		ilvl.set(f"{{{ns['w']}}}val", "0")
		numId = OxmlElement('w:numId')
		numId.set(f"{{{ns['w']}}}val", "14")
		numPr.append(ilvl)
		numPr.append(numId)
		pPr.append(numPr)

		# tabs
		tabs = OxmlElement('w:tabs')
		tab = OxmlElement('w:tab')
		tab.set(f"{{{ns['w']}}}val", "left")
		tab.set(f"{{{ns['w']}}}pos", "3118")
		tabs.append(tab)
		pPr.append(tabs)

		# spacing
		spacing = OxmlElement('w:spacing')
		spacing.set(f"{{{ns['w']}}}before", "80")
		spacing.set(f"{{{ns['w']}}}after", "80")
		pPr.append(spacing)

		# Left alignment
		jc = OxmlElement('w:jc')
		jc.set(f"{{{ns['w']}}}val", "left")
		pPr.append(jc)

		counter_table += 1
		else:
		# Simple structure for regular list items (outside tables)
		pStyle = OxmlElement('w:pStyle')
		pStyle.set(f"{{{ns['w']}}}val", "B1")
		pPr.append(pStyle)

		counter_regular += 1

		new_para.append(pPr)

		# Find runs that belong to this list item
		# From this list item run until the next list item run (or end of paragraph)
		# Indices refer to the all_children snapshot taken before any child was moved.
		start_idx = run_idx
		end_idx = list_item_runs[list_idx + 1][0] if list_idx + 1 < len(list_item_runs) else len(all_children)

		# Move runs for this list item to the new paragraph
		# (every child in the range except a w:pPr, so non-run children move too)
		runs_to_move = []
		for idx in range(start_idx, end_idx):
		child = all_children[idx]
		if child.tag != f"{{{ns['w']}}}pPr":
		runs_to_move.append(child)

		# Remove from original and add to new paragraph
		for run in runs_to_move:
		if run in para:
		para.remove(run)
		new_para.append(run)

		# Now look at subsequent paragraphs in the same parent and merge them
		# until we hit another list item, blank line, or end of parent
		# NOTE(review): the position below depends on insert_offset, which suggests this
		# lookahead runs for every list item of the paragraph, not only the last one -
		# confirm which item following paragraphs should be merged into.
		current_para_pos = para_index + insert_offset + 1
		next_index = current_para_pos
		while next_index < len(parent):
		next_para = parent[next_index]

		# Stop if we hit another list item
		if is_list_item_para(next_para):
		break

		# Stop if we hit a blank line
		# (this also stops on any sibling without direct child runs, e.g. a nested table)
		if is_blank_para(next_para):
		break

		# Stop if paragraph is in a different table cell (different parent)
		# NOTE(review): next_para was just read from parent[next_index], so this
		# comparison looks like it can never be true.
		if next_para.getparent() != parent:
		break

		# Merge this paragraph's runs into the list item paragraph
		for run in list(next_para):
		if run.tag != f"{{{ns['w']}}}pPr":
		next_para.remove(run)
		new_para.append(run)

		# Mark as processed and remove the merged paragraph
		processed_paras.add(id(next_para))
		parent.remove(next_para)
		# Don't increment next_index since we removed an element

		# Insert the new paragraph
		current_para_pos = para_index + insert_offset + 1
		parent.insert(current_para_pos, new_para)
		insert_offset += 1

		# Only remove the original paragraph if it has no content left (only pPr or empty)
		remaining_runs = [c for c in para if c.tag != f"{{{ns['w']}}}pPr"]
		if not remaining_runs:
		processed_paras.add(id(para))
		parent.remove(para)

		print(f'Updated {counter_table} unnumbered list items in tables, {counter_regular} outside tables')

		xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		# Create temp file
		# (the descriptor is closed right away; the file is only opened through zipfile)
		tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
		os.close(tmp_fd)

		try:
		# Write new docx to temp file
		# All parts except word/document.xml are copied; the modified part is added last.
		with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
		for item in zin.infolist():
		if item.filename != "word/document.xml":
		data = zin.read(item.filename)
		zout.writestr(item.filename, data)
		zout.writestr("word/document.xml", xml_data)

		# Write to output file
		shutil.move(tmp_path, docx_output)
		# Set proper permissions (read/write for owner, read for group and others)
		os.chmod(docx_output, 0o644)

		finally:
		# Delete temp file if still existing
		if os.path.exists(tmp_path):
		os.remove(tmp_path)

		def update_table_captions(docx_input, docx_output):
		    """
		    Updates table caption styles from 'TableCaption' to 'TH' in a DOCX file.

		    Every ``w:pStyle`` element in ``word/document.xml`` whose ``w:val`` is
		    ``TableCaption`` is rewritten to ``TH``. All other package parts are
		    copied unchanged. The input is only read; the new package is built in a
		    temporary file and then moved to ``docx_output``.

		    Parameters
		    ----------
		    docx_input : str
		        Path to the input DOCX file.
		    docx_output : str
		        Path to the output DOCX file.
		    """
		    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
		    val_attr = f"{{{ns['w']}}}val"
		    new_style = "TH"

		    # Read the main document part
		    with zipfile.ZipFile(docx_input, 'r') as zin:
		        xml_data = zin.read("word/document.xml")

		    root = etree.fromstring(xml_data)

		    # Restyle every paragraph style reference that points at "TableCaption"
		    matches = root.xpath('.//w:pStyle[@w:val="TableCaption"]', namespaces=ns)
		    for elem in matches:
		        elem.set(val_attr, new_style)
		    counter = len(matches)

		    print(f'Changed style "TableCaption" to "TH" {counter} times')

		    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		    # Create the temp file; the descriptor is closed right away because the
		    # file is only ever opened through zipfile.
		    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
		    os.close(tmp_fd)

		    try:
		        # Copy every part except the document part, then add the new one
		        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
		            for item in zin.infolist():
		                if item.filename != "word/document.xml":
		                    data = zin.read(item.filename)
		                    zout.writestr(item.filename, data)
		            zout.writestr("word/document.xml", xml_data)

		        # Write to output file
		        shutil.move(tmp_path, docx_output)
		        # mkstemp creates the file owner-only; widen to rw-r--r--
		        os.chmod(docx_output, 0o644)

		    finally:
		        # Delete the temp file if it still exists (i.e. the move did not happen)
		        if os.path.exists(tmp_path):
		            os.remove(tmp_path)

		def update_abbreviations(docx_input, docx_output):
		    """
		    Updates the character style 'VerbatimChar' to 'EW' in a DOCX file.

		    Despite the function name, no style called 'Abbreviation' is looked up:
		    every ``w:rStyle`` (run style) element in ``word/document.xml`` whose
		    ``w:val`` is ``VerbatimChar`` is rewritten to ``EW``. All other package
		    parts are copied unchanged. The input is only read; the new package is
		    built in a temporary file and then moved to ``docx_output``.

		    Parameters
		    ----------
		    docx_input : str
		        Path to the input DOCX file.
		    docx_output : str
		        Path to the output DOCX file.
		    """
		    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
		    val_attr = f"{{{ns['w']}}}val"
		    new_style = "EW"

		    # Read the main document part
		    with zipfile.ZipFile(docx_input, 'r') as zin:
		        xml_data = zin.read("word/document.xml")

		    root = etree.fromstring(xml_data)

		    # Restyle every run style reference that points at "VerbatimChar"
		    matches = root.xpath('.//w:rStyle[@w:val="VerbatimChar"]', namespaces=ns)
		    for elem in matches:
		        elem.set(val_attr, new_style)
		    counter = len(matches)

		    print(f'Changed style "VerbatimChar" to "EW" {counter} times')

		    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		    # Create the temp file; the descriptor is closed right away because the
		    # file is only ever opened through zipfile.
		    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
		    os.close(tmp_fd)

		    try:
		        # Copy every part except the document part, then add the new one
		        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
		            for item in zin.infolist():
		                if item.filename != "word/document.xml":
		                    data = zin.read(item.filename)
		                    zout.writestr(item.filename, data)
		            zout.writestr("word/document.xml", xml_data)

		        # Write to output file
		        shutil.move(tmp_path, docx_output)
		        # mkstemp creates the file owner-only; widen to rw-r--r--
		        os.chmod(docx_output, 0o644)

		    finally:
		        # Delete the temp file if it still exists (i.e. the move did not happen)
		        if os.path.exists(tmp_path):
		            os.remove(tmp_path)

		def update_format_styles_cli():
		    """
		    Command-line entry point: apply all format-style updates to a DOCX file.

		    Takes two positional arguments, the input and the output DOCX path, and
		    runs the style updates one after another. The first step reads the
		    input file; every later step reads the output file written by the
		    previous step, so the changes accumulate. Feeding the original input to
		    every step would make each step overwrite the previous result and only
		    the last update would survive.
		    """
		    parser = argparse.ArgumentParser(description="Update format styles in a DOCX file.")
		    parser.add_argument("docx_input", help="Path to input DOCX file")
		    parser.add_argument("docx_output", help="Path to output DOCX file")
		    args = parser.parse_args()

		    # Order kept as in the original call sequence
		    steps = (
		        update_figure_captions,
		        update_heading_styles,
		        update_figure_style,
		        update_unnumbered_lists,
		        update_table_captions,
		        update_abbreviations,
		    )

		    # Each step fully reads its source and builds the result in a temp file
		    # before moving it over the destination, so source == destination is safe.
		    source = args.docx_input
		    for step in steps:
		        step(source, args.docx_output)
		        source = args.docx_output
		No newline at end of file

generateBaseline/setup.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -21,6 +21,7 @@ setup(
		"check_multipage_tables=postprocessing:insert_page_break_before_long_tables",
		#"apply_etsi_styling: postprocessing:postprocess_etsi_styles",
		"update_toc=postprocessing:update_toc_cli",
		"update_format_styles=postprocessing:update_format_styles_cli",
		"refresh_docx_fields=postprocessing:refresh_docx_fields_cli",
		]
		}