Adding support for the alternative approach for abbreviations (0db8bef5) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

generateBaseline/postprocessing.py

+88 −36

Original line number	Diff line number	Diff line
		@@ -664,6 +664,57 @@ def table_widths_adjustment(config):
		cell.width = width
		doc.save(docx_path)

		def _paragraph_text(elem):
		    """
		    Return the stripped text content of a WordprocessingML paragraph.

		    The text is assembled from the direct ``w:r`` children of *elem*: each
		    non-empty ``w:t`` child contributes its text, and each ``w:tab`` or
		    ``w:br`` child contributes a single space. This mirrors the heading
		    layout produced by update_heading_styles (number + <w:tab/> + title in
		    separate run parts).

		    If the direct runs yield nothing, the text nodes of every descendant
		    ``w:t`` are concatenated instead.

		    Returns "" when *elem* is None or is not a ``w:p`` element.
		    """
		    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
		    if elem is None:
		        return ""
		    if elem.tag != f"{{{ns['w']}}}p":
		        return ""

		    text_tag = f"{{{ns['w']}}}t"
		    spacer_tags = (f"{{{ns['w']}}}tab", f"{{{ns['w']}}}br")

		    pieces = []
		    for run in elem.xpath('./w:r', namespaces=ns):
		        for node in run:
		            if node.tag == text_tag:
		                if node.text:
		                    pieces.append(node.text)
		            elif node.tag in spacer_tags:
		                # Tabs and line breaks separate words, so keep a space for them.
		                pieces.append(" ")

		    if not pieces:
		        # Fallback for paragraphs containing text in non-direct run descendants.
		        pieces = elem.xpath('.//w:t/text()', namespaces=ns)
		    return ''.join(pieces).strip()


		def _paragraph_style_val(para, ns):
		    """
		    Return the ``w:val`` attribute of the paragraph's ``w:pStyle`` element.

		    *para* is searched for its first ``w:pPr`` child, and that element for
		    its first ``w:pStyle`` child; *ns* must map ``'w'`` to the namespace URI
		    used to build the qualified names.

		    Returns None when the paragraph has no ``w:pPr``, the ``w:pPr`` has no
		    ``w:pStyle``, or the ``w:pStyle`` carries no ``w:val`` attribute.
		    """
		    def qualified(local_name):
		        # Clark notation: {namespace-uri}local-name
		        return f"{{{ns['w']}}}{local_name}"

		    properties = para.find(qualified('pPr'))
		    style = properties.find(qualified('pStyle')) if properties is not None else None
		    return style.get(qualified('val')) if style is not None else None


		def _normalize_ws(text):
		    """Replace non-breaking spaces, collapse whitespace runs to one space, and trim."""
		    without_nbsp = text.replace('\u00A0', ' ')
		    collapsed = re.sub(r'\s+', ' ', without_nbsp)
		    return collapsed.strip()


		def _is_in_abbreviations_clause(para, ns):
		    """
		    Tell whether *para* sits under an "Abbreviations" Heading2.

		    Walks backwards over the preceding siblings of *para*, skipping anything
		    that is not a ``w:p`` element, until the first paragraph whose style
		    value is ``Heading2``. The result is True when that heading's
		    whitespace-normalized text contains the whole word "abbreviations"
		    (case-insensitive), and False otherwise, including when no Heading2
		    sibling precedes *para*.

		    NOTE(review): only preceding siblings are inspected and only
		    ``Heading2`` ends the search; headings of other levels are walked
		    past. Confirm this is the intended clause boundary.
		    """
		    paragraph_tag = f"{{{ns['w']}}}p"
		    for sibling in para.itersiblings(paragraph_tag, preceding=True):
		        if _paragraph_style_val(sibling, ns) != 'Heading2':
		            continue
		        # The nearest Heading2 decides; never look further back.
		        title = _normalize_ws(_paragraph_text(sibling))
		        return re.search(r'\babbreviations\b', title, flags=re.IGNORECASE) is not None
		    return False


		def update_figure_captions(docx_input, docx_output):
		@@ -2053,6 +2104,8 @@ def update_notes(docx_input, docx_output):

		prev_pStyle_val = prev_pStyle[0].get(f"{{{ns['w']}}}val")
		if prev_pStyle_val == example_style:
		# EW in the Abbreviations clause is an abbreviation entry, not an example.
		if not _is_in_abbreviations_clause(para, ns):
		pStyle_elem.set(f"{{{ns['w']}}}val", "PL")
		print(f'Changed style "BlockText" to "PL" because it is preceded by a paragraph with style "EW" (line {prev_para.sourceline})')
		break
		@@ -2110,7 +2163,6 @@ def update_abbreviations(docx_input, docx_output):

		# Find all paragraphs with BodyText style
		paragraphs_to_process = root.xpath('.//w:p[w:pPr/w:pStyle[@w:val="BodyText"]]', namespaces=ns)

		for para in paragraphs_to_process:
		# Check if this paragraph contains runs with Verbatim or VerbatimChar style
		verbatim_runs = para.xpath('./w:r[w:rPr/w:rStyle[@w:val="Verbatim" or @w:val="VerbatimChar"]]', namespaces=ns)
		@@ -2118,6 +2170,9 @@ def update_abbreviations(docx_input, docx_output):
		if not verbatim_runs:
		continue

		if not _is_in_abbreviations_clause(para, ns):
		continue

		# Get the parent element to insert new paragraphs
		parent = para.getparent()
		if parent is None:
		@@ -2128,20 +2183,30 @@ def update_abbreviations(docx_input, docx_output):

		# Process each verbatim run separately
		new_paragraphs = []
		option2_definition = False
		for verbatim_run in verbatim_runs:
		# Extract text from this specific verbatim run
		full_text = ""
		full_text_in_verbatim_run = ""
		text_elems = verbatim_run.xpath('.//w:t', namespaces=ns)
		for text_elem in text_elems:
		if text_elem.text:
		full_text += text_elem.text

		full_text_in_verbatim_run += text_elem.text
		# Two options for abbreviations:
		# 1. `ABBR Abbreviation`
		# - abbreviation followed by 2+ spaces followed by definition (usually all in a verbatim run)
		# 2. `ABBR` Abbreviation
		# - abbreviation (in a verbatim run) followed by definition (in the remaining runs of the paragraph) separated by one or more spaces
		# Check if text contains multiple spaces (2 or more) separating two parts
		# Pattern: abbreviation followed by 2+ spaces followed by definition
		match = re.match(r'^(.+?)\s{2,}(.+)$', full_text.strip())
		match = re.match(r'^(.+?)\s{2,}(.+)$', full_text_in_verbatim_run.strip())
		if not match:
		continue

		# Option 2: abbreviation (in a verbatim run) followed by definition (in the remaining runs of the paragraph) separated by one or more spaces
		# Take all remaining runs of the paragraph as they are, except the verbatim run
		remaining_runs = [run for run in para.xpath('./w:r', namespaces=ns) if run != verbatim_run]
		abbreviation = full_text_in_verbatim_run.strip()
		option2_definition = True
		#definition = ''.join([run.xpath('.//w:t/text()', namespaces=ns)[0].text for run in remaining_runs]).strip()
		else:
		abbreviation = match.group(1).strip()
		definition = match.group(2).strip()

		@@ -2168,13 +2233,17 @@ def update_abbreviations(docx_input, docx_output):
		run_tab.append(tab)
		new_para.append(run_tab)

		if not option2_definition:
		# Create second run with definition
		run2 = OxmlElement('w:r')
		t2 = OxmlElement('w:t')
		t2.text = definition
		run2.append(t2)
		new_para.append(run2)

		else:
		# Create second run with definition, append all remaining runs of the paragraph
		for run in remaining_runs:
		new_para.append(run)
		new_paragraphs.append(new_para)
		counter += 1

		@@ -2779,23 +2848,6 @@ def add_break_after_code_blocks_and_tables(docx_input, docx_output):
		return None
		return style_elems[0].get(w_val)

		def _paragraph_text(elem):
		    """
		    Return the stripped text of paragraph *elem*, or "" if it is not one.

		    *elem* counts as a paragraph when its tag equals ``w_p`` from the
		    enclosing scope. Text comes from the ``w:t`` children of the direct
		    runs, with one space per ``w:tab`` / ``w:br`` child; if that yields
		    nothing, all descendant ``w:t`` text nodes are joined instead.
		    """
		    is_paragraph = elem is not None and elem.tag == w_p
		    if not is_paragraph:
		        return ""

		    # Headings are created in update_heading_styles as
		    # number + <w:tab/> + title in separate run parts, so walk run children.
		    text_tag = f"{{{ns['w']}}}t"
		    spacer_tags = (f"{{{ns['w']}}}tab", f"{{{ns['w']}}}br")
		    collected = []
		    for run in elem.xpath('./w:r', namespaces=ns):
		        for node in run:
		            if node.tag == text_tag:
		                if node.text:
		                    collected.append(node.text)
		            elif node.tag in spacer_tags:
		                collected.append(" ")

		    if collected:
		        return ''.join(collected).strip()
		    # Fallback for paragraphs containing text in non-direct run descendants.
		    return ''.join(elem.xpath('.//w:t/text()', namespaces=ns)).strip()

		def _normalize_ws(text):
		    # Heading matching should not depend on tabs, non-breaking spaces or
		    # repeated spaces: turn each whitespace run into one space and trim.
		    spaced = text.replace('\u00A0', ' ')
		    return re.sub(r'\s+', ' ', spaced).strip()