Another try to fix lists (65745d38) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

generateBaseline/postprocessing.py

+52 −4

Original line number	Diff line number	Diff line
		@@ -758,8 +758,8 @@ def update_heading_styles(docx_input, docx_output):
		# Insert page break before "Scope" Heading1 and "History" Heading1
		if re.match(r'^1\s+Scope\s*', full_text):
		insert_page_break_before_heading(para, full_text)
		# Annex heading: "Annex A: Title" or "Annex A (informative): Title"
		elif (match := re.match(r'^(Annex\s[A-Z](?:\s$(?:informative\|normative\|informative or normative\|normative or informative)$)?:)\s(.+)$', full_text, re.IGNORECASE)) is not None:
		# Annex heading: "Annex A: Title" or "Annex A (informative): Title" or "Annex: Title"
		elif (match := re.match(r'^(Annex\s[A-Z](?:\s$(?:informative\|normative\|informative or normative\|normative or informative)$)?:\|Annex:\s)\s*(.+)$', full_text, re.IGNORECASE)) is not None:
		annex_letter = match.group(1)
		annex_title = match.group(2)

		@@ -1050,6 +1050,45 @@ def update_lists(docx_input, docx_output):
		old_id = abstract.get(qn('w:val'))
		abstract.set(qn('w:val'), abstract_id_bn)

		def split_run_with_breaks(para, run):
		    """Split ``run`` so that every ``w:br`` / ``w:cr`` sits alone in its own run.

		    The direct children of ``run`` are regrouped in document order:
		    consecutive non-break children share one new ``w:r`` element, and each
		    break element gets a ``w:r`` of its own. Every new run receives a deep
		    copy of the original run's direct ``w:rPr`` child, if it has one. The
		    original run is then replaced in ``para`` by the new runs, starting at
		    the position the original occupied.

		    Args:
		        para: Element that directly contains ``run``.
		        run: The ``w:r`` element to split.

		    Returns:
		        None. ``para`` is modified in place. Nothing happens when ``run`` is
		        not a direct child of ``para`` or has no direct break child.
		    """
		    import copy

		    w_ns = ns['w']
		    rpr_tag = f"{{{w_ns}}}rPr"
		    break_tags = (f"{{{w_ns}}}br", f"{{{w_ns}}}cr")

		    try:
		        index = list(para).index(run)
		    except ValueError:
		        # The run is not a direct child of this paragraph; leave it alone.
		        return

		    # Snapshot the children first: appending them to new runs below moves
		    # them out of the original run.
		    content = [el for el in run if el.tag != rpr_tag]

		    # Nothing to split: keep the run untouched instead of rebuilding it
		    # (or dropping it entirely when it only holds run properties).
		    if not any(el.tag in break_tags for el in content):
		        return

		    # Only the run's own (direct child) properties are copied. A descendant
		    # search could pick up an rPr that belongs to nested content.
		    run_props = run.find(rpr_tag)

		    def create_run(elems):
		        """Build a new ``w:r`` holding ``elems`` plus a copy of the run properties."""
		        r = OxmlElement('w:r')
		        if run_props is not None:
		            r.append(copy.deepcopy(run_props))
		        for el in elems:
		            r.append(el)
		        return r

		    new_runs = []
		    pending = []
		    for el in content:
		        if el.tag in break_tags:
		            # Flush whatever preceded the break, then isolate the break.
		            if pending:
		                new_runs.append(create_run(pending))
		                pending = []
		            new_runs.append(create_run([el]))
		        else:
		            pending.append(el)

		    if pending:
		        new_runs.append(create_run(pending))

		    # Swap the original run for the new ones at the same position.
		    para.remove(run)
		    for offset, new_run in enumerate(new_runs):
		        para.insert(index + offset, new_run)

		for para in paragraphs:
		# Skip if already processed
		if id(para) in processed_paras:
		@@ -1144,6 +1183,12 @@ def update_lists(docx_input, docx_output):
		counter_compact += 1
		continue

		# Pre-process: split runs with breaks
		for child in list(para):
		if child.tag == f"{{{ns['w']}}}r":
		if any(node.tag in (f"{{{ns['w']}}}br", f"{{{ns['w']}}}cr") for node in child):
		split_run_with_breaks(para, child)

		# Get all direct child runs (not nested runs)
		runs = para.xpath('./w:r', namespaces=ns)
		if not runs:
		@@ -1158,9 +1203,12 @@ def update_lists(docx_input, docx_output):
		if child.tag == f"{{{ns['w']}}}r":
		text_elem = child.find('.//w:t', namespaces=ns)

		if is_start_of_line and text_elem is not None and text_elem.text:
		if text_elem is not None and text_elem.text:
		text_stripped = text_elem.text.lstrip()
		if text_stripped.startswith('- ') or re.match(r'^\d+\.\s', text_stripped):
		is_numbered_candidate = bool(re.match(r'^\d+\.\s', text_stripped))
		is_bullet_candidate = text_stripped.startswith('- ')

		if is_numbered_candidate or (is_bullet_candidate and is_start_of_line):
		list_item_runs.append((idx, child, text_elem))

		# Update state