Add validation of processed lists in the document (1c3272ac) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

generateBaseline/postprocessing.py

+102 −0

Original line number	Diff line number	Diff line
		@@ -29,6 +29,7 @@ DEFAULT_FORMAT_STYLE_UPDATES = {
		"update_notes": True,
		"update_references": True,
		"update_lists": True,
		"repair_invalid_numbering_references": True,
		"update_body_text_style": True,
		"add_no_break_hyphens": True,
		"update_references_style": True,
		@@ -1464,6 +1465,106 @@ def update_lists(docx_input, docx_output):
		if os.path.exists(tmp_path):
		os.remove(tmp_path)


		def repair_invalid_numbering_references(docx_input, docx_output):
		    """
		    Repair invalid paragraph numbering references that can make DOCX files
		    unreadable in Word.

		    A paragraph numbering reference is considered invalid when:
		    - w:numPr is missing w:numId or w:ilvl
		    - w:numId points to a non-existing w:num in numbering.xml
		    - w:ilvl is not defined for the referenced abstract numbering definition

		    A w:numId whose value is "0" is never treated as invalid: that value is
		    reserved to mean "numbering removed for this paragraph" and by design has
		    no matching w:num. Deleting such a w:numPr would let the paragraph inherit
		    numbering from its style again.

		    Only w:numPr elements that are direct children of a paragraph's own w:pPr
		    in word/document.xml are inspected. Each invalid w:numPr is removed.

		    Args:
		        docx_input: Path of the DOCX file to read.
		        docx_output: Path the repaired DOCX file is written to.

		    Returns:
		        None. docx_output is only (re)written when at least one reference was
		        repaired; if the package has no word/numbering.xml, or nothing needs
		        repairing, the function returns without touching docx_output.
		    """
		    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

		    # Fully qualified attribute names, built once instead of on every lookup.
		    w_val = f"{{{ns['w']}}}val"
		    w_ilvl = f"{{{ns['w']}}}ilvl"
		    w_num_id = f"{{{ns['w']}}}numId"
		    w_abstract_num_id = f"{{{ns['w']}}}abstractNumId"

		    with zipfile.ZipFile(docx_input, 'r') as zin:
		        xml_data = zin.read("word/document.xml")
		        try:
		            numbering_data = zin.read("word/numbering.xml")
		            numbering_root = etree.fromstring(numbering_data)
		        except KeyError:
		            # ZipFile.read raises KeyError when the member does not exist.
		            numbering_root = None

		    # Nothing to validate if there is no numbering part.
		    if numbering_root is None:
		        return

		    root = etree.fromstring(xml_data)

		    # abstractNumId -> set of level indices (as strings) it defines.
		    abstract_levels = {}
		    for abstract in numbering_root.xpath('.//w:abstractNum', namespaces=ns):
		        abstract_id = abstract.get(w_abstract_num_id)
		        levels = set()
		        for lvl in abstract.xpath('./w:lvl', namespaces=ns):
		            ilvl = lvl.get(w_ilvl)
		            if ilvl is not None:
		                levels.add(ilvl)
		        abstract_levels[abstract_id] = levels

		    # numId -> abstractNumId it refers to (None if the w:num has no
		    # w:abstractNumId child).
		    num_to_abstract = {}
		    for num in numbering_root.xpath('.//w:num', namespaces=ns):
		        num_id = num.get(w_num_id)
		        abstract_id_elem = num.xpath('./w:abstractNumId', namespaces=ns)
		        abstract_id = abstract_id_elem[0].get(w_val) if abstract_id_elem else None
		        num_to_abstract[num_id] = abstract_id

		    repaired = 0
		    for para in root.xpath('.//w:p', namespaces=ns):
		        ppr = para.xpath('./w:pPr', namespaces=ns)
		        if not ppr:
		            continue
		        ppr = ppr[0]

		        numpr = ppr.xpath('./w:numPr', namespaces=ns)
		        if not numpr:
		            continue
		        numpr = numpr[0]

		        num_id_elem = numpr.xpath('./w:numId', namespaces=ns)
		        ilvl_elem = numpr.xpath('./w:ilvl', namespaces=ns)

		        # numId "0" explicitly switches numbering off for the paragraph; it
		        # is a valid reference even though no w:num with that id exists.
		        if num_id_elem and num_id_elem[0].get(w_val) == "0":
		            continue

		        invalid_ref = False
		        if not num_id_elem or not ilvl_elem:
		            invalid_ref = True
		        else:
		            num_id = num_id_elem[0].get(w_val)
		            ilvl = ilvl_elem[0].get(w_val)
		            abstract_id = num_to_abstract.get(num_id)
		            valid_levels = abstract_levels.get(abstract_id, set())
		            if abstract_id is None or ilvl not in valid_levels:
		                invalid_ref = True

		        if invalid_ref:
		            ppr.remove(numpr)
		            repaired += 1

		    if repaired == 0:
		        print("List numbering validation: no invalid numbering references found")
		        return

		    print(f"List numbering validation: repaired {repaired} invalid numbering reference(s)")

		    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		    # Build the new package in a temporary file so docx_output is replaced
		    # only after the whole archive has been written.
		    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
		    os.close(tmp_fd)

		    try:
		        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
		            # Copy every part unchanged except the main document, which is
		            # replaced by the repaired XML.
		            for item in zin.infolist():
		                if item.filename != "word/document.xml":
		                    data = zin.read(item.filename)
		                    zout.writestr(item.filename, data)
		            zout.writestr("word/document.xml", xml_data)

		        shutil.move(tmp_path, docx_output)
		        # mkstemp creates the file with owner-only permissions; widen them
		        # for the final document.
		        os.chmod(docx_output, 0o644)
		    finally:
		        # Only still present if something failed before the move.
		        if os.path.exists(tmp_path):
		            os.remove(tmp_path)

		def update_table_captions(docx_input, docx_output):
		"""
		Updates table caption styles from 'TableCaption' to 'TF' in a DOCX file.
		@@ -2791,6 +2892,7 @@ def update_format_styles_cli():
		("update_notes", update_notes),
		("update_references", update_references),
		("update_lists", update_lists),
		("repair_invalid_numbering_references", repair_invalid_numbering_references),
		("update_body_text_style", update_body_text_style),
		("add_no_break_hyphens", add_no_break_hyphens),
		("update_references_style", update_references_style),