# Diff context from generateBaseline/postprocessing.py (+102 −0), kept for
# reference only; it is not part of the definition below.
#
# @@ -29,6 +29,7 @@ DEFAULT_FORMAT_STYLE_UPDATES = {
#     "update_notes": True,
#     "update_references": True,
#     "update_lists": True,
#     "repair_invalid_numbering_references": True,
#     "update_body_text_style": True,
#     "add_no_break_hyphens": True,
#     "update_references_style": True,
#
# @@ -1464,6 +1465,106 @@ def update_lists(docx_input, docx_output):
#         if os.path.exists(tmp_path):
#             os.remove(tmp_path)


def repair_invalid_numbering_references(docx_input, docx_output):
    """
    Repair invalid paragraph numbering references that can make DOCX files
    unreadable in Word.

    The w:numPr element is removed from every paragraph of word/document.xml
    whose numbering reference is invalid. A paragraph numbering reference is
    considered invalid when:
    - w:numPr is missing w:numId or w:ilvl
    - w:numId points to a non-existing w:num in numbering.xml
    - w:ilvl is not defined for the referenced abstract numbering definition

    Two kinds of reference are deliberately left untouched:
    - w:numId with the value "0", which is the reserved marker for "numbering
      removed" and never refers to a w:num; stripping it would re-enable any
      numbering inherited from the paragraph style.
    - references whose abstract definition carries w:numStyleLink, because its
      levels are defined by the linked numbering style rather than by its own
      w:lvl children and cannot be checked from numbering.xml alone.

    Args:
        docx_input: Path of the DOCX file to check.
        docx_output: Path the resulting DOCX file is written to. May be the
            same file as docx_input.

    Returns:
        None. When nothing has to be repaired (or the package has no
        numbering part) the input is copied unchanged to docx_output, unless
        both paths refer to the same file.

    Raises:
        KeyError: If the package does not contain word/document.xml.
    """
    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    ns = {"w": w_ns}
    val_attr = f"{{{w_ns}}}val"
    ilvl_attr = f"{{{w_ns}}}ilvl"

    def write_unchanged_output():
        # Keep the "docx_output exists afterwards" contract on the no-op
        # paths. copyfile refuses to copy a file onto itself, which is
        # exactly the in-place case where nothing has to be done.
        try:
            shutil.copyfile(docx_input, docx_output)
        except shutil.SameFileError:
            pass

    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read("word/document.xml")
        try:
            numbering_data = zin.read("word/numbering.xml")
        except KeyError:
            numbering_root = None
        else:
            numbering_root = etree.fromstring(numbering_data)

    # Nothing to validate if there is no numbering part.
    if numbering_root is None:
        write_unchanged_output()
        return

    root = etree.fromstring(xml_data)

    # abstractNumId -> set of level indices defined directly on the definition.
    abstract_levels = {}
    # abstractNumIds whose levels live in a linked numbering style.
    style_linked_abstracts = set()
    for abstract in numbering_root.xpath('.//w:abstractNum', namespaces=ns):
        abstract_id = abstract.get(f"{{{w_ns}}}abstractNumId")
        if abstract.xpath('./w:numStyleLink', namespaces=ns):
            style_linked_abstracts.add(abstract_id)
        abstract_levels[abstract_id] = {
            lvl.get(ilvl_attr)
            for lvl in abstract.xpath('./w:lvl', namespaces=ns)
            if lvl.get(ilvl_attr) is not None
        }

    # numId -> abstractNumId (None when the w:num has no w:abstractNumId child).
    num_to_abstract = {}
    for num in numbering_root.xpath('.//w:num', namespaces=ns):
        abstract_ref = num.xpath('./w:abstractNumId', namespaces=ns)
        num_to_abstract[num.get(f"{{{w_ns}}}numId")] = (
            abstract_ref[0].get(val_attr) if abstract_ref else None
        )

    repaired = 0
    for para in root.xpath('.//w:p', namespaces=ns):
        ppr = para.xpath('./w:pPr', namespaces=ns)
        if not ppr:
            continue
        ppr = ppr[0]

        numpr = ppr.xpath('./w:numPr', namespaces=ns)
        if not numpr:
            continue
        numpr = numpr[0]

        num_id_elem = numpr.xpath('./w:numId', namespaces=ns)
        ilvl_elem = numpr.xpath('./w:ilvl', namespaces=ns)
        num_id = num_id_elem[0].get(val_attr) if num_id_elem else None

        # numId 0 switches numbering off for the paragraph; it is valid even
        # though no w:num with that id exists (and even without w:ilvl).
        if num_id == "0":
            continue

        if not num_id_elem or not ilvl_elem:
            invalid_ref = True
        else:
            ilvl = ilvl_elem[0].get(val_attr)
            abstract_id = num_to_abstract.get(num_id)
            if abstract_id is None:
                invalid_ref = True
            elif abstract_id in style_linked_abstracts:
                # Levels come from the linked numbering style; not checkable here.
                invalid_ref = False
            else:
                invalid_ref = ilvl not in abstract_levels.get(abstract_id, set())

        if invalid_ref:
            ppr.remove(numpr)
            repaired += 1

    if repaired == 0:
        print("List numbering validation: no invalid numbering references found")
        write_unchanged_output()
        return

    print(f"List numbering validation: repaired {repaired} invalid numbering reference(s)")

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # Build the new package in a temp file so that docx_output may be the
    # same file as docx_input.
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)
    try:
        with zipfile.ZipFile(docx_input, 'r') as zin, \
                zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    data = zin.read(item.filename)
                    zout.writestr(item.filename, data)
            zout.writestr("word/document.xml", xml_data)
        shutil.move(tmp_path, docx_output)
        os.chmod(docx_output, 0o644)
    finally:
        # Only still present when something failed before the move.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# Diff context following the definition above (kept for reference only):
#
# def update_table_captions(docx_input, docx_output):
#     """
#     Updates table caption styles from 'TableCaption' to 'TF' in a DOCX file.
#
# @@ -2791,6 +2892,7 @@ def update_format_styles_cli():
#     ("update_notes", update_notes),
#     ("update_references", update_references),
#     ("update_lists", update_lists),
#     ("repair_invalid_numbering_references", repair_invalid_numbering_references),
#     ("update_body_text_style", update_body_text_style),
#     ("add_no_break_hyphens", add_no_break_hyphens),
#     ("update_references_style", update_references_style),
# Diff context from generateBaseline/postprocessing.py (+102 −0), kept for
# reference only; it is not part of the definition below.
#
# @@ -29,6 +29,7 @@ DEFAULT_FORMAT_STYLE_UPDATES = {
#     "update_notes": True,
#     "update_references": True,
#     "update_lists": True,
#     "repair_invalid_numbering_references": True,
#     "update_body_text_style": True,
#     "add_no_break_hyphens": True,
#     "update_references_style": True,
#
# @@ -1464,6 +1465,106 @@ def update_lists(docx_input, docx_output):
#         if os.path.exists(tmp_path):
#             os.remove(tmp_path)


def repair_invalid_numbering_references(docx_input, docx_output):
    """
    Repair invalid paragraph numbering references that can make DOCX files
    unreadable in Word.

    The w:numPr element is removed from every paragraph of word/document.xml
    whose numbering reference is invalid. A paragraph numbering reference is
    considered invalid when:
    - w:numPr is missing w:numId or w:ilvl
    - w:numId points to a non-existing w:num in numbering.xml
    - w:ilvl is not defined for the referenced abstract numbering definition

    Two kinds of reference are deliberately left untouched:
    - w:numId with the value "0", which is the reserved marker for "numbering
      removed" and never refers to a w:num; stripping it would re-enable any
      numbering inherited from the paragraph style.
    - references whose abstract definition carries w:numStyleLink, because its
      levels are defined by the linked numbering style rather than by its own
      w:lvl children and cannot be checked from numbering.xml alone.

    Args:
        docx_input: Path of the DOCX file to check.
        docx_output: Path the resulting DOCX file is written to. May be the
            same file as docx_input.

    Returns:
        None. When nothing has to be repaired (or the package has no
        numbering part) the input is copied unchanged to docx_output, unless
        both paths refer to the same file.

    Raises:
        KeyError: If the package does not contain word/document.xml.
    """
    w_ns = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    ns = {"w": w_ns}
    val_attr = f"{{{w_ns}}}val"
    ilvl_attr = f"{{{w_ns}}}ilvl"

    def write_unchanged_output():
        # Keep the "docx_output exists afterwards" contract on the no-op
        # paths. copyfile refuses to copy a file onto itself, which is
        # exactly the in-place case where nothing has to be done.
        try:
            shutil.copyfile(docx_input, docx_output)
        except shutil.SameFileError:
            pass

    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read("word/document.xml")
        try:
            numbering_data = zin.read("word/numbering.xml")
        except KeyError:
            numbering_root = None
        else:
            numbering_root = etree.fromstring(numbering_data)

    # Nothing to validate if there is no numbering part.
    if numbering_root is None:
        write_unchanged_output()
        return

    root = etree.fromstring(xml_data)

    # abstractNumId -> set of level indices defined directly on the definition.
    abstract_levels = {}
    # abstractNumIds whose levels live in a linked numbering style.
    style_linked_abstracts = set()
    for abstract in numbering_root.xpath('.//w:abstractNum', namespaces=ns):
        abstract_id = abstract.get(f"{{{w_ns}}}abstractNumId")
        if abstract.xpath('./w:numStyleLink', namespaces=ns):
            style_linked_abstracts.add(abstract_id)
        abstract_levels[abstract_id] = {
            lvl.get(ilvl_attr)
            for lvl in abstract.xpath('./w:lvl', namespaces=ns)
            if lvl.get(ilvl_attr) is not None
        }

    # numId -> abstractNumId (None when the w:num has no w:abstractNumId child).
    num_to_abstract = {}
    for num in numbering_root.xpath('.//w:num', namespaces=ns):
        abstract_ref = num.xpath('./w:abstractNumId', namespaces=ns)
        num_to_abstract[num.get(f"{{{w_ns}}}numId")] = (
            abstract_ref[0].get(val_attr) if abstract_ref else None
        )

    repaired = 0
    for para in root.xpath('.//w:p', namespaces=ns):
        ppr = para.xpath('./w:pPr', namespaces=ns)
        if not ppr:
            continue
        ppr = ppr[0]

        numpr = ppr.xpath('./w:numPr', namespaces=ns)
        if not numpr:
            continue
        numpr = numpr[0]

        num_id_elem = numpr.xpath('./w:numId', namespaces=ns)
        ilvl_elem = numpr.xpath('./w:ilvl', namespaces=ns)
        num_id = num_id_elem[0].get(val_attr) if num_id_elem else None

        # numId 0 switches numbering off for the paragraph; it is valid even
        # though no w:num with that id exists (and even without w:ilvl).
        if num_id == "0":
            continue

        if not num_id_elem or not ilvl_elem:
            invalid_ref = True
        else:
            ilvl = ilvl_elem[0].get(val_attr)
            abstract_id = num_to_abstract.get(num_id)
            if abstract_id is None:
                invalid_ref = True
            elif abstract_id in style_linked_abstracts:
                # Levels come from the linked numbering style; not checkable here.
                invalid_ref = False
            else:
                invalid_ref = ilvl not in abstract_levels.get(abstract_id, set())

        if invalid_ref:
            ppr.remove(numpr)
            repaired += 1

    if repaired == 0:
        print("List numbering validation: no invalid numbering references found")
        write_unchanged_output()
        return

    print(f"List numbering validation: repaired {repaired} invalid numbering reference(s)")

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # Build the new package in a temp file so that docx_output may be the
    # same file as docx_input.
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)
    try:
        with zipfile.ZipFile(docx_input, 'r') as zin, \
                zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    data = zin.read(item.filename)
                    zout.writestr(item.filename, data)
            zout.writestr("word/document.xml", xml_data)
        shutil.move(tmp_path, docx_output)
        os.chmod(docx_output, 0o644)
    finally:
        # Only still present when something failed before the move.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


# Diff context following the definition above (kept for reference only):
#
# def update_table_captions(docx_input, docx_output):
#     """
#     Updates table caption styles from 'TableCaption' to 'TF' in a DOCX file.
#
# @@ -2791,6 +2892,7 @@ def update_format_styles_cli():
#     ("update_notes", update_notes),
#     ("update_references", update_references),
#     ("update_lists", update_lists),
#     ("repair_invalid_numbering_references", repair_invalid_numbering_references),
#     ("update_body_text_style", update_body_text_style),
#     ("add_no_break_hyphens", add_no_break_hyphens),
#     ("update_references_style", update_references_style),