Commit 1c3272ac authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

Add validation of processed lists in the document

parent 12fd2c30
Loading
Loading
Loading
Loading
Loading
+102 −0
Original line number Diff line number Diff line
@@ -29,6 +29,7 @@ DEFAULT_FORMAT_STYLE_UPDATES = {
    "update_notes": True,
    "update_references": True,
    "update_lists": True,
    "repair_invalid_numbering_references": True,
    "update_body_text_style": True,
    "add_no_break_hyphens": True,
    "update_references_style": True,
@@ -1464,6 +1465,106 @@ def update_lists(docx_input, docx_output):
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _numbering_levels_by_num_id(numbering_root, ns):
    """
    Map every w:num id found in numbering.xml to the set of w:ilvl values
    defined by the abstract numbering definition that w:num points to.

    A w:num whose w:abstractNumId is missing, or points to an abstract
    definition that does not exist, maps to an empty set, so no level of
    it is ever considered valid. Entries without an id attribute are
    skipped because nothing can reference them.
    """
    w_ns = ns["w"]
    w_val = f"{{{w_ns}}}val"
    w_ilvl = f"{{{w_ns}}}ilvl"

    levels_by_abstract = {}
    for abstract in numbering_root.xpath('.//w:abstractNum', namespaces=ns):
        abstract_id = abstract.get(f"{{{w_ns}}}abstractNumId")
        if abstract_id is None:
            continue
        declared = (lvl.get(w_ilvl) for lvl in abstract.xpath('./w:lvl', namespaces=ns))
        levels_by_abstract[abstract_id] = {ilvl for ilvl in declared if ilvl is not None}

    levels_by_num = {}
    for num in numbering_root.xpath('.//w:num', namespaces=ns):
        num_id = num.get(f"{{{w_ns}}}numId")
        if num_id is None:
            continue
        link = num.find('w:abstractNumId', namespaces=ns)
        abstract_id = link.get(w_val) if link is not None else None
        levels_by_num[num_id] = levels_by_abstract.get(abstract_id, set())

    return levels_by_num


def _is_numbering_removal_id(num_id):
    """
    Return True when a w:numId value is the reserved id 0.

    In WordprocessingML, numId 0 never refers to a w:num; it explicitly
    switches numbering off for the paragraph (typically to override
    numbering inherited from the paragraph style).
    """
    try:
        return int(num_id) == 0
    except (TypeError, ValueError):
        return False


def _numpr_reference_is_valid(numpr, levels_by_num, ns):
    """
    Decide whether a paragraph-level w:numPr element is a usable reference.

    Valid means either the explicit "no numbering" marker (numId 0), or a
    w:numId/w:ilvl pair that resolves to a level defined in numbering.xml.
    """
    w_val = f"{{{ns['w']}}}val"

    num_id_elem = numpr.find('w:numId', namespaces=ns)
    if num_id_elem is None:
        return False

    num_id = num_id_elem.get(w_val)
    # Must be checked before the w:ilvl rules: removing a numId=0 override
    # would re-enable numbering inherited from the paragraph style.
    if _is_numbering_removal_id(num_id):
        return True

    ilvl_elem = numpr.find('w:ilvl', namespaces=ns)
    if ilvl_elem is None:
        return False

    return ilvl_elem.get(w_val) in levels_by_num.get(num_id, ())


def _copy_docx_unchanged(docx_input, docx_output):
    """
    Make docx_output a copy of docx_input; do nothing when both paths
    designate the same file (in-place processing).
    """
    try:
        shutil.copyfile(docx_input, docx_output)
    except shutil.SameFileError:
        pass


def repair_invalid_numbering_references(docx_input, docx_output):
    """
    Repair invalid paragraph numbering references that can make DOCX files
    unreadable in Word.

    A paragraph numbering reference is considered invalid when:
    - w:numPr is missing w:numId or w:ilvl
    - w:numId points to a non-existing w:num in numbering.xml
    - w:ilvl is not defined for the referenced abstract numbering definition

    A w:numPr whose w:numId is 0 is always kept: that value is the explicit
    "no numbering" marker, not a reference into numbering.xml.

    Invalid w:numPr elements are removed from their w:pPr; the rest of the
    paragraph is left untouched. Only w:numPr elements that are direct
    children of a paragraph's own w:pPr are inspected.

    Args:
        docx_input: Path of the DOCX file to read.
        docx_output: Path the resulting DOCX is written to. May be the same
            as docx_input. When nothing has to be repaired (or the package
            has no word/numbering.xml) the input is copied there unchanged.

    Returns:
        None. A one-line summary is printed when validation ran.

    Raises:
        KeyError: if the package has no word/document.xml.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read("word/document.xml")
        try:
            numbering_data = zin.read("word/numbering.xml")
        except KeyError:
            numbering_data = None

    # Nothing to validate if there is no numbering part.
    if numbering_data is None:
        _copy_docx_unchanged(docx_input, docx_output)
        return

    levels_by_num = _numbering_levels_by_num_id(etree.fromstring(numbering_data), ns)
    root = etree.fromstring(xml_data)

    repaired = 0
    for para in root.xpath('.//w:p', namespaces=ns):
        ppr = para.find('w:pPr', namespaces=ns)
        if ppr is None:
            continue

        numpr = ppr.find('w:numPr', namespaces=ns)
        if numpr is None:
            continue

        if not _numpr_reference_is_valid(numpr, levels_by_num, ns):
            ppr.remove(numpr)
            repaired += 1

    if repaired == 0:
        print("List numbering validation: no invalid numbering references found")
        _copy_docx_unchanged(docx_input, docx_output)
        return

    print(f"List numbering validation: repaired {repaired} invalid numbering reference(s)")

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # Build the new package in a temp file so docx_input is still readable
    # while copying, even when docx_output is the same path.
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)

    try:
        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    data = zin.read(item.filename)
                    zout.writestr(item.filename, data)
            zout.writestr("word/document.xml", xml_data)

        shutil.move(tmp_path, docx_output)
        # mkstemp creates the file as 0600; restore regular read permissions.
        os.chmod(docx_output, 0o644)
    finally:
        # Only still present if an error occurred before the move.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def update_table_captions(docx_input, docx_output):
    """
    Updates table caption styles from 'TableCaption' to 'TF' in a DOCX file.
@@ -2791,6 +2892,7 @@ def update_format_styles_cli():
        ("update_notes", update_notes),
        ("update_references", update_references),
        ("update_lists", update_lists),
        ("repair_invalid_numbering_references", repair_invalid_numbering_references),
        ("update_body_text_style", update_body_text_style),
        ("add_no_break_hyphens", add_no_break_hyphens),
        ("update_references_style", update_references_style),