Commit 65745d38 authored by Miguel Angel Reina Ortega
Browse files

Another try to fix lists

parent c9231aaa
Loading
Loading
Loading
Loading
Loading
+52 −4
Original line number Diff line number Diff line
@@ -758,8 +758,8 @@ def update_heading_styles(docx_input, docx_output):
            # Insert page break before "Scope" Heading1 and "History" Heading1
            if re.match(r'^1\s+Scope\s*', full_text):
                insert_page_break_before_heading(para, full_text)
        # Annex heading: "Annex A: Title" or "Annex A (informative): Title"
        elif (match := re.match(r'^(Annex\s[A-Z](?:\s*\((?:informative|normative|informative or normative|normative or informative)\))?:)\s*(.+)$', full_text, re.IGNORECASE)) is not None:
        # Annex heading: "Annex A: Title" or "Annex A (informative): Title" or "Annex: Title"
        elif (match := re.match(r'^(Annex\s[A-Z](?:\s*\((?:informative|normative|informative or normative|normative or informative)\))?:|Annex:\s*)\s*(.+)$', full_text, re.IGNORECASE)) is not None:
            annex_letter = match.group(1)
            annex_title = match.group(2)
            
@@ -1050,6 +1050,45 @@ def update_lists(docx_input, docx_output):
            old_id = abstract.get(qn('w:val'))
            abstract.set(qn('w:val'), abstract_id_bn)

    def split_run_with_breaks(para, run):
        """Split ``run`` so that every ``w:br`` / ``w:cr`` sits in its own run.

        The direct children of ``run`` (ignoring its ``w:rPr``) are partitioned
        into consecutive groups: each break element forms a group of its own,
        and the elements between breaks form one group each. Every group is
        moved into a freshly created ``w:r`` that receives a deep copy of the
        original run properties, and the new runs take the place of ``run``
        inside ``para`` in the same order.

        Args:
            para: Parent element that directly contains ``run``.
            run: The ``w:r`` element to split.

        Returns:
            None. ``para`` is modified in place. Nothing is changed when
            ``run`` is not a direct child of ``para`` or when the run does not
            need splitting (zero or one group of content).
        """
        import copy

        w_ns = ns['w']
        props_tag = f"{{{w_ns}}}rPr"
        break_tags = (f"{{{w_ns}}}br", f"{{{w_ns}}}cr")

        try:
            index = para.index(run)
        except ValueError:
            # The run is not a direct child of this paragraph.
            return

        # Look only at the run's own properties. A descendant search could
        # return an rPr that belongs to nested content of the run instead.
        run_props = run.find(props_tag)

        # Partition the children first, without touching the tree, so that a
        # run that needs no splitting is left exactly as it was.
        groups = []
        pending = []
        for el in list(run):
            if el.tag == props_tag:
                continue
            if el.tag in break_tags:
                if pending:
                    groups.append(pending)
                    pending = []
                groups.append([el])
            else:
                pending.append(el)
        if pending:
            groups.append(pending)

        if len(groups) <= 1:
            return

        new_runs = []
        for group in groups:
            new_run = OxmlElement('w:r')
            if run_props is not None:
                new_run.append(copy.deepcopy(run_props))
            # Appending moves the elements out of the original run.
            new_run.extend(group)
            new_runs.append(new_run)

        para.remove(run)
        for offset, new_run in enumerate(new_runs):
            para.insert(index + offset, new_run)

    for para in paragraphs:
        # Skip if already processed
        if id(para) in processed_paras:
@@ -1144,6 +1183,12 @@ def update_lists(docx_input, docx_output):
                counter_compact += 1
                continue
       
        # Pre-process: split runs with breaks
        for child in list(para):
            if child.tag == f"{{{ns['w']}}}r":
                 if any(node.tag in (f"{{{ns['w']}}}br", f"{{{ns['w']}}}cr") for node in child):
                      split_run_with_breaks(para, child)

        # Get all direct child runs (not nested runs)
        runs = para.xpath('./w:r', namespaces=ns)
        if not runs:
@@ -1158,9 +1203,12 @@ def update_lists(docx_input, docx_output):
            if child.tag == f"{{{ns['w']}}}r":
                text_elem = child.find('.//w:t', namespaces=ns)
                
                if is_start_of_line and text_elem is not None and text_elem.text:
                if text_elem is not None and text_elem.text:
                    text_stripped = text_elem.text.lstrip()
                    if text_stripped.startswith('- ') or re.match(r'^\d+\.\s', text_stripped):
                    is_numbered_candidate = bool(re.match(r'^\d+\.\s', text_stripped))
                    is_bullet_candidate = text_stripped.startswith('- ')
                    
                    if is_numbered_candidate or (is_bullet_candidate and is_start_of_line):
                        list_item_runs.append((idx, child, text_elem))
                
                # Update state