Commit c9231aaa authored by Miguel Angel Reina Ortega
Browse files

Fixes for lists postprocessing

parent 6ab04a64
Loading
Loading
Loading
Loading
Loading
+54 −17
Original line number | Diff line number | Diff line
@@ -828,11 +828,11 @@ def update_heading_styles(docx_input, docx_output):
            os.remove(tmp_path)


def update_unnumbered_lists(docx_input, docx_output):
def update_lists(docx_input, docx_output):
    """
    Updates unnumbered list items (starting with "- ") in tables to appear as bulleted lists.
    For list items in tables: removes "- " prefix and creates separate paragraphs with FP style and numPr.
    For list items outside tables: removes "- " prefix and adds B1 style.
    Updates list items (starting with "- " or "1. ") in tables to appear as bulleted/numbered lists.
    For list items in tables: removes prefix and creates separate paragraphs with FP style and numPr.
    For list items outside tables: removes prefix and adds B1 (bullet) or BN (numbered) style.
    
    Parameters
    ----------
@@ -978,10 +978,20 @@ def update_unnumbered_lists(docx_input, docx_output):
    
    def is_list_item_para(para):
        """Check if paragraph contains a list item (starts with '- ')"""
        runs = para.xpath('./w:r', namespaces=ns)
        for run in runs:
            text_elem = run.find('.//w:t', namespaces=ns)
            if text_elem is not None and text_elem.text and text_elem.text.startswith('- '):
        full_text = ''
        for node in para.xpath('.//w:t | .//w:br | .//w:cr | .//w:tab', namespaces=ns):
            tag = node.tag
            if tag.endswith('}t'):
                if node.text:
                    full_text += node.text
            elif tag.endswith('}tab'):
                full_text += '\t'
            else: # br or cr
                full_text += '\n'
        
        for line in full_text.split('\n'):
            line = line.lstrip()
            if line.startswith('- ') or re.match(r'^\d+\.\s', line):
                return True
        return False
    
@@ -1139,15 +1149,31 @@ def update_unnumbered_lists(docx_input, docx_output):
        if not runs:
            continue
        
        # Find ALL list item runs (runs starting with "- ")
        # Find ALL list item runs (runs starting with "- " at start of line)
        list_item_runs = []
        all_children = list(para)
        is_start_of_line = True
        
        for idx, child in enumerate(all_children):
            if child.tag == f"{{{ns['w']}}}r":
                text_elem = child.find('.//w:t', namespaces=ns)
                if text_elem is not None and text_elem.text and text_elem.text.startswith('- '):
                
                if is_start_of_line and text_elem is not None and text_elem.text:
                    text_stripped = text_elem.text.lstrip()
                    if text_stripped.startswith('- ') or re.match(r'^\d+\.\s', text_stripped):
                        list_item_runs.append((idx, child, text_elem))
                
                # Update state
                for node in child:
                    tag = node.tag
                    if tag == f"{{{ns['w']}}}br" or tag == f"{{{ns['w']}}}cr":
                        is_start_of_line = True
                    elif tag == f"{{{ns['w']}}}t":
                        if node.text and node.text.strip():
                            is_start_of_line = False
            elif child.tag == f"{{{ns['w']}}}hyperlink":
                is_start_of_line = False
        
        # If we found list items, process each one separately
        if list_item_runs:
            # Get the parent element (usually the document body or table cell)
@@ -1182,8 +1208,16 @@ def update_unnumbered_lists(docx_input, docx_output):
            # Process each list item run separately
            insert_offset = 0  # Track where to insert new paragraphs
            for list_idx, (run_idx, list_item_run, list_item_text_elem) in enumerate(list_item_runs):
                # Remove the "- " prefix
                list_item_text_elem.text = list_item_text_elem.text[2:]
                
                # Determine list type
                text = list_item_text_elem.text.lstrip()
                is_numbered_manual = bool(re.match(r'^\d+\.', text))

                # Remove the prefix
                if is_numbered_manual:
                     list_item_text_elem.text = re.sub(r'^\s*\d+\.\s*', '', list_item_text_elem.text)
                else:
                     list_item_text_elem.text = re.sub(r'^\s*-\s+', '', list_item_text_elem.text)

                # Create a new paragraph for this list item
                new_para = OxmlElement('w:p')
@@ -1235,10 +1269,13 @@ def update_unnumbered_lists(docx_input, docx_output):
                else:
                    # Simple structure for regular list items (outside tables)
                    pStyle = OxmlElement('w:pStyle')
                    if is_numbered_manual:
                        pStyle.set(f"{{{ns['w']}}}val", "BN")
                        counter_numbered += 1
                    else:
                        pStyle.set(f"{{{ns['w']}}}val", "B1")
                    pPr.append(pStyle)
                    
                        counter_regular += 1
                    pPr.append(pStyle)
                
                new_para.append(pPr)
                
@@ -2475,7 +2512,7 @@ def update_format_styles_cli():
    update_table_rows(args.docx_input, args.docx_output)
    update_notes(args.docx_input, args.docx_output)
    update_references(args.docx_input, args.docx_output)
    update_unnumbered_lists(args.docx_input, args.docx_output)
    update_lists(args.docx_input, args.docx_output)
    update_body_text_style(args.docx_input, args.docx_output)
    add_no_break_hyphens(args.docx_input, args.docx_output)
    update_references_style(args.docx_input, args.docx_output)