Commit a3abe79b authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

Add several format style updates:

- figure captions
- figure layout
- table captions
- heading format
- unnumbered lists
- abbreviations
parent 45c2312a
Loading
Loading
Loading
Loading
Loading
+619 −1
Original line number Original line Diff line number Diff line
@@ -284,6 +284,8 @@ def update_toc_level(xml_data, ns = {"w": "http://schemas.openxmlformats.org/wor
    # Regex for \o "x-y" with x and y being numbers
    # Regex for \o "x-y" with x and y being numbers
    pattern = re.compile(r'(?<=\\o )"\d+-\d+"\s*')
    pattern = re.compile(r'(?<=\\o )"\d+-\d+"\s*')


    # Track TOC paragraphs to insert page break after
    toc_paragraphs = []


    # Loop over all elements to find "TOC"
    # Loop over all elements to find "TOC"
    for elem in root.xpath('.//w:instrText', namespaces=ns):
    for elem in root.xpath('.//w:instrText', namespaces=ns):
@@ -293,6 +295,35 @@ def update_toc_level(xml_data, ns = {"w": "http://schemas.openxmlformats.org/wor


            print(f'Changed TOC: {old_text}{elem.text}')
            print(f'Changed TOC: {old_text}{elem.text}')
            
            
            # Find the paragraph containing this TOC field
            toc_para = elem
            while toc_para is not None and toc_para.tag != f"{{{ns['w']}}}p":
                toc_para = toc_para.getparent()
            
            if toc_para is not None and toc_para not in toc_paragraphs:
                toc_paragraphs.append(toc_para)
    
    # Insert page break after each TOC paragraph
    for toc_para in toc_paragraphs:
        parent = toc_para.getparent()
        if parent is None:
            continue
        
        # Find the position of the TOC paragraph
        para_index = list(parent).index(toc_para)
        
        # Create a new paragraph with a page break
        page_break_para = OxmlElement('w:p')
        page_break_run = OxmlElement('w:r')
        page_break = OxmlElement('w:br')
        page_break.set(f"{{{ns['w']}}}type", "page")
        page_break_run.append(page_break)
        page_break_para.append(page_break_run)
        
        # Insert the page break paragraph after the TOC paragraph
        parent.insert(para_index + 1, page_break_para)
        print(f'Inserted page break after TOC')

    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")
    return etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")


def update_toc(docx_input, docx_output):
def update_toc(docx_input, docx_output):
@@ -476,3 +507,590 @@ def table_widths_adjustment(config):
                cell = row.cells[i]
                cell = row.cells[i]
                cell.width = width
                cell.width = width
    doc.save(docx_path)
    doc.save(docx_path)



def update_figure_captions(docx_input, docx_output):
    """
    Rewrite every 'ImageCaption' paragraph style reference to 'TF'.

    The main document part (``word/document.xml``) is parsed, each matching
    ``w:pStyle`` element gets its ``w:val`` replaced, and a new DOCX archive
    is assembled in a temporary file before being moved to ``docx_output``.

    Parameters
    ----------
    docx_input : str
        Path to the input DOCX file.
    docx_output : str
        Path to the output DOCX file.
    """
    w_namespace = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
    val_attr = f"{{{w_namespace}}}val"
    document_part = "word/document.xml"

    # Parse the main document part
    with zipfile.ZipFile(docx_input, 'r') as source_zip:
        tree = etree.fromstring(source_zip.read(document_part))

    # Re-point every matching paragraph style at the new style name
    caption_styles = tree.xpath(
        './/w:pStyle[@w:val="ImageCaption"]', namespaces={"w": w_namespace}
    )
    for style_node in caption_styles:
        style_node.set(val_attr, "TF")

    print(f'Changed style "ImageCaption" to "TF" {len(caption_styles)} times')

    serialized = etree.tostring(tree, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # The temp file is only ever opened through zipfile, so release the
    # descriptor returned by mkstemp straight away.
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)

    try:
        with zipfile.ZipFile(docx_input, 'r') as source_zip, \
                zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as target_zip:
            # Copy every part except the document body, then add the new body
            for entry in source_zip.infolist():
                if entry.filename == document_part:
                    continue
                target_zip.writestr(entry.filename, source_zip.read(entry.filename))
            target_zip.writestr(document_part, serialized)

        shutil.move(tmp_path, docx_output)
        # rw for owner, read-only for group and others
        os.chmod(docx_output, 0o644)
    finally:
        # Only still present when something failed before the move
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def update_figure_style(docx_input, docx_output):
    """
    Updates figure paragraph styles from 'CaptionedFigure' to 'FL' in a DOCX file.

    Only ``word/document.xml`` is modified; all other archive members are
    copied into the output archive. The result is assembled in a temporary
    file and then moved to ``docx_output``. A summary line with the number
    of replacements is printed.

    Parameters
    ----------
    docx_input : str
        Path to the input DOCX file.
    docx_output : str
        Path to the output DOCX file.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    old_style = "CaptionedFigure"
    new_style = "FL"

    # Read XML
    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read("word/document.xml")

    root = etree.fromstring(xml_data)
    counter = 0

    # Loop over all paragraph style references to "CaptionedFigure" and change them to "FL"
    for elem in root.xpath(f'.//w:pStyle[@w:val="{old_style}"]', namespaces=ns):
        elem.set(f"{{{ns['w']}}}val", new_style)
        counter += 1

    print(f'Changed style "{old_style}" to "{new_style}" {counter} times')

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # Create temp file
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)  # the file is only opened through zipfile

    try:
        # Write new docx to temp file
        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    data = zin.read(item.filename)
                    zout.writestr(item.filename, data)
            zout.writestr("word/document.xml", xml_data)

        # Write to output file
        shutil.move(tmp_path, docx_output)
        # Set proper permissions (read/write for owner, read for group and others)
        os.chmod(docx_output, 0o644)

    finally:
        # Delete temp file if still existing (i.e. the move did not happen)
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def update_heading_styles(docx_input, docx_output):
    """
    Updates heading runs to split number from text with a tab.
    Transforms: "6 Architecture model..." to "6" + tab + "Architecture model..."

    A paragraph is treated as a heading when its ``w:pStyle`` value starts
    with "Heading". When the concatenated text of all its runs starts with a
    clause number ("6", "6.1", "6.1.2", ...) followed by whitespace, every
    run in the paragraph is replaced by one new run holding the number, a
    ``w:tab`` and the remaining text. Run properties and non-text run content
    of the replaced runs are not carried over. The new run is placed where
    the first original run was, so surrounding siblings (for example
    bookmark markers) keep their position relative to the text.

    Parameters
    ----------
    docx_input : str
        Path to the input DOCX file.
    docx_output : str
        Path to the output DOCX file.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    # Clause number, whitespace, then the heading title
    heading_pattern = re.compile(r'^(\d+(?:\.\d+)*)\s+(.+)$')

    # Read XML
    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read("word/document.xml")

    root = etree.fromstring(xml_data)
    counter = 0

    # Find all paragraphs with heading styles (Heading1, Heading2, etc.)
    for para in root.xpath('.//w:p[w:pPr/w:pStyle[starts-with(@w:val, "Heading")]]', namespaces=ns):
        # Get all runs in this paragraph (including runs nested in wrappers
        # such as hyperlinks)
        runs = para.xpath('.//w:r', namespaces=ns)
        if not runs:
            continue

        # Collect the text of every w:t of every run; reading only the first
        # w:t per run would lose text once the runs are removed below
        full_text = "".join(
            text_elem.text
            for run in runs
            for text_elem in run.xpath('./w:t', namespaces=ns)
            if text_elem.text
        )

        # Check if text starts with a clause number followed by space
        match = heading_pattern.match(full_text)
        if not match:
            continue
        number, rest_text = match.groups()

        # Locate the direct child of the paragraph that holds the first run,
        # so the replacement run can take its position
        anchor = runs[0]
        while anchor.getparent() is not para:
            anchor = anchor.getparent()
        insert_at = para.index(anchor)

        # Clear all existing runs. Each run is removed from its own parent,
        # because a nested run is not a direct child of the paragraph and
        # para.remove() would raise ValueError for it.
        for run in runs:
            run_parent = run.getparent()
            if run_parent is not None:
                run_parent.remove(run)

        # Create new run with number + tab + rest
        new_run = OxmlElement('w:r')

        # Number text
        num_t = OxmlElement('w:t')
        num_t.text = number
        new_run.append(num_t)

        # Tab
        new_run.append(OxmlElement('w:tab'))

        # Rest of text (xml:space="preserve" keeps inner/trailing whitespace)
        rest_t = OxmlElement('w:t')
        rest_t.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
        rest_t.text = rest_text
        new_run.append(rest_t)

        # Put the new run where the first original run used to be
        para.insert(insert_at, new_run)
        counter += 1

    print(f'Updated {counter} heading runs')

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # Create temp file
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)  # the file is only opened through zipfile

    try:
        # Write new docx to temp file
        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    data = zin.read(item.filename)
                    zout.writestr(item.filename, data)
            zout.writestr("word/document.xml", xml_data)

        # Write to output file
        shutil.move(tmp_path, docx_output)
        # Set proper permissions (read/write for owner, read for group and others)
        os.chmod(docx_output, 0o644)

    finally:
        # Delete temp file if still existing (i.e. the move did not happen)
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def update_unnumbered_lists(docx_input, docx_output):
    """
    Turns runs starting with "- " into separate list paragraphs in a DOCX file.

    Every ``w:p`` in ``word/document.xml`` is scanned for direct-child runs
    whose first ``w:t`` text starts with "- ". For each such run a new
    paragraph is created, the "- " prefix is removed, and the run plus all
    following children up to the next list-item run are moved into it.
    Following sibling elements are merged into the new paragraph until a
    list item, a blank element, or the end of the parent is reached.

    For list items in tables: the new paragraph gets the FP style, keepNext,
    numPr (ilvl 0, numId 14), a left tab stop, spacing and left alignment.
    For list items outside tables: the new paragraph gets the B1 style.

    The original paragraph is removed when nothing but its pPr is left.
    The result is written to a temporary file and moved to ``docx_output``;
    a summary line with both counters is printed.

    Parameters
    ----------
    docx_input : str
        Path to the input DOCX file.
    docx_output : str
        Path to the output DOCX file.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    
    # Read XML
    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read("word/document.xml")

    root = etree.fromstring(xml_data)
    counter_table = 0
    counter_regular = 0
    
    # Track processed paragraphs (by object id) to avoid reprocessing
    # paragraphs that were merged into a list item or removed
    processed_paras = set()

    # Find all paragraphs - need to collect them first since we'll be modifying the tree
    paragraphs = root.xpath('.//w:p', namespaces=ns)
    
    def is_list_item_para(para):
        """Check if any direct-child run's first w:t text starts with '- '"""
        runs = para.xpath('./w:r', namespaces=ns)
        for run in runs:
            text_elem = run.find('.//w:t', namespaces=ns)
            if text_elem is not None and text_elem.text and text_elem.text.startswith('- '):
                return True
        return False
    
    def is_blank_para(para):
        """Check if element is blank (no direct-child runs, or only whitespace text)"""
        runs = para.xpath('./w:r', namespaces=ns)
        if not runs:
            return True
        all_text = ''
        for run in runs:
            text_elems = run.xpath('.//w:t', namespaces=ns)
            for text_elem in text_elems:
                if text_elem.text:
                    all_text += text_elem.text
        return not all_text.strip()
    
    # NOTE(review): get_para_text is not called anywhere in this function
    def get_para_text(para):
        """Get all text from the direct-child runs of a paragraph"""
        runs = para.xpath('./w:r', namespaces=ns)
        text = ''
        for run in runs:
            text_elems = run.xpath('.//w:t', namespaces=ns)
            for text_elem in text_elems:
                if text_elem.text:
                    text += text_elem.text
        return text
    
    for para in paragraphs:
        # Skip if already processed
        if id(para) in processed_paras:
            continue
        
        # Get all direct child runs (not nested runs); only used to skip
        # paragraphs without any run
        runs = para.xpath('./w:r', namespaces=ns)
        if not runs:
            continue
        
        # Find ALL list item runs (runs starting with "- ").
        # all_children is a snapshot of the paragraph's children; the stored
        # indices refer to this snapshot, not to the live (mutating) paragraph.
        list_item_runs = []
        all_children = list(para)
        for idx, child in enumerate(all_children):
            if child.tag == f"{{{ns['w']}}}r":
                text_elem = child.find('.//w:t', namespaces=ns)
                if text_elem is not None and text_elem.text and text_elem.text.startswith('- '):
                    list_item_runs.append((idx, child, text_elem))
        
        # If we found list items, process each one separately
        if list_item_runs:
            # Get the parent element (usually the document body or table cell)
            parent = para.getparent()
            if parent is None:
                continue
            
            # Find the position of this paragraph
            para_index = list(parent).index(para)
            
            # Check if paragraph is inside a table
            is_in_table = bool(para.xpath('ancestor::w:tbl', namespaces=ns))
            
            # If in table and there are children before the first list item, update original para pStyle to FP
            if is_in_table and list_item_runs[0][0] > 0:
                # Get or create pPr for the original paragraph
                orig_pPr = para.find('.//w:pPr', namespaces=ns)
                if orig_pPr is None:
                    orig_pPr = OxmlElement('w:pPr')
                    para.insert(0, orig_pPr)
                else:
                    # Remove existing pStyle if any
                    existing_pStyle = orig_pPr.find('.//w:pStyle', namespaces=ns)
                    if existing_pStyle is not None:
                        orig_pPr.remove(existing_pStyle)
                
                # Add FP style
                pStyle = OxmlElement('w:pStyle')
                pStyle.set(f"{{{ns['w']}}}val", "FP")
                orig_pPr.insert(0, pStyle)  # Insert at beginning
            
            # Process each list item run separately
            insert_offset = 0  # Number of new paragraphs already inserted after para
            for list_idx, (run_idx, list_item_run, list_item_text_elem) in enumerate(list_item_runs):
                # Remove the "- " prefix
                list_item_text_elem.text = list_item_text_elem.text[2:]
                
                # Create a new paragraph for this list item
                new_para = OxmlElement('w:p')
                
                # Create pPr
                pPr = OxmlElement('w:pPr')
                
                if is_in_table:
                    # Bulleted list structure for table list items
                    # pStyle
                    pStyle = OxmlElement('w:pStyle')
                    pStyle.set(f"{{{ns['w']}}}val", "FP")
                    pPr.append(pStyle)
                    
                    # keepNext
                    keepNext = OxmlElement('w:keepNext')
                    pPr.append(keepNext)
                    
                    # numPr (for bulleted list)
                    # NOTE(review): numId 14 is hard-coded; presumably it refers to a
                    # bullet definition in the target document's numbering part - confirm
                    numPr = OxmlElement('w:numPr')
                    ilvl = OxmlElement('w:ilvl')
                    ilvl.set(f"{{{ns['w']}}}val", "0")
                    numId = OxmlElement('w:numId')
                    numId.set(f"{{{ns['w']}}}val", "14")
                    numPr.append(ilvl)
                    numPr.append(numId)
                    pPr.append(numPr)
                    
                    # tabs (single left tab stop at position 3118)
                    tabs = OxmlElement('w:tabs')
                    tab = OxmlElement('w:tab')
                    tab.set(f"{{{ns['w']}}}val", "left")
                    tab.set(f"{{{ns['w']}}}pos", "3118")
                    tabs.append(tab)
                    pPr.append(tabs)
                    
                    # spacing
                    spacing = OxmlElement('w:spacing')
                    spacing.set(f"{{{ns['w']}}}before", "80")
                    spacing.set(f"{{{ns['w']}}}after", "80")
                    pPr.append(spacing)
                    
                    # Left alignment
                    jc = OxmlElement('w:jc')
                    jc.set(f"{{{ns['w']}}}val", "left")
                    pPr.append(jc)
                    
                    counter_table += 1
                else:
                    # Simple structure for regular list items (outside tables)
                    pStyle = OxmlElement('w:pStyle')
                    pStyle.set(f"{{{ns['w']}}}val", "B1")
                    pPr.append(pStyle)
                    
                    counter_regular += 1
                
                new_para.append(pPr)
                
                # Find children that belong to this list item
                # From this list item run until the next list item run (or end of paragraph)
                start_idx = run_idx
                end_idx = list_item_runs[list_idx + 1][0] if list_idx + 1 < len(list_item_runs) else len(all_children)
                
                # Collect the children for this list item (everything except pPr)
                runs_to_move = []
                for idx in range(start_idx, end_idx):
                    child = all_children[idx]
                    if child.tag != f"{{{ns['w']}}}pPr":
                        runs_to_move.append(child)
                
                # Remove from original and add to new paragraph
                for run in runs_to_move:
                    if run in para:
                        para.remove(run)
                    new_para.append(run)
                
                # Now look at subsequent elements in the same parent and merge them
                # until we hit another list item, blank line, or end of parent.
                # The scan starts at the slot where new_para will be inserted below,
                # i.e. right after para and any previously inserted list paragraphs.
                current_para_pos = para_index + insert_offset + 1
                next_index = current_para_pos
                while next_index < len(parent):
                    # May be any child of parent, not only a w:p; elements without
                    # direct-child runs are reported as blank and end the merge
                    next_para = parent[next_index]
                    
                    # Stop if we hit another list item
                    if is_list_item_para(next_para):
                        break
                    
                    # Stop if we hit a blank line
                    if is_blank_para(next_para):
                        break
                    
                    # Stop if paragraph is in a different table cell (different parent)
                    # NOTE(review): next_para was just read from parent, so this
                    # comparison is not expected to be true - confirm and drop
                    if next_para.getparent() != parent:
                        break
                    
                    # Merge this paragraph's children (except pPr) into the list item paragraph
                    for run in list(next_para):
                        if run.tag != f"{{{ns['w']}}}pPr":
                            next_para.remove(run)
                            new_para.append(run)
                    
                    # Mark as processed and remove the merged paragraph
                    processed_paras.add(id(next_para))
                    parent.remove(next_para)
                    # Don't increment next_index since we removed an element
                
                # Insert the new paragraph
                current_para_pos = para_index + insert_offset + 1
                parent.insert(current_para_pos, new_para)
                insert_offset += 1
            
            # Only remove the original paragraph if it has no content left (only pPr or empty)
            remaining_runs = [c for c in para if c.tag != f"{{{ns['w']}}}pPr"]
            if not remaining_runs:
                processed_paras.add(id(para))
                parent.remove(para)

    print(f'Updated {counter_table} unnumbered list items in tables, {counter_regular} outside tables')

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")
    
    # Create temp file; it is only opened through zipfile, so close the
    # descriptor returned by mkstemp right away
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)

    try:
        # Write new docx to temp file: copy every part except the document
        # body, then add the modified body
        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    data = zin.read(item.filename)
                    zout.writestr(item.filename, data)
            zout.writestr("word/document.xml", xml_data)

        # Write to output file
        shutil.move(tmp_path, docx_output)
        # Set proper permissions (read/write for owner, read for group and others)
        os.chmod(docx_output, 0o644)

    finally:
        # Delete temp file if still existing
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def update_table_captions(docx_input, docx_output):
    """
    Updates table caption styles from 'TableCaption' to 'TH' in a DOCX file.

    Only ``word/document.xml`` is modified; all other archive members are
    copied into the output archive. The result is assembled in a temporary
    file and then moved to ``docx_output``. A summary line with the number
    of replacements is printed.

    Parameters
    ----------
    docx_input : str
        Path to the input DOCX file.
    docx_output : str
        Path to the output DOCX file.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    old_style = "TableCaption"
    new_style = "TH"

    # Read XML
    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read("word/document.xml")

    root = etree.fromstring(xml_data)
    counter = 0

    # Loop over all paragraph style references to "TableCaption" and change them to "TH"
    for elem in root.xpath(f'.//w:pStyle[@w:val="{old_style}"]', namespaces=ns):
        elem.set(f"{{{ns['w']}}}val", new_style)
        counter += 1

    print(f'Changed style "{old_style}" to "{new_style}" {counter} times')

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # Create temp file
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)  # the file is only opened through zipfile

    try:
        # Write new docx to temp file
        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    data = zin.read(item.filename)
                    zout.writestr(item.filename, data)
            zout.writestr("word/document.xml", xml_data)

        # Write to output file
        shutil.move(tmp_path, docx_output)
        # Set proper permissions (read/write for owner, read for group and others)
        os.chmod(docx_output, 0o644)

    finally:
        # Delete temp file if still existing (i.e. the move did not happen)
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def update_abbreviations(docx_input, docx_output):
    """
    Updates the run (character) style 'VerbatimChar' to 'EW' in a DOCX file.

    Unlike the caption helpers, this rewrites ``w:rStyle`` references, not
    paragraph styles. Only ``word/document.xml`` is modified; all other
    archive members are copied into the output archive. The result is
    assembled in a temporary file and then moved to ``docx_output``. A
    summary line with the number of replacements is printed.

    NOTE(review): every 'VerbatimChar' run is restyled, presumably because
    abbreviations are the only content using that style in the source
    documents - confirm.

    Parameters
    ----------
    docx_input : str
        Path to the input DOCX file.
    docx_output : str
        Path to the output DOCX file.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    old_style = "VerbatimChar"
    new_style = "EW"

    # Read XML
    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read("word/document.xml")

    root = etree.fromstring(xml_data)
    counter = 0

    # Loop over all run style references to "VerbatimChar" and change them to "EW"
    for elem in root.xpath(f'.//w:rStyle[@w:val="{old_style}"]', namespaces=ns):
        elem.set(f"{{{ns['w']}}}val", new_style)
        counter += 1

    print(f'Changed style "{old_style}" to "{new_style}" {counter} times')

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # Create temp file
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)  # the file is only opened through zipfile

    try:
        # Write new docx to temp file
        with zipfile.ZipFile(docx_input, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    data = zin.read(item.filename)
                    zout.writestr(item.filename, data)
            zout.writestr("word/document.xml", xml_data)

        # Write to output file
        shutil.move(tmp_path, docx_output)
        # Set proper permissions (read/write for owner, read for group and others)
        os.chmod(docx_output, 0o644)

    finally:
        # Delete temp file if still existing (i.e. the move did not happen)
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def update_format_styles_cli():
    """
    Command-line entry point that applies all format style updates to a DOCX file.

    Expects two positional arguments: the input DOCX path and the output
    DOCX path. The updates are applied in a fixed order. The first step reads
    the input file; every later step reads the output of the previous step,
    so all changes accumulate in the output file. (Feeding the original
    input to every step would make each step overwrite the previous result,
    leaving only the last step's change when the two paths differ.)
    """
    parser = argparse.ArgumentParser(description="Update format styles in a DOCX file.")
    parser.add_argument("docx_input", help="Path to input DOCX file")
    parser.add_argument("docx_output", help="Path to output DOCX file")
    args = parser.parse_args()

    steps = (
        update_figure_captions,
        update_heading_styles,
        update_figure_style,
        update_unnumbered_lists,
        update_table_captions,
        update_abbreviations,
    )

    current_input = args.docx_input
    for step in steps:
        step(current_input, args.docx_output)
        # From now on, build on what has already been written
        current_input = args.docx_output
 No newline at end of file
+1 −0
Original line number Original line Diff line number Diff line
@@ -21,6 +21,7 @@ setup(
        						"check_multipage_tables=postprocessing:insert_page_break_before_long_tables",
        						"check_multipage_tables=postprocessing:insert_page_break_before_long_tables",
        						#"apply_etsi_styling: postprocessing:postprocess_etsi_styles",
        						#"apply_etsi_styling: postprocessing:postprocess_etsi_styles",
        						"update_toc=postprocessing:update_toc_cli",
        						"update_toc=postprocessing:update_toc_cli",
								"update_format_styles=postprocessing:update_format_styles_cli",
								"refresh_docx_fields=postprocessing:refresh_docx_fields_cli",
								"refresh_docx_fields=postprocessing:refresh_docx_fields_cli",
			]
			]
            }
            }