Commit 0db8bef5 authored by Miguel Angel Reina Ortega
Browse files

Adding support for the alternative approach for abbreviations

parent 7e08c6c2
Loading
Loading
Loading
Loading
Loading
+88 −36
Original line number Diff line number Diff line
@@ -664,6 +664,57 @@ def table_widths_adjustment(config):
                cell.width = width
    doc.save(docx_path)

def _paragraph_text(elem):
    """
    Return the visible text of a WordprocessingML paragraph element.

    The text is assembled run by run so that headings built as
    "number + <w:tab/> + title" (see update_heading_styles) come out with a
    space between the number and the title.

    Returns an empty string when ``elem`` is None or is not a ``w:p`` element.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    w_uri = ns['w']
    if elem is None or elem.tag != f"{{{w_uri}}}p":
        return ""

    text_tag = f"{{{w_uri}}}t"
    spacer_tags = (f"{{{w_uri}}}tab", f"{{{w_uri}}}br")

    # Walk only the direct runs of the paragraph; tabs and breaks count as a
    # single space so adjacent text pieces do not get glued together.
    pieces = []
    for run in elem.xpath('./w:r', namespaces=ns):
        for node in run:
            node_tag = node.tag
            if node_tag == text_tag:
                if node.text:
                    pieces.append(node.text)
            elif node_tag in spacer_tags:
                pieces.append(" ")

    # Nothing found in direct runs: fall back to every w:t descendant
    # (text nested deeper than ./w:r).
    if not pieces:
        pieces = elem.xpath('.//w:t/text()', namespaces=ns)

    joined = ''.join(pieces)
    return joined.strip()


def _paragraph_style_val(para, ns):
    """
    Return the ``w:val`` of the paragraph's ``w:pPr/w:pStyle`` element.

    ``ns`` is a prefix-to-URI mapping that must contain the ``'w'`` key.
    Returns None when the paragraph has no ``w:pPr``, no ``w:pStyle``, or the
    ``w:pStyle`` element carries no ``w:val`` attribute.
    """
    w_uri = ns['w']
    properties = para.find(f'{{{w_uri}}}pPr')
    style_elem = properties.find(f'{{{w_uri}}}pStyle') if properties is not None else None
    return style_elem.get(f'{{{w_uri}}}val') if style_elem is not None else None


def _normalize_ws(text):
    """
    Return ``text`` with non-breaking spaces turned into plain spaces, every
    run of whitespace collapsed to a single space, and both ends trimmed.
    """
    without_nbsp = text.replace('\u00A0', ' ')
    collapsed = re.sub(r'\s+', ' ', without_nbsp)
    return collapsed.strip()


def _is_in_abbreviations_clause(para, ns):
    """
    Tell whether ``para`` sits under an "Abbreviations" Heading2.

    Preceding siblings of ``para`` are scanned backwards; the first ``w:p``
    whose paragraph style is ``Heading2`` decides the answer: True when its
    whitespace-normalized text contains the word "abbreviations" (any case),
    False otherwise. Returns False when no preceding Heading2 sibling exists.
    """
    paragraph_tag = f"{{{ns['w']}}}p"
    sibling = para.getprevious()
    while sibling is not None:
        # Non-paragraph siblings (tables, section properties, ...) and
        # paragraphs with any other style are simply stepped over.
        if sibling.tag == paragraph_tag and _paragraph_style_val(sibling, ns) == 'Heading2':
            title = _normalize_ws(_paragraph_text(sibling))
            found = re.search(r'\babbreviations\b', title, flags=re.IGNORECASE)
            return found is not None
        sibling = sibling.getprevious()
    return False


def update_figure_captions(docx_input, docx_output):
@@ -2053,6 +2104,8 @@ def update_notes(docx_input, docx_output):

                prev_pStyle_val = prev_pStyle[0].get(f"{{{ns['w']}}}val")
                if prev_pStyle_val == example_style:
                    # EW in the Abbreviations clause is an abbreviation entry, not an example.
                    if not _is_in_abbreviations_clause(para, ns):
                        pStyle_elem.set(f"{{{ns['w']}}}val", "PL")
                        print(f'Changed style "BlockText" to "PL" because it is preceded by a paragraph with style "EW" (line {prev_para.sourceline})')
                break
@@ -2110,7 +2163,6 @@ def update_abbreviations(docx_input, docx_output):

    # Find all paragraphs with BodyText style
    paragraphs_to_process = root.xpath('.//w:p[w:pPr/w:pStyle[@w:val="BodyText"]]', namespaces=ns)
    
    for para in paragraphs_to_process:
        # Check if this paragraph contains runs with Verbatim or VerbatimChar style
        verbatim_runs = para.xpath('./w:r[w:rPr/w:rStyle[@w:val="Verbatim" or @w:val="VerbatimChar"]]', namespaces=ns)
@@ -2118,6 +2170,9 @@ def update_abbreviations(docx_input, docx_output):
        if not verbatim_runs:
            continue

        if not _is_in_abbreviations_clause(para, ns):
            continue

        # Get the parent element to insert new paragraphs
        parent = para.getparent()
        if parent is None:
@@ -2128,20 +2183,30 @@ def update_abbreviations(docx_input, docx_output):
        
        # Process each verbatim run separately
        new_paragraphs = []
        option2_definition = False
        for verbatim_run in verbatim_runs:
            # Extract text from this specific verbatim run
            full_text = ""
            full_text_in_verbatim_run = ""
            text_elems = verbatim_run.xpath('.//w:t', namespaces=ns)
            for text_elem in text_elems:
                if text_elem.text:
                    full_text += text_elem.text
            
                    full_text_in_verbatim_run += text_elem.text
            # Two options for abbreviations:
            # 1. `ABBR        Abbreviation`
            #    - abbreviation followed by 2+ spaces followed by definition (usually all in a verbatim run)
            # 2. `ABBR` Abbreviation
            #    - abbreviation (in a verbatim run) followed by definition (in the remaining runs of the paragraph) separated by one or more spaces
            # Check if text contains multiple spaces (2 or more) separating two parts
            # Pattern: abbreviation followed by 2+ spaces followed by definition
            match = re.match(r'^(.+?)\s{2,}(.+)$', full_text.strip())
            match = re.match(r'^(.+?)\s{2,}(.+)$', full_text_in_verbatim_run.strip())
            if not match:
                continue
            
                # Option 2: abbreviation (in a verbatim run) followed by definition (in the remaining runs of the paragraph) separated by one or more spaces
                # Take all remaining runs of the paragraph as they are, except the verbatim run
                remaining_runs = [run for run in para.xpath('./w:r', namespaces=ns) if run != verbatim_run]
                abbreviation = full_text_in_verbatim_run.strip()
                option2_definition = True
                #definition = ''.join([run.xpath('.//w:t/text()', namespaces=ns)[0].text for run in remaining_runs]).strip()
            else:
                abbreviation = match.group(1).strip()
                definition = match.group(2).strip()
            
@@ -2168,13 +2233,17 @@ def update_abbreviations(docx_input, docx_output):
            run_tab.append(tab)
            new_para.append(run_tab)
            
            if not option2_definition:
                # Create second run with definition
                run2 = OxmlElement('w:r')
                t2 = OxmlElement('w:t')
                t2.text = definition
                run2.append(t2)
                new_para.append(run2)
            
            else:
                # Create second run with definition, append all remaining runs of the paragraph
                for run in remaining_runs:
                    new_para.append(run)
            new_paragraphs.append(new_para)
            counter += 1
        
@@ -2779,23 +2848,6 @@ def add_break_after_code_blocks_and_tables(docx_input, docx_output):
            return None
        return style_elems[0].get(w_val)

    def _paragraph_text(elem):
        """
        Return the stripped text of a paragraph element, or "" when ``elem``
        is None or its tag differs from ``w_p``.

        ``w_p`` and ``ns`` are taken from the enclosing scope, which is not
        visible here; presumably ``w_p`` is the qualified ``w:p`` tag -- TODO confirm.
        """
        if elem is None or elem.tag != w_p:
            return ""
        # Build text from runs to respect how headings are created in update_heading_styles:
        # number + <w:tab/> + title in separate run parts.
        parts = []
        # Only direct child runs are visited; within each run, w:t contributes
        # its text and w:tab / w:br each contribute a single space.
        for run in elem.xpath('./w:r', namespaces=ns):
            for child in run:
                if child.tag == f"{{{ns['w']}}}t" and child.text:
                    parts.append(child.text)
                elif child.tag in (f"{{{ns['w']}}}tab", f"{{{ns['w']}}}br"):
                    parts.append(" ")
        # Fallback for paragraphs containing text in non-direct run descendants.
        if not parts:
            parts = elem.xpath('.//w:t/text()', namespaces=ns)
        return ''.join(parts).strip()

    def _normalize_ws(text):
        """
        Return ``text`` with U+00A0 replaced by a space, whitespace runs
        collapsed to one space, and leading/trailing whitespace removed.
        """
        # Normalize tabs/non-breaking spaces/multiple spaces for robust heading match.
        return re.sub(r'\s+', ' ', text.replace('\u00A0', ' ')).strip()