_WORDML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"


def _paragraph_text(elem):
    """Return the stripped text of a WordprocessingML paragraph element.

    Text is assembled from the paragraph's direct ``w:r`` children so that
    headings built by ``update_heading_styles`` (number + ``<w:tab/>`` +
    title in separate run parts) read as ``"number title"``: every ``w:tab``
    or ``w:br`` contributes a single space.

    Args:
        elem: Element to read, or ``None``. It must offer an lxml-style
            ``xpath(expr, namespaces=...)`` method.

    Returns:
        The concatenated text with surrounding whitespace removed, or ``""``
        when ``elem`` is ``None`` or is not a ``w:p`` element.
    """
    ns = {"w": _WORDML_NS}
    w = ns["w"]
    if elem is None or elem.tag != f"{{{w}}}p":
        return ""

    text_tag = f"{{{w}}}t"
    space_tags = (f"{{{w}}}tab", f"{{{w}}}br")
    parts = []
    for run in elem.xpath('./w:r', namespaces=ns):
        for child in run:
            if child.tag == text_tag:
                if child.text:
                    parts.append(child.text)
            elif child.tag in space_tags:
                parts.append(" ")

    # Fallback for paragraphs containing text in non-direct run descendants.
    if not parts:
        parts = elem.xpath('.//w:t/text()', namespaces=ns)
    return ''.join(parts).strip()


def _paragraph_style_val(para, ns):
    """Return the ``w:val`` of a paragraph's ``w:pPr/w:pStyle``.

    Args:
        para: Paragraph element to inspect.
        ns: Namespace mapping; ``ns['w']`` is the WordprocessingML namespace.

    Returns:
        The style value, or ``None`` when the paragraph has no ``w:pPr``, no
        ``w:pStyle``, or the ``w:val`` attribute is absent.
    """
    w = ns['w']
    p_pr = para.find(f'{{{w}}}pPr')
    if p_pr is None:
        return None
    p_style = p_pr.find(f'{{{w}}}pStyle')
    if p_style is None:
        return None
    return p_style.get(f'{{{w}}}val')


def _normalize_ws(text):
    """Collapse whitespace runs (including non-breaking spaces) to one space.

    Leading and trailing whitespace is removed, which makes heading text
    comparable regardless of tabs or repeated spaces.
    """
    return re.sub(r'\s+', ' ', text.replace('\u00A0', ' ')).strip()


def _is_in_abbreviations_clause(para, ns):
    """Tell whether ``para`` sits in the Abbreviations Heading2 clause.

    Preceding siblings of ``para`` are scanned backwards. The first
    ``Heading2`` paragraph found decides the result: True when its text
    contains the word "abbreviations" (case-insensitive).

    A ``Heading1`` paragraph met before any ``Heading2`` ends the scan with
    False: it opens a new top-level clause, so an Abbreviations heading
    located before it no longer governs ``para``.

    Args:
        para: Paragraph element offering lxml-style ``getprevious()``.
        ns: Namespace mapping; ``ns['w']`` is the WordprocessingML namespace.

    Returns:
        True only when the governing Heading2 is an Abbreviations heading;
        False otherwise, including when no heading precedes ``para``.
    """
    w_p = f"{{{ns['w']}}}p"
    sibling = para.getprevious()
    while sibling is not None:
        # Non-paragraph siblings (tables, section properties, ...) are skipped.
        if sibling.tag == w_p:
            style = _paragraph_style_val(sibling, ns)
            if style == 'Heading1':
                return False
            if style == 'Heading2':
                heading_text = _normalize_ws(_paragraph_text(sibling))
                return bool(re.search(r'\babbreviations\b', heading_text, flags=re.IGNORECASE))
        sibling = sibling.getprevious()
    return False
prev_pStyle_val = prev_pStyle[0].get(f"{{{ns['w']}}}val") if prev_pStyle_val == example_style: # EW in the Abbreviations clause is an abbreviation entry, not an example. if not _is_in_abbreviations_clause(para, ns): pStyle_elem.set(f"{{{ns['w']}}}val", "PL") print(f'Changed style "BlockText" to "PL" because it is preceded by a paragraph with style "EW" (line {prev_para.sourceline})') break Loading Loading @@ -2110,7 +2163,6 @@ def update_abbreviations(docx_input, docx_output): # Find all paragraphs with BodyText style paragraphs_to_process = root.xpath('.//w:p[w:pPr/w:pStyle[@w:val="BodyText"]]', namespaces=ns) for para in paragraphs_to_process: # Check if this paragraph contains runs with Verbatim or VerbatimChar style verbatim_runs = para.xpath('./w:r[w:rPr/w:rStyle[@w:val="Verbatim" or @w:val="VerbatimChar"]]', namespaces=ns) Loading @@ -2118,6 +2170,9 @@ def update_abbreviations(docx_input, docx_output): if not verbatim_runs: continue if not _is_in_abbreviations_clause(para, ns): continue # Get the parent element to insert new paragraphs parent = para.getparent() if parent is None: Loading @@ -2128,20 +2183,30 @@ def update_abbreviations(docx_input, docx_output): # Process each verbatim run separately new_paragraphs = [] option2_definition = False for verbatim_run in verbatim_runs: # Extract text from this specific verbatim run full_text = "" full_text_in_verbatim_run = "" text_elems = verbatim_run.xpath('.//w:t', namespaces=ns) for text_elem in text_elems: if text_elem.text: full_text += text_elem.text full_text_in_verbatim_run += text_elem.text # Two options for abbreviations: # 1. `ABBR Abbreviation` # - abbreviation followed by 2+ spaces followed by definition (usually all in a verbatim run) # 2. 
`ABBR` Abbreviation # - abbreviation (in a verbatim run) followed by definition (in the remaining runs of the paragraph) separated by one or more spaces # Check if text contains multiple spaces (2 or more) separating two parts # Pattern: abbreviation followed by 2+ spaces followed by definition match = re.match(r'^(.+?)\s{2,}(.+)$', full_text.strip()) match = re.match(r'^(.+?)\s{2,}(.+)$', full_text_in_verbatim_run.strip()) if not match: continue # Option 2: abbreviation (in a verbatim run) followed by definition (in the remaining runs of the paragraph) separated by one or more spaces # Take all remaining runs of the paragraph as they are, except the verbatim run remaining_runs = [run for run in para.xpath('./w:r', namespaces=ns) if run != verbatim_run] abbreviation = full_text_in_verbatim_run.strip() option2_definition = True #definition = ''.join([run.xpath('.//w:t/text()', namespaces=ns)[0].text for run in remaining_runs]).strip() else: abbreviation = match.group(1).strip() definition = match.group(2).strip() Loading @@ -2168,13 +2233,17 @@ def update_abbreviations(docx_input, docx_output): run_tab.append(tab) new_para.append(run_tab) if not option2_definition: # Create second run with definition run2 = OxmlElement('w:r') t2 = OxmlElement('w:t') t2.text = definition run2.append(t2) new_para.append(run2) else: # Create second run with definition, append all remaining runs of the paragraph for run in remaining_runs: new_para.append(run) new_paragraphs.append(new_para) counter += 1 Loading Loading @@ -2779,23 +2848,6 @@ def add_break_after_code_blocks_and_tables(docx_input, docx_output): return None return style_elems[0].get(w_val) def _paragraph_text(elem): if elem is None or elem.tag != w_p: return "" # Build text from runs to respect how headings are created in update_heading_styles: # number + <w:tab/> + title in separate run parts. 
parts = [] for run in elem.xpath('./w:r', namespaces=ns): for child in run: if child.tag == f"{{{ns['w']}}}t" and child.text: parts.append(child.text) elif child.tag in (f"{{{ns['w']}}}tab", f"{{{ns['w']}}}br"): parts.append(" ") # Fallback for paragraphs containing text in non-direct run descendants. if not parts: parts = elem.xpath('.//w:t/text()', namespaces=ns) return ''.join(parts).strip() def _normalize_ws(text): # Normalize tabs/non-breaking spaces/multiple spaces for robust heading match. return re.sub(r'\s+', ' ', text.replace('\u00A0', ' ')).strip() Loading Loading
_WORDML_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"


def _paragraph_text(elem):
    """Return the stripped text of a WordprocessingML paragraph element.

    Text is assembled from the paragraph's direct ``w:r`` children so that
    headings built by ``update_heading_styles`` (number + ``<w:tab/>`` +
    title in separate run parts) read as ``"number title"``: every ``w:tab``
    or ``w:br`` contributes a single space.

    Args:
        elem: Element to read, or ``None``. It must offer an lxml-style
            ``xpath(expr, namespaces=...)`` method.

    Returns:
        The concatenated text with surrounding whitespace removed, or ``""``
        when ``elem`` is ``None`` or is not a ``w:p`` element.
    """
    ns = {"w": _WORDML_NS}
    w = ns["w"]
    if elem is None or elem.tag != f"{{{w}}}p":
        return ""

    text_tag = f"{{{w}}}t"
    space_tags = (f"{{{w}}}tab", f"{{{w}}}br")
    parts = []
    for run in elem.xpath('./w:r', namespaces=ns):
        for child in run:
            if child.tag == text_tag:
                if child.text:
                    parts.append(child.text)
            elif child.tag in space_tags:
                parts.append(" ")

    # Fallback for paragraphs containing text in non-direct run descendants.
    if not parts:
        parts = elem.xpath('.//w:t/text()', namespaces=ns)
    return ''.join(parts).strip()


def _paragraph_style_val(para, ns):
    """Return the ``w:val`` of a paragraph's ``w:pPr/w:pStyle``.

    Args:
        para: Paragraph element to inspect.
        ns: Namespace mapping; ``ns['w']`` is the WordprocessingML namespace.

    Returns:
        The style value, or ``None`` when the paragraph has no ``w:pPr``, no
        ``w:pStyle``, or the ``w:val`` attribute is absent.
    """
    w = ns['w']
    p_pr = para.find(f'{{{w}}}pPr')
    if p_pr is None:
        return None
    p_style = p_pr.find(f'{{{w}}}pStyle')
    if p_style is None:
        return None
    return p_style.get(f'{{{w}}}val')


def _normalize_ws(text):
    """Collapse whitespace runs (including non-breaking spaces) to one space.

    Leading and trailing whitespace is removed, which makes heading text
    comparable regardless of tabs or repeated spaces.
    """
    return re.sub(r'\s+', ' ', text.replace('\u00A0', ' ')).strip()


def _is_in_abbreviations_clause(para, ns):
    """Tell whether ``para`` sits in the Abbreviations Heading2 clause.

    Preceding siblings of ``para`` are scanned backwards. The first
    ``Heading2`` paragraph found decides the result: True when its text
    contains the word "abbreviations" (case-insensitive).

    A ``Heading1`` paragraph met before any ``Heading2`` ends the scan with
    False: it opens a new top-level clause, so an Abbreviations heading
    located before it no longer governs ``para``.

    Args:
        para: Paragraph element offering lxml-style ``getprevious()``.
        ns: Namespace mapping; ``ns['w']`` is the WordprocessingML namespace.

    Returns:
        True only when the governing Heading2 is an Abbreviations heading;
        False otherwise, including when no heading precedes ``para``.
    """
    w_p = f"{{{ns['w']}}}p"
    sibling = para.getprevious()
    while sibling is not None:
        # Non-paragraph siblings (tables, section properties, ...) are skipped.
        if sibling.tag == w_p:
            style = _paragraph_style_val(sibling, ns)
            if style == 'Heading1':
                return False
            if style == 'Heading2':
                heading_text = _normalize_ws(_paragraph_text(sibling))
                return bool(re.search(r'\babbreviations\b', heading_text, flags=re.IGNORECASE))
        sibling = sibling.getprevious()
    return False
prev_pStyle[0].get(f"{{{ns['w']}}}val") if prev_pStyle_val == example_style: # EW in the Abbreviations clause is an abbreviation entry, not an example. if not _is_in_abbreviations_clause(para, ns): pStyle_elem.set(f"{{{ns['w']}}}val", "PL") print(f'Changed style "BlockText" to "PL" because it is preceded by a paragraph with style "EW" (line {prev_para.sourceline})') break Loading Loading @@ -2110,7 +2163,6 @@ def update_abbreviations(docx_input, docx_output): # Find all paragraphs with BodyText style paragraphs_to_process = root.xpath('.//w:p[w:pPr/w:pStyle[@w:val="BodyText"]]', namespaces=ns) for para in paragraphs_to_process: # Check if this paragraph contains runs with Verbatim or VerbatimChar style verbatim_runs = para.xpath('./w:r[w:rPr/w:rStyle[@w:val="Verbatim" or @w:val="VerbatimChar"]]', namespaces=ns) Loading @@ -2118,6 +2170,9 @@ def update_abbreviations(docx_input, docx_output): if not verbatim_runs: continue if not _is_in_abbreviations_clause(para, ns): continue # Get the parent element to insert new paragraphs parent = para.getparent() if parent is None: Loading @@ -2128,20 +2183,30 @@ def update_abbreviations(docx_input, docx_output): # Process each verbatim run separately new_paragraphs = [] option2_definition = False for verbatim_run in verbatim_runs: # Extract text from this specific verbatim run full_text = "" full_text_in_verbatim_run = "" text_elems = verbatim_run.xpath('.//w:t', namespaces=ns) for text_elem in text_elems: if text_elem.text: full_text += text_elem.text full_text_in_verbatim_run += text_elem.text # Two options for abbreviations: # 1. `ABBR Abbreviation` # - abbreviation followed by 2+ spaces followed by definition (usually all in a verbatim run) # 2. 
`ABBR` Abbreviation # - abbreviation (in a verbatim run) followed by definition (in the remaining runs of the paragraph) separated by one or more spaces # Check if text contains multiple spaces (2 or more) separating two parts # Pattern: abbreviation followed by 2+ spaces followed by definition match = re.match(r'^(.+?)\s{2,}(.+)$', full_text.strip()) match = re.match(r'^(.+?)\s{2,}(.+)$', full_text_in_verbatim_run.strip()) if not match: continue # Option 2: abbreviation (in a verbatim run) followed by definition (in the remaining runs of the paragraph) separated by one or more spaces # Take all remaining runs of the paragraph as they are, except the verbatim run remaining_runs = [run for run in para.xpath('./w:r', namespaces=ns) if run != verbatim_run] abbreviation = full_text_in_verbatim_run.strip() option2_definition = True #definition = ''.join([run.xpath('.//w:t/text()', namespaces=ns)[0].text for run in remaining_runs]).strip() else: abbreviation = match.group(1).strip() definition = match.group(2).strip() Loading @@ -2168,13 +2233,17 @@ def update_abbreviations(docx_input, docx_output): run_tab.append(tab) new_para.append(run_tab) if not option2_definition: # Create second run with definition run2 = OxmlElement('w:r') t2 = OxmlElement('w:t') t2.text = definition run2.append(t2) new_para.append(run2) else: # Create second run with definition, append all remaining runs of the paragraph for run in remaining_runs: new_para.append(run) new_paragraphs.append(new_para) counter += 1 Loading Loading @@ -2779,23 +2848,6 @@ def add_break_after_code_blocks_and_tables(docx_input, docx_output): return None return style_elems[0].get(w_val) def _paragraph_text(elem): if elem is None or elem.tag != w_p: return "" # Build text from runs to respect how headings are created in update_heading_styles: # number + <w:tab/> + title in separate run parts. 
parts = [] for run in elem.xpath('./w:r', namespaces=ns): for child in run: if child.tag == f"{{{ns['w']}}}t" and child.text: parts.append(child.text) elif child.tag in (f"{{{ns['w']}}}tab", f"{{{ns['w']}}}br"): parts.append(" ") # Fallback for paragraphs containing text in non-direct run descendants. if not parts: parts = elem.xpath('.//w:t/text()', namespaces=ns) return ''.join(parts).strip() def _normalize_ws(text): # Normalize tabs/non-breaking spaces/multiple spaces for robust heading match. return re.sub(r'\s+', ' ', text.replace('\u00A0', ' ')).strip() Loading