Loading generateBaseline/postprocessing.py +30 −7 Original line number Diff line number Diff line Loading @@ -1897,9 +1897,27 @@ def update_references(docx_input, docx_output): new_style = "EX" counter = 0 def _paragraph_text_from_runs(para): # Build paragraph text from runs (including runs inside hyperlinks). parts = [] for run in para.xpath('.//w:r', namespaces=ns): for child in run: if child.tag == f"{{{ns['w']}}}t" and child.text: parts.append(child.text) elif child.tag in (f"{{{ns['w']}}}tab", f"{{{ns['w']}}}br"): parts.append(" ") return ''.join(parts).strip() # Loop over all paragraphs with style "BodyText" or "FirstParagraph" and change to "EX" for para in root.xpath('.//w:p[w:pPr/w:pStyle[@w:val="BodyText" or @w:val="FirstParagraph"]]', namespaces=ns): old_val = para.get(f"{{{ns['w']}}}val") # Keep Editor's Notes untouched here; they are formatted later in # update_body_text_style (including highlighting). para_text = _paragraph_text_from_runs(para) if re.search(r'Editor[\'’]s\s+Note:', para_text, re.IGNORECASE): continue # A reference is a paragraph that contains: # - A pPr with pStyle "BodyText" or "FirstParagraph" (already checked in loop) # - A bookmarkStart and a bookmarkEnd Loading Loading @@ -2001,6 +2019,17 @@ def update_body_text_style(docx_input, docx_output): counter = 0 h6_counter = 0 def _paragraph_text_from_runs(para): # Build paragraph text from runs (including runs inside hyperlinks). parts = [] for run in para.xpath('.//w:r', namespaces=ns): for child in run: if child.tag == f"{{{ns['w']}}}t" and child.text: parts.append(child.text) elif child.tag in (f"{{{ns['w']}}}tab", f"{{{ns['w']}}}br"): parts.append(" ") return ''.join(parts).strip() # Loop over all elements to find "BodyText" and "FirstParagraph" and change to "Normal" # Combine both XPath queries body_text_elems = root.xpath('.//w:pStyle[@w:val="BodyText"]', namespaces=ns) Loading @@ -2008,7 +2037,6 @@ def update_body_text_style(docx_input, docx_output): all_elems = body_text_elems + first_para_elems for elem in all_elems: full_text = "" # elem is w:pStyle, its parent is w:pPr, and w:pPr's parent is w:p pPr = elem.getparent() if pPr is None: Loading @@ -2016,12 +2044,7 @@ def update_body_text_style(docx_input, docx_output): para = pPr.getparent() if para is None: continue runs = para.xpath('./w:r', namespaces=ns) for run in runs: text_elems = run.xpath('.//w:t', namespaces=ns) for text_elem in text_elems: if text_elem.text: full_text += text_elem.text full_text = _paragraph_text_from_runs(para) # Strip whitespace and check if text matches "Essential patents" or "Trademarks" full_text_stripped = full_text.strip() Loading Loading
generateBaseline/postprocessing.py +30 −7 Original line number Diff line number Diff line Loading @@ -1897,9 +1897,27 @@ def update_references(docx_input, docx_output): new_style = "EX" counter = 0 def _paragraph_text_from_runs(para): # Build paragraph text from runs (including runs inside hyperlinks). parts = [] for run in para.xpath('.//w:r', namespaces=ns): for child in run: if child.tag == f"{{{ns['w']}}}t" and child.text: parts.append(child.text) elif child.tag in (f"{{{ns['w']}}}tab", f"{{{ns['w']}}}br"): parts.append(" ") return ''.join(parts).strip() # Loop over all paragraphs with style "BodyText" or "FirstParagraph" and change to "EX" for para in root.xpath('.//w:p[w:pPr/w:pStyle[@w:val="BodyText" or @w:val="FirstParagraph"]]', namespaces=ns): old_val = para.get(f"{{{ns['w']}}}val") # Keep Editor's Notes untouched here; they are formatted later in # update_body_text_style (including highlighting). para_text = _paragraph_text_from_runs(para) if re.search(r'Editor[\'’]s\s+Note:', para_text, re.IGNORECASE): continue # A reference is a paragraph that contains: # - A pPr with pStyle "BodyText" or "FirstParagraph" (already checked in loop) # - A bookmarkStart and a bookmarkEnd Loading Loading @@ -2001,6 +2019,17 @@ def update_body_text_style(docx_input, docx_output): counter = 0 h6_counter = 0 def _paragraph_text_from_runs(para): # Build paragraph text from runs (including runs inside hyperlinks). parts = [] for run in para.xpath('.//w:r', namespaces=ns): for child in run: if child.tag == f"{{{ns['w']}}}t" and child.text: parts.append(child.text) elif child.tag in (f"{{{ns['w']}}}tab", f"{{{ns['w']}}}br"): parts.append(" ") return ''.join(parts).strip() # Loop over all elements to find "BodyText" and "FirstParagraph" and change to "Normal" # Combine both XPath queries body_text_elems = root.xpath('.//w:pStyle[@w:val="BodyText"]', namespaces=ns) Loading @@ -2008,7 +2037,6 @@ def update_body_text_style(docx_input, docx_output): all_elems = body_text_elems + first_para_elems for elem in all_elems: full_text = "" # elem is w:pStyle, its parent is w:pPr, and w:pPr's parent is w:p pPr = elem.getparent() if pPr is None: Loading @@ -2016,12 +2044,7 @@ def update_body_text_style(docx_input, docx_output): para = pPr.getparent() if para is None: continue runs = para.xpath('./w:r', namespaces=ns) for run in runs: text_elems = run.xpath('.//w:t', namespaces=ns) for text_elem in text_elems: if text_elem.text: full_text += text_elem.text full_text = _paragraph_text_from_runs(para) # Strip whitespace and check if text matches "Essential patents" or "Trademarks" full_text_stripped = full_text.strip() Loading