Loading generateBaseline/postprocessing.py +103 −36 Original line number Diff line number Diff line Loading @@ -913,20 +913,23 @@ def update_unnumbered_lists(docx_input, docx_output): namespaces=ns ) if not abstract_num: return (None, None, None) return (None, None, None, None) # Check the format for this level lvl = abstract_num[0].xpath( numFmt_elem = abstract_num[0].xpath( f'./w:lvl[@w:ilvl="{ilvl}"]/w:numFmt', namespaces=ns ) if not lvl: return (None, None, None) if not numFmt_elem: return (None, None, None, None) num_fmt = lvl[0].get(f"{{{ns['w']}}}val") if num_fmt is None: return (None, None, None) numFmt = numFmt_elem[0].get(f"{{{ns['w']}}}val") lvl_text_elem = abstract_num[0].xpath(f'./w:lvl[@w:ilvl="{ilvl}"]/w:lvlText', namespaces=ns) if not lvl_text_elem: lvl_text = None lvl_text = lvl_text_elem[0].get(f"{{{ns['w']}}}val") # Numbered formats numbered_formats = ['decimal', 'lowerLetter', 'upperLetter', 'lowerRoman', 'upperRoman', 'arabic', 'ordinal', Loading @@ -951,17 +954,17 @@ def update_unnumbered_lists(docx_input, docx_output): unnumbered_formats = ['bullet', 'circle', 'square', 'dash', 'diamond', 'check', 'arrow', 'arrowhead', 'rtArrow', 'hyphen'] if num_fmt in numbered_formats: return (True, num_fmt, abstract_num_id) # Return (True, format string) elif num_fmt in unnumbered_formats: return (False, num_fmt, abstract_num_id) # Return (False, format string) if numFmt in numbered_formats: return (True, numFmt, lvl_text, abstract_num_id) # Return (True, format string) elif numFmt in unnumbered_formats: return (False, numFmt, lvl_text, abstract_num_id) # Return (False, format string) else: # Unknown format, default to unnumbered return (False, num_fmt, abstract_num_id) return (False, numFmt, lvl_text, abstract_num_id) except Exception as e: # If any error occurs, return None to fall back to heuristic return (None, None, None) return (None, None, None, None) counter_regular = 0 counter_b1 = 0 counter_b2 = 0 Loading Loading @@ -1041,6 +1044,7 @@ def update_unnumbered_lists(docx_input, docx_output): abstract.set(qn('w:val'), abstract_id_bn) for para in paragraphs: para_removed = False # Skip if already processed if id(para) in processed_paras: continue Loading @@ -1056,7 +1060,7 @@ def update_unnumbered_lists(docx_input, docx_output): else: compact_style = OxmlElement('w:pStyle') # Check if it is a numbered list and get the format is_numbered, num_format, abstract_num_id = is_numbered_list(para, numbering_root) is_numbered, num_format, lvl_text, abstract_num_id = is_numbered_list(para, numbering_root) if is_numbered: # is_numbered is True if numbered, False if unnumbered, None if cannot determine #pStyle = OxmlElement('w:pStyle') # If format is decimal (numbers), use BN; otherwise use BL Loading @@ -1076,6 +1080,8 @@ def update_unnumbered_lists(docx_input, docx_output): #numPr.remove(numPr.xpath('./w:numId', namespaces=ns)[0]) counter_numbered += 1 else: #print(f'The lvl_text is {lvl_text} and the abstract_num_id is {abstract_num_id}') if para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="0"]', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): abstract_id_tb1 = get_style_abstract_id(styles_root, numbering_root, "TB1") Loading @@ -1085,12 +1091,32 @@ def update_unnumbered_lists(docx_input, docx_output): compact_style.set(f"{{{ns['w']}}}val", "TB1") counter_table += 1 else: if not lvl_text == " ": abstract_id_b1 = get_style_abstract_id(styles_root, numbering_root, "B1") if abstract_id_b1 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b1) compact_style.set(f"{{{ns['w']}}}val", "B1") else: #print(f'Found text belonging to a numbered list item') # Take the text from the paragraph text = get_para_text(para) # Insert it as new run to the previous paragraph. New run starting by <w:br/> previous_para = para.getprevious() if previous_para is not None: new_run = OxmlElement('w:r') new_run.append(OxmlElement('w:br')) text_t = OxmlElement('w:t') text_t.text = text new_run.append(text_t) previous_para.append(new_run) # Remove the original paragraph para.getparent().remove(para) para_removed = True #print(f'Removed the original paragraph') #compact_style.set(f"{{{ns['w']}}}val", "FP") #Remove numPr from pPr if not para_removed: numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] pPr.remove(numPr) counter_b1 += 1 Loading @@ -1103,12 +1129,32 @@ def update_unnumbered_lists(docx_input, docx_output): compact_style.set(f"{{{ns['w']}}}val", "TB2") counter_table += 1 else: if not lvl_text == " ": abstract_id_b2 = get_style_abstract_id(styles_root, numbering_root, "B2") if abstract_id_b2 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b2) compact_style.set(f"{{{ns['w']}}}val", "B2") else: #print(f'Found text belonging to a numbered list item') # Take the text from the paragraph text = get_para_text(para) # Insert it as new run to the previous paragraph. New run starting by <w:br/> previous_para = para.getprevious() if previous_para is not None: new_run = OxmlElement('w:r') new_run.append(OxmlElement('w:br')) text_t = OxmlElement('w:t') text_t.text = text new_run.append(text_t) previous_para.append(new_run) # Remove the original paragraph para.getparent().remove(para) para_removed = True #print(f'Removed the original paragraph') #compact_style.set(f"{{{ns['w']}}}val", "FP") #Remove numPr from pPr if not para_removed: numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] pPr.remove(numPr) counter_b2 += 1 Loading @@ -1121,15 +1167,36 @@ def update_unnumbered_lists(docx_input, docx_output): compact_style.set(f"{{{ns['w']}}}val", "TB3") counter_table += 1 else: if not lvl_text == " ": abstract_id_b3 = get_style_abstract_id(styles_root, numbering_root, "B3") if abstract_id_b3 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b3) compact_style.set(f"{{{ns['w']}}}val", "B3") else: #print(f'Found text belonging to a numbered list item') # Take the text from the paragraph text = get_para_text(para) # Insert it as new run to the previous paragraph. New run starting by <w:br/> previous_para = para.getprevious() if previous_para is not None: new_run = OxmlElement('w:r') new_run.append(OxmlElement('w:br')) text_t = OxmlElement('w:t') text_t.text = text new_run.append(text_t) previous_para.append(new_run) # Remove the original paragraph para.getparent().remove(para) para_removed = True #print(f'Removed the original paragraph') #compact_style.set(f"{{{ns['w']}}}val", "FP") #Remove numPr from pPr if not para_removed: numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] pPr.remove(numPr) counter_b3 += 1 pPr.insert(0, compact_style) counter_compact += 1 continue Loading Loading
generateBaseline/postprocessing.py +103 −36 Original line number Diff line number Diff line Loading @@ -913,20 +913,23 @@ def update_unnumbered_lists(docx_input, docx_output): namespaces=ns ) if not abstract_num: return (None, None, None) return (None, None, None, None) # Check the format for this level lvl = abstract_num[0].xpath( numFmt_elem = abstract_num[0].xpath( f'./w:lvl[@w:ilvl="{ilvl}"]/w:numFmt', namespaces=ns ) if not lvl: return (None, None, None) if not numFmt_elem: return (None, None, None, None) num_fmt = lvl[0].get(f"{{{ns['w']}}}val") if num_fmt is None: return (None, None, None) numFmt = numFmt_elem[0].get(f"{{{ns['w']}}}val") lvl_text_elem = abstract_num[0].xpath(f'./w:lvl[@w:ilvl="{ilvl}"]/w:lvlText', namespaces=ns) if not lvl_text_elem: lvl_text = None lvl_text = lvl_text_elem[0].get(f"{{{ns['w']}}}val") # Numbered formats numbered_formats = ['decimal', 'lowerLetter', 'upperLetter', 'lowerRoman', 'upperRoman', 'arabic', 'ordinal', Loading @@ -951,17 +954,17 @@ def update_unnumbered_lists(docx_input, docx_output): unnumbered_formats = ['bullet', 'circle', 'square', 'dash', 'diamond', 'check', 'arrow', 'arrowhead', 'rtArrow', 'hyphen'] if num_fmt in numbered_formats: return (True, num_fmt, abstract_num_id) # Return (True, format string) elif num_fmt in unnumbered_formats: return (False, num_fmt, abstract_num_id) # Return (False, format string) if numFmt in numbered_formats: return (True, numFmt, lvl_text, abstract_num_id) # Return (True, format string) elif numFmt in unnumbered_formats: return (False, numFmt, lvl_text, abstract_num_id) # Return (False, format string) else: # Unknown format, default to unnumbered return (False, num_fmt, abstract_num_id) return (False, numFmt, lvl_text, abstract_num_id) except Exception as e: # If any error occurs, return None to fall back to heuristic return (None, None, None) return (None, None, None, None) counter_regular = 0 counter_b1 = 0 counter_b2 = 0 Loading Loading @@ -1041,6 +1044,7 @@ def update_unnumbered_lists(docx_input, docx_output): abstract.set(qn('w:val'), abstract_id_bn) for para in paragraphs: para_removed = False # Skip if already processed if id(para) in processed_paras: continue Loading @@ -1056,7 +1060,7 @@ def update_unnumbered_lists(docx_input, docx_output): else: compact_style = OxmlElement('w:pStyle') # Check if it is a numbered list and get the format is_numbered, num_format, abstract_num_id = is_numbered_list(para, numbering_root) is_numbered, num_format, lvl_text, abstract_num_id = is_numbered_list(para, numbering_root) if is_numbered: # is_numbered is True if numbered, False if unnumbered, None if cannot determine #pStyle = OxmlElement('w:pStyle') # If format is decimal (numbers), use BN; otherwise use BL Loading @@ -1076,6 +1080,8 @@ def update_unnumbered_lists(docx_input, docx_output): #numPr.remove(numPr.xpath('./w:numId', namespaces=ns)[0]) counter_numbered += 1 else: #print(f'The lvl_text is {lvl_text} and the abstract_num_id is {abstract_num_id}') if para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="0"]', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): abstract_id_tb1 = get_style_abstract_id(styles_root, numbering_root, "TB1") Loading @@ -1085,12 +1091,32 @@ def update_unnumbered_lists(docx_input, docx_output): compact_style.set(f"{{{ns['w']}}}val", "TB1") counter_table += 1 else: if not lvl_text == " ": abstract_id_b1 = get_style_abstract_id(styles_root, numbering_root, "B1") if abstract_id_b1 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b1) compact_style.set(f"{{{ns['w']}}}val", "B1") else: #print(f'Found text belonging to a numbered list item') # Take the text from the paragraph text = get_para_text(para) # Insert it as new run to the previous paragraph. New run starting by <w:br/> previous_para = para.getprevious() if previous_para is not None: new_run = OxmlElement('w:r') new_run.append(OxmlElement('w:br')) text_t = OxmlElement('w:t') text_t.text = text new_run.append(text_t) previous_para.append(new_run) # Remove the original paragraph para.getparent().remove(para) para_removed = True #print(f'Removed the original paragraph') #compact_style.set(f"{{{ns['w']}}}val", "FP") #Remove numPr from pPr if not para_removed: numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] pPr.remove(numPr) counter_b1 += 1 Loading @@ -1103,12 +1129,32 @@ def update_unnumbered_lists(docx_input, docx_output): compact_style.set(f"{{{ns['w']}}}val", "TB2") counter_table += 1 else: if not lvl_text == " ": abstract_id_b2 = get_style_abstract_id(styles_root, numbering_root, "B2") if abstract_id_b2 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b2) compact_style.set(f"{{{ns['w']}}}val", "B2") else: #print(f'Found text belonging to a numbered list item') # Take the text from the paragraph text = get_para_text(para) # Insert it as new run to the previous paragraph. New run starting by <w:br/> previous_para = para.getprevious() if previous_para is not None: new_run = OxmlElement('w:r') new_run.append(OxmlElement('w:br')) text_t = OxmlElement('w:t') text_t.text = text new_run.append(text_t) previous_para.append(new_run) # Remove the original paragraph para.getparent().remove(para) para_removed = True #print(f'Removed the original paragraph') #compact_style.set(f"{{{ns['w']}}}val", "FP") #Remove numPr from pPr if not para_removed: numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] pPr.remove(numPr) counter_b2 += 1 Loading @@ -1121,15 +1167,36 @@ def update_unnumbered_lists(docx_input, docx_output): compact_style.set(f"{{{ns['w']}}}val", "TB3") counter_table += 1 else: if not lvl_text == " ": abstract_id_b3 = get_style_abstract_id(styles_root, numbering_root, "B3") if abstract_id_b3 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b3) compact_style.set(f"{{{ns['w']}}}val", "B3") else: #print(f'Found text belonging to a numbered list item') # Take the text from the paragraph text = get_para_text(para) # Insert it as new run to the previous paragraph. New run starting by <w:br/> previous_para = para.getprevious() if previous_para is not None: new_run = OxmlElement('w:r') new_run.append(OxmlElement('w:br')) text_t = OxmlElement('w:t') text_t.text = text new_run.append(text_t) previous_para.append(new_run) # Remove the original paragraph para.getparent().remove(para) para_removed = True #print(f'Removed the original paragraph') #compact_style.set(f"{{{ns['w']}}}val", "FP") #Remove numPr from pPr if not para_removed: numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] pPr.remove(numPr) counter_b3 += 1 pPr.insert(0, compact_style) counter_compact += 1 continue Loading