Loading generateBaseline/postprocessing.py +78 −14 Original line number Original line Diff line number Diff line Loading @@ -852,6 +852,13 @@ def update_unnumbered_lists(docx_input, docx_output): except KeyError: except KeyError: numbering_root = None numbering_root = None # Try to read styles.xml, if it doesn't exist, styles_root will be None try: styles_data = zin.read("word/styles.xml") styles_root = etree.fromstring(styles_data) except KeyError: styles_root = None root = etree.fromstring(xml_data) root = etree.fromstring(xml_data) def is_numbered_list(para, numbering_root): def is_numbered_list(para, numbering_root): Loading Loading @@ -888,16 +895,16 @@ def update_unnumbered_lists(docx_input, docx_output): namespaces=ns namespaces=ns ) ) if not num_elem: if not num_elem: return (None, None) return (None, None, None) # Get the abstractNumId # Get the abstractNumId abstract_num_id_elem = num_elem[0].xpath('./w:abstractNumId', namespaces=ns) abstract_num_id_elem = num_elem[0].xpath('./w:abstractNumId', namespaces=ns) if not abstract_num_id_elem: if not abstract_num_id_elem: return (None, None) return (None, None, None) abstract_num_id = abstract_num_id_elem[0].get(f"{{{ns['w']}}}val") abstract_num_id = abstract_num_id_elem[0].get(f"{{{ns['w']}}}val") if abstract_num_id is None: if abstract_num_id is None: return (None, None) return (None, None, None) # Find the abstractNum # Find the abstractNum abstract_num = numbering_root.xpath( abstract_num = numbering_root.xpath( Loading @@ -905,7 +912,7 @@ def update_unnumbered_lists(docx_input, docx_output): namespaces=ns namespaces=ns ) ) if not abstract_num: if not abstract_num: return (None, None) return (None, None, None) # Check the format for this level # Check the format for this level lvl = abstract_num[0].xpath( lvl = abstract_num[0].xpath( Loading @@ -913,11 +920,11 @@ def update_unnumbered_lists(docx_input, docx_output): namespaces=ns namespaces=ns ) ) if not lvl: if not lvl: return (None, None) return (None, None, None) num_fmt = lvl[0].get(f"{{{ns['w']}}}val") num_fmt = lvl[0].get(f"{{{ns['w']}}}val") if num_fmt is None: if num_fmt is None: return (None, None) return (None, None, None) # Numbered formats # Numbered formats numbered_formats = ['decimal', 'lowerLetter', 'upperLetter', numbered_formats = ['decimal', 'lowerLetter', 'upperLetter', Loading @@ -944,16 +951,16 @@ def update_unnumbered_lists(docx_input, docx_output): 'check', 'arrow', 'arrowhead', 'rtArrow', 'hyphen'] 'check', 'arrow', 'arrowhead', 'rtArrow', 'hyphen'] if num_fmt in numbered_formats: if num_fmt in numbered_formats: return (True, num_fmt) # Return (True, format string) return (True, num_fmt, abstract_num_id) # Return (True, format string) elif num_fmt in unnumbered_formats: elif num_fmt in unnumbered_formats: return (False, num_fmt) # Return (False, format string) return (False, num_fmt, abstract_num_id) # Return (False, format string) else: else: # Unknown format, default to unnumbered # Unknown format, default to unnumbered return (False, num_fmt) return (False, num_fmt, abstract_num_id) except Exception as e: except Exception as e: # If any error occurs, return None to fall back to heuristic # If any error occurs, return None to fall back to heuristic return (None, None) return (None, None, None) counter_regular = 0 counter_regular = 0 counter_b1 = 0 counter_b1 = 0 counter_b2 = 0 counter_b2 = 0 Loading Loading @@ -1001,6 +1008,33 @@ def update_unnumbered_lists(docx_input, docx_output): text += text_elem.text text += text_elem.text return text return text def get_style_abstract_id(styles_root, numbering_root, style_id): """ Find the abstractNumId used by the given style. """ styles = styles_root style = styles.xpath(f'//w:style[@w:styleId="{style_id}"]', namespaces=ns)[0] num_id_el = style.xpath('.//w:numId', namespaces=ns) if not num_id_el: return None #raise ValueError(f"Style {style_id} has no numId") # Remove numPr from style if it exists num_pr = style.xpath('.//w:numPr', namespaces=ns) if num_pr: for el in num_pr: el.getparent().remove(el) return num_id_el[0].get(qn('w:val')) def update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_bn): for num in numbering_root.xpath(f'//w:num[@w:abstractNumId="{abstract_num_id}"]', namespaces=ns): abstract = num.xpath('./w:abstractNumId', namespaces=ns)[0] old_id = abstract.get(qn('w:val')) abstract.set(qn('w:val'), abstract_id_bn) for para in paragraphs: for para in paragraphs: # Skip if already processed # Skip if already processed if id(para) in processed_paras: if id(para) in processed_paras: Loading @@ -1017,25 +1051,39 @@ def update_unnumbered_lists(docx_input, docx_output): else: else: compact_style = OxmlElement('w:pStyle') compact_style = OxmlElement('w:pStyle') # Check if it is a numbered list and get the format # Check if it is a numbered list and get the format is_numbered, num_format = is_numbered_list(para, numbering_root) is_numbered, num_format, abstract_num_id = is_numbered_list(para, numbering_root) if is_numbered: # is_numbered is True if numbered, False if unnumbered, None if cannot determine if is_numbered: # is_numbered is True if numbered, False if unnumbered, None if cannot determine #pStyle = OxmlElement('w:pStyle') #pStyle = OxmlElement('w:pStyle') # If format is decimal (numbers), use BN; otherwise use BL # If format is decimal (numbers), use BN; otherwise use BL if num_format == 'decimal': if num_format == 'decimal': abstract_id_bn = get_style_abstract_id(styles_root, numbering_root, "BN") if abstract_id_bn is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_bn) compact_style.set(f"{{{ns['w']}}}val", "BN") compact_style.set(f"{{{ns['w']}}}val", "BN") else: else: abstract_id_bl = get_style_abstract_id(styles_root, numbering_root, "BL") if abstract_id_bl is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_bl) compact_style.set(f"{{{ns['w']}}}val", "BL") compact_style.set(f"{{{ns['w']}}}val", "BL") pPr.insert(0, compact_style) pPr.insert(0, compact_style) # Remove numId from numPr # Remove numId from numPr numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] #numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] numPr.remove(numPr.xpath('./w:numId', namespaces=ns)[0]) #numPr.remove(numPr.xpath('./w:numId', namespaces=ns)[0]) counter_numbered += 1 counter_numbered += 1 else: else: if para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="0"]', namespaces=ns): if para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="0"]', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): abstract_id_tb1 = get_style_abstract_id(styles_root, numbering_root, "TB1") if abstract_id_tb1 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_tb1) compact_style.set(f"{{{ns['w']}}}val", "TB1") compact_style.set(f"{{{ns['w']}}}val", "TB1") counter_table += 1 counter_table += 1 else: else: abstract_id_b1 = get_style_abstract_id(styles_root, numbering_root, "B1") if abstract_id_b1 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b1) compact_style.set(f"{{{ns['w']}}}val", "B1") compact_style.set(f"{{{ns['w']}}}val", "B1") #Remove numPr from pPr #Remove numPr from pPr numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] Loading @@ -1043,9 +1091,17 @@ def update_unnumbered_lists(docx_input, docx_output): counter_b1 += 1 counter_b1 += 1 elif para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="1"]', namespaces=ns): elif para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="1"]', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): abstract_id_tb2 = get_style_abstract_id(styles_root, numbering_root, "TB2") if abstract_id_tb2 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_tb2) compact_style.set(f"{{{ns['w']}}}val", "TB2") compact_style.set(f"{{{ns['w']}}}val", "TB2") counter_table += 1 counter_table += 1 else: else: abstract_id_b2 = get_style_abstract_id(styles_root, numbering_root, "B2") if abstract_id_b2 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b2) compact_style.set(f"{{{ns['w']}}}val", "B2") compact_style.set(f"{{{ns['w']}}}val", "B2") #Remove numPr from pPr #Remove numPr from pPr numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] Loading @@ -1053,9 +1109,17 @@ def update_unnumbered_lists(docx_input, docx_output): counter_b2 += 1 counter_b2 += 1 elif para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="2"]', namespaces=ns): elif para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="2"]', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): abstract_id_tb3 = get_style_abstract_id(styles_root, numbering_root, "TB3") if abstract_id_tb3 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_tb3) compact_style.set(f"{{{ns['w']}}}val", "TB3") compact_style.set(f"{{{ns['w']}}}val", "TB3") counter_table += 1 counter_table += 1 else: else: abstract_id_b3 = get_style_abstract_id(styles_root, numbering_root, "B3") if abstract_id_b3 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b3) compact_style.set(f"{{{ns['w']}}}val", "B3") compact_style.set(f"{{{ns['w']}}}val", "B3") #Remove numPr from pPr #Remove numPr from pPr numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] Loading Loading
generateBaseline/postprocessing.py +78 −14 Original line number Original line Diff line number Diff line Loading @@ -852,6 +852,13 @@ def update_unnumbered_lists(docx_input, docx_output): except KeyError: except KeyError: numbering_root = None numbering_root = None # Try to read styles.xml, if it doesn't exist, styles_root will be None try: styles_data = zin.read("word/styles.xml") styles_root = etree.fromstring(styles_data) except KeyError: styles_root = None root = etree.fromstring(xml_data) root = etree.fromstring(xml_data) def is_numbered_list(para, numbering_root): def is_numbered_list(para, numbering_root): Loading Loading @@ -888,16 +895,16 @@ def update_unnumbered_lists(docx_input, docx_output): namespaces=ns namespaces=ns ) ) if not num_elem: if not num_elem: return (None, None) return (None, None, None) # Get the abstractNumId # Get the abstractNumId abstract_num_id_elem = num_elem[0].xpath('./w:abstractNumId', namespaces=ns) abstract_num_id_elem = num_elem[0].xpath('./w:abstractNumId', namespaces=ns) if not abstract_num_id_elem: if not abstract_num_id_elem: return (None, None) return (None, None, None) abstract_num_id = abstract_num_id_elem[0].get(f"{{{ns['w']}}}val") abstract_num_id = abstract_num_id_elem[0].get(f"{{{ns['w']}}}val") if abstract_num_id is None: if abstract_num_id is None: return (None, None) return (None, None, None) # Find the abstractNum # Find the abstractNum abstract_num = numbering_root.xpath( abstract_num = numbering_root.xpath( Loading @@ -905,7 +912,7 @@ def update_unnumbered_lists(docx_input, docx_output): namespaces=ns namespaces=ns ) ) if not abstract_num: if not abstract_num: return (None, None) return (None, None, None) # Check the format for this level # Check the format for this level lvl = abstract_num[0].xpath( lvl = abstract_num[0].xpath( Loading @@ -913,11 +920,11 @@ def update_unnumbered_lists(docx_input, docx_output): namespaces=ns namespaces=ns ) ) if not lvl: if not lvl: return (None, None) return (None, None, None) num_fmt = lvl[0].get(f"{{{ns['w']}}}val") num_fmt = lvl[0].get(f"{{{ns['w']}}}val") if num_fmt is None: if num_fmt is None: return (None, None) return (None, None, None) # Numbered formats # Numbered formats numbered_formats = ['decimal', 'lowerLetter', 'upperLetter', numbered_formats = ['decimal', 'lowerLetter', 'upperLetter', Loading @@ -944,16 +951,16 @@ def update_unnumbered_lists(docx_input, docx_output): 'check', 'arrow', 'arrowhead', 'rtArrow', 'hyphen'] 'check', 'arrow', 'arrowhead', 'rtArrow', 'hyphen'] if num_fmt in numbered_formats: if num_fmt in numbered_formats: return (True, num_fmt) # Return (True, format string) return (True, num_fmt, abstract_num_id) # Return (True, format string) elif num_fmt in unnumbered_formats: elif num_fmt in unnumbered_formats: return (False, num_fmt) # Return (False, format string) return (False, num_fmt, abstract_num_id) # Return (False, format string) else: else: # Unknown format, default to unnumbered # Unknown format, default to unnumbered return (False, num_fmt) return (False, num_fmt, abstract_num_id) except Exception as e: except Exception as e: # If any error occurs, return None to fall back to heuristic # If any error occurs, return None to fall back to heuristic return (None, None) return (None, None, None) counter_regular = 0 counter_regular = 0 counter_b1 = 0 counter_b1 = 0 counter_b2 = 0 counter_b2 = 0 Loading Loading @@ -1001,6 +1008,33 @@ def update_unnumbered_lists(docx_input, docx_output): text += text_elem.text text += text_elem.text return text return text def get_style_abstract_id(styles_root, numbering_root, style_id): """ Find the abstractNumId used by the given style. """ styles = styles_root style = styles.xpath(f'//w:style[@w:styleId="{style_id}"]', namespaces=ns)[0] num_id_el = style.xpath('.//w:numId', namespaces=ns) if not num_id_el: return None #raise ValueError(f"Style {style_id} has no numId") # Remove numPr from style if it exists num_pr = style.xpath('.//w:numPr', namespaces=ns) if num_pr: for el in num_pr: el.getparent().remove(el) return num_id_el[0].get(qn('w:val')) def update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_bn): for num in numbering_root.xpath(f'//w:num[@w:abstractNumId="{abstract_num_id}"]', namespaces=ns): abstract = num.xpath('./w:abstractNumId', namespaces=ns)[0] old_id = abstract.get(qn('w:val')) abstract.set(qn('w:val'), abstract_id_bn) for para in paragraphs: for para in paragraphs: # Skip if already processed # Skip if already processed if id(para) in processed_paras: if id(para) in processed_paras: Loading @@ -1017,25 +1051,39 @@ def update_unnumbered_lists(docx_input, docx_output): else: else: compact_style = OxmlElement('w:pStyle') compact_style = OxmlElement('w:pStyle') # Check if it is a numbered list and get the format # Check if it is a numbered list and get the format is_numbered, num_format = is_numbered_list(para, numbering_root) is_numbered, num_format, abstract_num_id = is_numbered_list(para, numbering_root) if is_numbered: # is_numbered is True if numbered, False if unnumbered, None if cannot determine if is_numbered: # is_numbered is True if numbered, False if unnumbered, None if cannot determine #pStyle = OxmlElement('w:pStyle') #pStyle = OxmlElement('w:pStyle') # If format is decimal (numbers), use BN; otherwise use BL # If format is decimal (numbers), use BN; otherwise use BL if num_format == 'decimal': if num_format == 'decimal': abstract_id_bn = get_style_abstract_id(styles_root, numbering_root, "BN") if abstract_id_bn is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_bn) compact_style.set(f"{{{ns['w']}}}val", "BN") compact_style.set(f"{{{ns['w']}}}val", "BN") else: else: abstract_id_bl = get_style_abstract_id(styles_root, numbering_root, "BL") if abstract_id_bl is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_bl) compact_style.set(f"{{{ns['w']}}}val", "BL") compact_style.set(f"{{{ns['w']}}}val", "BL") pPr.insert(0, compact_style) pPr.insert(0, compact_style) # Remove numId from numPr # Remove numId from numPr numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] #numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] numPr.remove(numPr.xpath('./w:numId', namespaces=ns)[0]) #numPr.remove(numPr.xpath('./w:numId', namespaces=ns)[0]) counter_numbered += 1 counter_numbered += 1 else: else: if para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="0"]', namespaces=ns): if para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="0"]', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): abstract_id_tb1 = get_style_abstract_id(styles_root, numbering_root, "TB1") if abstract_id_tb1 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_tb1) compact_style.set(f"{{{ns['w']}}}val", "TB1") compact_style.set(f"{{{ns['w']}}}val", "TB1") counter_table += 1 counter_table += 1 else: else: abstract_id_b1 = get_style_abstract_id(styles_root, numbering_root, "B1") if abstract_id_b1 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b1) compact_style.set(f"{{{ns['w']}}}val", "B1") compact_style.set(f"{{{ns['w']}}}val", "B1") #Remove numPr from pPr #Remove numPr from pPr numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] Loading @@ -1043,9 +1091,17 @@ def update_unnumbered_lists(docx_input, docx_output): counter_b1 += 1 counter_b1 += 1 elif para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="1"]', namespaces=ns): elif para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="1"]', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): abstract_id_tb2 = get_style_abstract_id(styles_root, numbering_root, "TB2") if abstract_id_tb2 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_tb2) compact_style.set(f"{{{ns['w']}}}val", "TB2") compact_style.set(f"{{{ns['w']}}}val", "TB2") counter_table += 1 counter_table += 1 else: else: abstract_id_b2 = get_style_abstract_id(styles_root, numbering_root, "B2") if abstract_id_b2 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b2) compact_style.set(f"{{{ns['w']}}}val", "B2") compact_style.set(f"{{{ns['w']}}}val", "B2") #Remove numPr from pPr #Remove numPr from pPr numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] Loading @@ -1053,9 +1109,17 @@ def update_unnumbered_lists(docx_input, docx_output): counter_b2 += 1 counter_b2 += 1 elif para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="2"]', namespaces=ns): elif para.xpath('./w:pPr/w:numPr/w:ilvl[@w:val="2"]', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): if para.xpath('ancestor::w:tbl', namespaces=ns): abstract_id_tb3 = get_style_abstract_id(styles_root, numbering_root, "TB3") if abstract_id_tb3 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_tb3) compact_style.set(f"{{{ns['w']}}}val", "TB3") compact_style.set(f"{{{ns['w']}}}val", "TB3") counter_table += 1 counter_table += 1 else: else: abstract_id_b3 = get_style_abstract_id(styles_root, numbering_root, "B3") if abstract_id_b3 is not None and abstract_num_id is not None: update_list_instance_to_style(numbering_root, abstract_num_id, abstract_id_b3) compact_style.set(f"{{{ns['w']}}}val", "B3") compact_style.set(f"{{{ns['w']}}}val", "B3") #Remove numPr from pPr #Remove numPr from pPr numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] numPr = pPr.xpath('./w:numPr', namespaces=ns)[0] Loading