def _is_valid_xml_char(char):
    """Return True if *char* is allowed by the XML 1.0 ``Char`` production.

    Allowed code points are tab (0x09), line feed (0x0A), carriage return
    (0x0D), 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
    """
    code_point = ord(char)
    return (
        code_point in (0x09, 0x0A, 0x0D)
        or 0x20 <= code_point <= 0xD7FF
        or 0xE000 <= code_point <= 0xFFFD
        or 0x10000 <= code_point <= 0x10FFFF
    )


def sanitize_xml_text(text):
    """
    Remove characters that are not legal in an XML 1.0 document.

    Besides the C0 control characters (0x00-0x08, 0x0B-0x0C, 0x0E-0x1F),
    this also drops lone surrogates (0xD800-0xDFFF) and the non-characters
    0xFFFE and 0xFFFF, which XML 1.0 forbids as well. Tab, line feed and
    carriage return are kept.

    Parameters
    ----------
    text : str or None
        Text that may contain invalid XML characters

    Returns
    -------
    str or None
        Text with invalid characters removed; ``None`` is passed through
        unchanged.
    """
    if text is None:
        return None

    return ''.join(char for char in text if _is_valid_xml_char(char))


def sanitize_document_xml(root, ns):
    """
    Sanitize the text of every ``w:t`` element below *root* in place.

    Each matching element whose text is non-empty has that text passed
    through :func:`sanitize_xml_text`. Nothing is returned; the tree is
    modified directly.

    Parameters
    ----------
    root : lxml.etree.Element
        Root element of the document XML
    ns : dict
        Namespace dictionary; must map the ``w`` prefix used in the XPath

    Notes
    -----
    NOTE(review): the caller shown in the diff parses ``word/document.xml``
    with ``etree.fromstring`` before calling this function. A strict XML
    parser presumably rejects these characters at parse time already, so
    confirm this pass is actually reachable for the input it is meant to
    repair.
    """
    for text_elem in root.xpath('.//w:t', namespaces=ns):
        original = text_elem.text
        if not original:
            continue
        cleaned = sanitize_xml_text(original)
        # Only touch the tree when something was actually removed.
        if cleaned != original:
            text_elem.text = cleaned


# NOTE(review): the diff this was recovered from also added a "final
# sanitization" step at the end of update_format_styles_cli() and showed the
# header of apply_standard_style_to_unformatted_paragraphs(). Both functions
# are only partially visible here, so they are not reproduced.
def _is_valid_xml_char(char):
    """Return True if *char* is allowed by the XML 1.0 ``Char`` production.

    Allowed code points are tab (0x09), line feed (0x0A), carriage return
    (0x0D), 0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
    """
    code_point = ord(char)
    return (
        code_point in (0x09, 0x0A, 0x0D)
        or 0x20 <= code_point <= 0xD7FF
        or 0xE000 <= code_point <= 0xFFFD
        or 0x10000 <= code_point <= 0x10FFFF
    )


def sanitize_xml_text(text):
    """
    Remove characters that are not legal in an XML 1.0 document.

    Besides the C0 control characters (0x00-0x08, 0x0B-0x0C, 0x0E-0x1F),
    this also drops lone surrogates (0xD800-0xDFFF) and the non-characters
    0xFFFE and 0xFFFF, which XML 1.0 forbids as well. Tab, line feed and
    carriage return are kept.

    Parameters
    ----------
    text : str or None
        Text that may contain invalid XML characters

    Returns
    -------
    str or None
        Text with invalid characters removed; ``None`` is passed through
        unchanged.
    """
    if text is None:
        return None

    return ''.join(char for char in text if _is_valid_xml_char(char))


def sanitize_document_xml(root, ns):
    """
    Sanitize the text of every ``w:t`` element below *root* in place.

    Each matching element whose text is non-empty has that text passed
    through :func:`sanitize_xml_text`. Nothing is returned; the tree is
    modified directly.

    Parameters
    ----------
    root : lxml.etree.Element
        Root element of the document XML
    ns : dict
        Namespace dictionary; must map the ``w`` prefix used in the XPath

    Notes
    -----
    NOTE(review): the caller shown in the diff parses ``word/document.xml``
    with ``etree.fromstring`` before calling this function. A strict XML
    parser presumably rejects these characters at parse time already, so
    confirm this pass is actually reachable for the input it is meant to
    repair.
    """
    for text_elem in root.xpath('.//w:t', namespaces=ns):
        original = text_elem.text
        if not original:
            continue
        cleaned = sanitize_xml_text(original)
        # Only touch the tree when something was actually removed.
        if cleaned != original:
            text_elem.text = cleaned


# NOTE(review): the diff this was recovered from also added a "final
# sanitization" step at the end of update_format_styles_cli() and showed the
# header of apply_standard_style_to_unformatted_paragraphs(). Both functions
# are only partially visible here, so they are not reproduced.