Commit 6402d667 authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

Adding sanitize helpers

parent 8ec31d0b
Loading
Loading
Loading
Loading
Loading
+72 −1
Original line number Original line Diff line number Diff line
@@ -19,6 +19,48 @@ from docx.shared import Cm


#from file_helper import get_all_files_from_dir
#from file_helper import get_all_files_from_dir


def sanitize_xml_text(text):
    """
    Remove characters that are not legal in XML 1.0 from text.

    The XML 1.0 ``Char`` production allows only:
    tab (0x09), line feed (0x0A), carriage return (0x0D),
    0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
    Everything else is dropped: the C0 control characters
    (0x00-0x08, 0x0B-0x0C, 0x0E-0x1F), lone surrogate code points
    (0xD800-0xDFFF) and the non-characters 0xFFFE / 0xFFFF.

    Parameters
    ----------
    text : str
        Text that may contain invalid XML characters

    Returns
    -------
    str
        Text with every character outside the XML 1.0 ``Char`` range
        removed, or None when ``text`` is None
    """
    if text is None:
        return None

    # Imported locally so this helper does not depend on module-level imports.
    import re

    # Negated character class == "anything that is NOT a legal XML 1.0 Char".
    # A single regex pass replaces the per-character Python-level loop.
    return re.sub(
        r'[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]',
        '',
        text,
    )

def sanitize_document_xml(root, ns):
    """
    Clean the text of every ``w:t`` element found below ``root``.

    Each matching element whose text is non-empty has that text replaced
    by the result of ``sanitize_xml_text``. The tree is modified in place
    and nothing is returned.

    Parameters
    ----------
    root : lxml.etree.Element
        Root element of the document XML
    ns : dict
        Namespace dictionary; it is passed to the XPath query, which uses
        the ``w`` prefix
    """
    for node in root.xpath('.//w:t', namespaces=ns):
        content = node.text
        if not content:
            # None or empty string: nothing to sanitize for this element
            continue
        node.text = sanitize_xml_text(content)



def apply_standard_style_to_unformatted_paragraphs(config):
def apply_standard_style_to_unformatted_paragraphs(config):
    docx_path = config.get("output_docx")
    docx_path = config.get("output_docx")
@@ -2368,3 +2410,32 @@ def update_format_styles_cli():
    update_source_code_style(args.docx_input, args.docx_output)
    update_source_code_style(args.docx_input, args.docx_output)
    update_equation_style(args.docx_input, args.docx_output)
    update_equation_style(args.docx_input, args.docx_output)
    correct_quotes_docx(args.docx_input, args.docx_output)
    correct_quotes_docx(args.docx_input, args.docx_output)
    
    # Final sanitization: remove invalid XML characters
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
    with zipfile.ZipFile(args.docx_output, 'r') as zin:
        xml_data = zin.read("word/document.xml")
    
    root = etree.fromstring(xml_data)
    sanitize_document_xml(root, ns)
    
    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")
    
    # Write sanitized document back
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)
    
    try:
        with zipfile.ZipFile(args.docx_output, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    data = zin.read(item.filename)
                    zout.writestr(item.filename, data)
            zout.writestr("word/document.xml", xml_data)
        
        shutil.move(tmp_path, args.docx_output)
        os.chmod(args.docx_output, 0o644)
        print('Sanitized document: removed invalid XML characters')
    finally:
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
 No newline at end of file