Commit 4adc7697 authored by Miguel Angel Reina Ortega
Browse files

Add correction of quotes

parent 0c23d9d7
Loading
Loading
Loading
Loading
Loading
+85 −1
Original line number Diff line number Diff line
@@ -2196,6 +2196,89 @@ def update_equation_style(docx_input, docx_output):
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

def correct_quotes_docx(docx_input, docx_output):
    """
    Convert curly/smart quotes and prime marks to straight quotes in a DOCX file.

    Only the text of ``w:t`` elements inside ``word/document.xml`` is rewritten.
    Every other archive member is copied through unchanged, keeping its original
    position and ZIP metadata (timestamp, compression type, attributes).

    Replaces:
    - Left double quote (U+201C)  → straight double quote (U+0022)
    - Right double quote (U+201D) → straight double quote (U+0022)
    - Left single quote (U+2018)  → straight single quote (U+0027)
    - Right single quote (U+2019) → straight single quote (U+0027)
    - Double prime (U+2033)       → straight double quote (U+0022)
    - Single prime (U+2032)       → straight single quote (U+0027)

    The number of replaced characters is printed to stdout. The result is
    assembled in a temporary file and only moved to ``docx_output`` once the
    input archive has been closed, so both paths may refer to the same file.

    Parameters
    ----------
    docx_input : str
        Path to the input DOCX file.
    docx_output : str
        Path to the output DOCX file.
    """
    document_member = "word/document.xml"
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

    # Translation table: curly/smart quotes and primes to straight quotes.
    # Every mapping is one character to one character, so a single
    # str.translate pass replaces all of them without changing text length.
    quote_table = str.maketrans({
        '\u201C': '"',  # Left double quote
        '\u201D': '"',  # Right double quote
        '\u2018': "'",  # Left single quote
        '\u2019': "'",  # Right single quote
        '\u2033': '"',  # Double prime
        '\u2032': "'",  # Single prime
    })

    # Read XML
    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read(document_member)

    root = etree.fromstring(xml_data)
    counter = 0

    # Find all text elements and replace quotes
    for text_elem in root.xpath('.//w:t', namespaces=ns):
        original_text = text_elem.text
        if not original_text:
            continue

        new_text = original_text.translate(quote_table)
        if new_text != original_text:
            # Lengths are equal, so differing positions == replaced quotes.
            counter += sum(
                old != new for old, new in zip(original_text, new_text)
            )
            text_elem.text = new_text

    print(f'Converted {counter} curly quotes to straight quotes')

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # Create temp file; close the descriptor right away because the file is
    # only ever opened through zipfile below.
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)

    try:
        # Write new docx to temp file. Passing the ZipInfo object (instead of
        # just the member name) keeps each entry's metadata, and swapping the
        # document payload in place keeps the original member order.
        with zipfile.ZipFile(docx_input, 'r') as zin, \
                zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename == document_member:
                    data = xml_data
                else:
                    data = zin.read(item)
                zout.writestr(item, data)

        # Write to output file
        shutil.move(tmp_path, docx_output)
        # Set proper permissions (read/write for owner, read for group and others)
        os.chmod(docx_output, 0o644)

    finally:
        # Delete temp file if it still exists (i.e. the move did not happen)
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def update_format_styles_cli():
    parser = argparse.ArgumentParser(description="Update format styles in a DOCX file.")
    parser.add_argument("docx_input", help="Path to input DOCX file")
@@ -2216,3 +2299,4 @@ def update_format_styles_cli():
    update_references_style(args.docx_input, args.docx_output)
    update_source_code_style(args.docx_input, args.docx_output)
    update_equation_style(args.docx_input, args.docx_output)
    correct_quotes_docx(args.docx_input, args.docx_output)
 No newline at end of file