def correct_quotes_docx(docx_input, docx_output):
    """
    Convert curly/smart quotes to straight quotes in the main body of a DOCX.

    Only the text runs (``w:t`` elements) of ``word/document.xml`` are
    rewritten. Every other archive member is copied to the output unchanged,
    so quotes stored in other parts of the package are not touched.

    Replaces:
    - Left double quote (U+201C)  -> " (U+0022)
    - Right double quote (U+201D) -> " (U+0022)
    - Left single quote (U+2018)  -> ' (U+0027)
    - Right single quote (U+2019) -> ' (U+0027)
    - Double prime (U+2033)       -> " (U+0022)
    - Single prime (U+2032)       -> ' (U+0027)

    The number of converted characters is printed to stdout.

    Parameters
    ----------
    docx_input : str
        Path to the input DOCX file.
    docx_output : str
        Path to the output DOCX file. May be the same path as ``docx_input``:
        the result is assembled in a temporary file and moved into place
        only after it has been written completely.

    Raises
    ------
    KeyError
        If the archive does not contain ``word/document.xml``.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

    # Quote mapping: curly/smart quotes to straight quotes
    quote_replacements = {
        '\u201C': '"',  # Left double quote
        '\u201D': '"',  # Right double quote
        '\u2018': "'",  # Left single quote
        '\u2019': "'",  # Right single quote
        '\u2033': '"',  # Double prime
        '\u2032': "'",  # Single prime
    }
    # One translation table lets each text node be rewritten in a single pass
    # instead of one str.replace() call per quote character.
    quote_table = str.maketrans(quote_replacements)

    # Read XML
    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read("word/document.xml")

    root = etree.fromstring(xml_data)
    counter = 0

    # Find all text elements and replace quotes
    for text_elem in root.xpath('.//w:t', namespaces=ns):
        text = text_elem.text
        if not text:
            continue
        hits = sum(text.count(quote) for quote in quote_replacements)
        if hits:
            # Only assign when something actually changes.
            text_elem.text = text.translate(quote_table)
            counter += hits

    print(f'Converted {counter} curly quotes to straight quotes')

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # create temp file
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)  # the file is only ever opened through zipfile

    try:
        # write new docx to temp file
        with zipfile.ZipFile(docx_input, 'r') as zin, \
                zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    # Passing the ZipInfo (not just the name) keeps the
                    # entry's timestamp, attributes and compression type.
                    zout.writestr(item, zin.read(item.filename))
            zout.writestr("word/document.xml", xml_data)

        # Write to output file
        shutil.move(tmp_path, docx_output)

        # mkstemp creates the file accessible by the owner only, so widen the
        # permissions (read/write for owner, read for group and others).
        os.chmod(docx_output, 0o644)

    finally:
        # delete temp file if still existing (i.e. the move did not happen)
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def correct_quotes_docx(docx_input, docx_output):
    """
    Convert curly/smart quotes to straight quotes in the main body of a DOCX.

    Only the text runs (``w:t`` elements) of ``word/document.xml`` are
    rewritten. Every other archive member is copied to the output unchanged,
    so quotes stored in other parts of the package are not touched.

    Replaces:
    - Left double quote (U+201C)  -> " (U+0022)
    - Right double quote (U+201D) -> " (U+0022)
    - Left single quote (U+2018)  -> ' (U+0027)
    - Right single quote (U+2019) -> ' (U+0027)
    - Double prime (U+2033)       -> " (U+0022)
    - Single prime (U+2032)       -> ' (U+0027)

    The number of converted characters is printed to stdout.

    Parameters
    ----------
    docx_input : str
        Path to the input DOCX file.
    docx_output : str
        Path to the output DOCX file. May be the same path as ``docx_input``:
        the result is assembled in a temporary file and moved into place
        only after it has been written completely.

    Raises
    ------
    KeyError
        If the archive does not contain ``word/document.xml``.
    """
    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}

    # Quote mapping: curly/smart quotes to straight quotes
    quote_replacements = {
        '\u201C': '"',  # Left double quote
        '\u201D': '"',  # Right double quote
        '\u2018': "'",  # Left single quote
        '\u2019': "'",  # Right single quote
        '\u2033': '"',  # Double prime
        '\u2032': "'",  # Single prime
    }
    # One translation table lets each text node be rewritten in a single pass
    # instead of one str.replace() call per quote character.
    quote_table = str.maketrans(quote_replacements)

    # Read XML
    with zipfile.ZipFile(docx_input, 'r') as zin:
        xml_data = zin.read("word/document.xml")

    root = etree.fromstring(xml_data)
    counter = 0

    # Find all text elements and replace quotes
    for text_elem in root.xpath('.//w:t', namespaces=ns):
        text = text_elem.text
        if not text:
            continue
        hits = sum(text.count(quote) for quote in quote_replacements)
        if hits:
            # Only assign when something actually changes.
            text_elem.text = text.translate(quote_table)
            counter += hits

    print(f'Converted {counter} curly quotes to straight quotes')

    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

    # create temp file
    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
    os.close(tmp_fd)  # the file is only ever opened through zipfile

    try:
        # write new docx to temp file
        with zipfile.ZipFile(docx_input, 'r') as zin, \
                zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
            for item in zin.infolist():
                if item.filename != "word/document.xml":
                    # Passing the ZipInfo (not just the name) keeps the
                    # entry's timestamp, attributes and compression type.
                    zout.writestr(item, zin.read(item.filename))
            zout.writestr("word/document.xml", xml_data)

        # Write to output file
        shutil.move(tmp_path, docx_output)

        # mkstemp creates the file accessible by the owner only, so widen the
        # permissions (read/write for owner, read for group and others).
        os.chmod(docx_output, 0o644)

    finally:
        # delete temp file if still existing (i.e. the move did not happen)
        if os.path.exists(tmp_path):
            os.remove(tmp_path)