Add correction of quotes (4adc7697) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

generateBaseline/postprocessing.py

+85 −1

Original line number	Diff line number	Diff line
		@@ -2196,6 +2196,89 @@ def update_equation_style(docx_input, docx_output):
		if os.path.exists(tmp_path):
		os.remove(tmp_path)

		def correct_quotes_docx(docx_input, docx_output):
		    """
		    Convert curly/smart quotes and prime marks to straight quotes in a DOCX file.

		    Only the text of ``w:t`` elements in ``word/document.xml`` is rewritten.
		    Every other archive member (headers, footers, footnotes, styles, media,
		    ...) is copied to the output unchanged, keeping its original position and
		    ZIP metadata.

		    Replaces:
		    - U+201C LEFT DOUBLE QUOTATION MARK  -> U+0022 QUOTATION MARK
		    - U+201D RIGHT DOUBLE QUOTATION MARK -> U+0022 QUOTATION MARK
		    - U+2018 LEFT SINGLE QUOTATION MARK  -> U+0027 APOSTROPHE
		    - U+2019 RIGHT SINGLE QUOTATION MARK -> U+0027 APOSTROPHE
		    - U+2033 DOUBLE PRIME                -> U+0022 QUOTATION MARK
		    - U+2032 PRIME                       -> U+0027 APOSTROPHE

		    The number of replaced characters is printed to stdout.

		    Parameters
		    ----------
		    docx_input : str
		        Path to the input DOCX file.
		    docx_output : str
		        Path to the output DOCX file. May be the same path as ``docx_input``;
		        the result is built in a temporary file and moved into place last.

		    Raises
		    ------
		    KeyError
		        If the archive has no ``word/document.xml`` member.
		    """
		    ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
		    document_part = "word/document.xml"

		    # Quote mapping: curly/smart quotes to straight quotes
		    quote_replacements = {
		        '\u201C': '"',  # Left double quote
		        '\u201D': '"',  # Right double quote
		        '\u2018': "'",  # Left single quote
		        '\u2019': "'",  # Right single quote
		        '\u2033': '"',  # Double prime
		        '\u2032': "'",  # Single prime
		    }
		    # One translation table lets each text node be rewritten in a single pass
		    # instead of one count/replace pass per quote character.
		    translation = str.maketrans(quote_replacements)

		    # Read XML
		    with zipfile.ZipFile(docx_input, 'r') as zin:
		        xml_data = zin.read(document_part)

		    root = etree.fromstring(xml_data)
		    counter = 0

		    # Find all text elements and replace quotes
		    for text_elem in root.xpath('.//w:t', namespaces=ns):
		        original_text = text_elem.text
		        if not original_text:
		            continue

		        new_text = original_text.translate(translation)
		        if new_text != original_text:
		            # Replacements never introduce a curly quote, so counting the
		            # occurrences in the original text gives the number converted.
		            counter += sum(original_text.count(quote) for quote in quote_replacements)
		            text_elem.text = new_text

		    print(f'Converted {counter} curly quotes to straight quotes')

		    xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		    # Create temp file; the descriptor is closed right away because the file
		    # is only ever opened through zipfile.
		    tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
		    os.close(tmp_fd)

		    try:
		        # Write new docx to temp file
		        with zipfile.ZipFile(docx_input, 'r') as zin, \
		                zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
		            for item in zin.infolist():
		                if item.filename == document_part:
		                    data = xml_data
		                else:
		                    data = zin.read(item.filename)
		                # Passing the ZipInfo (not just the name) keeps each member's
		                # timestamp, compression type and attributes, and the
		                # rewritten document stays at its original archive position.
		                zout.writestr(item, data)

		        # Write to output file
		        shutil.move(tmp_path, docx_output)
		        # mkstemp creates the file readable/writable by the owner only, so
		        # set proper permissions (read/write for owner, read for group and others)
		        os.chmod(docx_output, 0o644)

		    finally:
		        # Delete temp file if it still exists (i.e. the move did not happen)
		        if os.path.exists(tmp_path):
		            os.remove(tmp_path)


		def update_format_styles_cli():
		parser = argparse.ArgumentParser(description="Update format styles in a DOCX file.")
		parser.add_argument("docx_input", help="Path to input DOCX file")
		@@ -2216,3 +2299,4 @@ def update_format_styles_cli():
		update_references_style(args.docx_input, args.docx_output)
		update_source_code_style(args.docx_input, args.docx_output)
		update_equation_style(args.docx_input, args.docx_output)
		correct_quotes_docx(args.docx_input, args.docx_output)
		No newline at end of file