Adding sanitize helpers (6402d667) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

generateBaseline/postprocessing.py

+72 −1

Original line number	Diff line number	Diff line
		@@ -19,6 +19,48 @@ from docx.shared import Cm

		#from file_helper import get_all_files_from_dir

		def _is_valid_xml_char(codepoint):
		    """Return True if *codepoint* matches the XML 1.0 ``Char`` production."""
		    return (
		        0x20 <= codepoint <= 0xD7FF
		        or codepoint in (0x09, 0x0A, 0x0D)
		        or 0xE000 <= codepoint <= 0xFFFD
		        or 0x10000 <= codepoint <= 0x10FFFF
		    )


		def sanitize_xml_text(text):
		    """
		    Remove characters that are not legal in an XML 1.0 document.

		    XML 1.0 allows only the following code points:
		    tab (0x09), line feed (0x0A), carriage return (0x0D),
		    0x20-0xD7FF, 0xE000-0xFFFD and 0x10000-0x10FFFF.
		    Everything else is dropped: the C0 control characters
		    (0x00-0x08, 0x0B-0x0C, 0x0E-0x1F), lone surrogates (0xD800-0xDFFF)
		    and the non-characters 0xFFFE and 0xFFFF.

		    Parameters
		    ----------
		    text : str
		        Text that may contain invalid XML characters

		    Returns
		    -------
		    str
		        Text with invalid characters removed, or None if ``text`` is None
		    """
		    if text is None:
		        return None

		    # Filtering on the complete Char production (not just ord >= 32) also
		    # drops surrogates and U+FFFE/U+FFFF, which are equally invalid in XML 1.0.
		    return ''.join(char for char in text if _is_valid_xml_char(ord(char)))

		def sanitize_document_xml(root, ns):
		    """
		    Strip invalid control characters from every ``w:t`` element under ``root``.

		    The tree is modified in place and nothing is returned. Only elements
		    matched by the ``.//w:t`` XPath are visited; elements whose text is
		    empty or None are left untouched.

		    Parameters
		    ----------
		    root : lxml.etree.Element
		        Root element of the document XML
		    ns : dict
		        Namespace dictionary used to resolve the ``w`` prefix
		    """
		    for run_text in root.xpath('.//w:t', namespaces=ns):
		        content = run_text.text
		        if not content:
		            # Nothing to clean for empty or missing text
		            continue
		        run_text.text = sanitize_xml_text(content)


		def apply_standard_style_to_unformatted_paragraphs(config):
		docx_path = config.get("output_docx")
		@@ -2368,3 +2410,32 @@ def update_format_styles_cli():
		update_source_code_style(args.docx_input, args.docx_output)
		update_equation_style(args.docx_input, args.docx_output)
		correct_quotes_docx(args.docx_input, args.docx_output)

		# Final sanitization: remove invalid XML characters
		ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
		with zipfile.ZipFile(args.docx_output, 'r') as zin:
		xml_data = zin.read("word/document.xml")

		root = etree.fromstring(xml_data)
		sanitize_document_xml(root, ns)

		xml_data = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone="yes")

		# Write sanitized document back
		tmp_fd, tmp_path = tempfile.mkstemp(suffix=".docx")
		os.close(tmp_fd)

		try:
		with zipfile.ZipFile(args.docx_output, 'r') as zin, zipfile.ZipFile(tmp_path, 'w', zipfile.ZIP_DEFLATED) as zout:
		for item in zin.infolist():
		if item.filename != "word/document.xml":
		data = zin.read(item.filename)
		zout.writestr(item.filename, data)
		zout.writestr("word/document.xml", xml_data)

		shutil.move(tmp_path, args.docx_output)
		os.chmod(args.docx_output, 0o644)
		print('Sanitized document: removed invalid XML characters')
		finally:
		if os.path.exists(tmp_path):
		os.remove(tmp_path)
		No newline at end of file