Adding config file for postprocessing steps execution (12fd2c30) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

generateBaseline/postprocessing.py

+121 −42

Original line number	Diff line number	Diff line
		@@ -19,6 +19,67 @@ from docx.shared import Cm

		#from file_helper import get_all_files_from_dir

		# Default state of every format-style step toggle: all steps start enabled.
		# The tuple order below is the key order of the resulting dict.
		DEFAULT_FORMAT_STYLE_UPDATES = dict.fromkeys(
		    (
		        "update_figure_captions",
		        "update_heading_styles",
		        "update_figure_style",
		        "update_table_captions",
		        "update_abbreviations",
		        "update_table_rows",
		        "update_notes",
		        "update_references",
		        "update_lists",
		        "update_body_text_style",
		        "add_no_break_hyphens",
		        "update_references_style",
		        "update_source_code_style",
		        "add_break_after_code_blocks_and_tables",
		        "update_equation_style",
		        "correct_quotes_docx",
		        "sanitize_document_xml",
		    ),
		    True,
		)


		def load_format_style_updates_config(config_path=None):
		    """
		    Load optional JSON config for update_format_styles step toggles.

		    Expected JSON format:
		    {
		        "updates": {
		            "update_figure_captions": true,
		            "update_heading_styles": false
		        }
		    }

		    The root object can also directly contain step names.
		    Unknown keys are reported on stdout and otherwise ignored.

		    Args:
		        config_path: Path to the JSON config file. When falsy (None or an
		            empty string) no file is read and the defaults are returned.

		    Returns:
		        A new dict with the same keys as DEFAULT_FORMAT_STYLE_UPDATES, where
		        each toggle found in the config file replaces the default value.

		    Raises:
		        ValueError: If the JSON root is not an object, if "updates" is
		            present but is not an object, or if a known step name is mapped
		            to a non-boolean value.

		    Errors raised by open() or json.load() are not caught here and propagate
		    to the caller.
		    """
		    config = DEFAULT_FORMAT_STYLE_UPDATES.copy()
		    if not config_path:
		        return config

		    with open(config_path, "r", encoding="utf-8") as config_file:
		        raw_config = json.load(config_file)

		    # Check the root type before calling .get(): a JSON array, string, number
		    # or null root has no .get() and would otherwise surface as an
		    # AttributeError instead of the ValueError below.
		    update_toggles = None
		    if isinstance(raw_config, dict):
		        update_toggles = raw_config.get("updates", raw_config)
		    if not isinstance(update_toggles, dict):
		        raise ValueError("Style config must be a JSON object or contain an 'updates' object.")

		    unknown_keys = []
		    for key, value in update_toggles.items():
		        if key not in config:
		            # Collected and reported once after the loop rather than failing.
		            unknown_keys.append(key)
		            continue
		        if not isinstance(value, bool):
		            raise ValueError(f"Config value for '{key}' must be a boolean.")
		        config[key] = value

		    if unknown_keys:
		        print(f"Ignoring unknown update toggle(s): {', '.join(sorted(unknown_keys))}")

		    return config

		def sanitize_xml_text(text):
		"""
		Remove invalid XML control characters from text.
		@@ -2711,26 +2772,42 @@ def update_format_styles_cli():
		parser = argparse.ArgumentParser(description="Update format styles in a DOCX file.")
		parser.add_argument("docx_input", help="Path to input DOCX file")
		parser.add_argument("docx_output", help="Path to output DOCX file")
		parser.add_argument(
		"--style-config",
		help="Path to JSON config file with format-style update toggles",
		default=None
		)
		args = parser.parse_args()

		update_figure_captions(args.docx_input, args.docx_output)
		update_heading_styles(args.docx_input, args.docx_output)
		update_figure_style(args.docx_input, args.docx_output)
		update_table_captions(args.docx_input, args.docx_output)
		update_abbreviations(args.docx_input, args.docx_output)
		update_table_rows(args.docx_input, args.docx_output)
		update_notes(args.docx_input, args.docx_output)
		update_references(args.docx_input, args.docx_output)
		update_lists(args.docx_input, args.docx_output)
		update_body_text_style(args.docx_input, args.docx_output)
		add_no_break_hyphens(args.docx_input, args.docx_output)
		update_references_style(args.docx_input, args.docx_output)
		update_source_code_style(args.docx_input, args.docx_output)
		add_break_after_code_blocks_and_tables(args.docx_input, args.docx_output)
		update_equation_style(args.docx_input, args.docx_output)
		correct_quotes_docx(args.docx_input, args.docx_output)
		style_updates_config = load_format_style_updates_config(args.style_config)

		format_style_steps = [
		("update_figure_captions", update_figure_captions),
		("update_heading_styles", update_heading_styles),
		("update_figure_style", update_figure_style),
		("update_table_captions", update_table_captions),
		("update_abbreviations", update_abbreviations),
		("update_table_rows", update_table_rows),
		("update_notes", update_notes),
		("update_references", update_references),
		("update_lists", update_lists),
		("update_body_text_style", update_body_text_style),
		("add_no_break_hyphens", add_no_break_hyphens),
		("update_references_style", update_references_style),
		("update_source_code_style", update_source_code_style),
		("add_break_after_code_blocks_and_tables", add_break_after_code_blocks_and_tables),
		("update_equation_style", update_equation_style),
		("correct_quotes_docx", correct_quotes_docx)
		]

		for step_name, step_func in format_style_steps:
		if style_updates_config[step_name]:
		step_func(args.docx_input, args.docx_output)
		else:
		print(f'Skipped "{step_name}" (disabled in style config)')

		# Final sanitization: remove invalid XML characters
		if style_updates_config["sanitize_document_xml"]:
		ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
		with zipfile.ZipFile(args.docx_output, 'r') as zin:
		xml_data = zin.read("word/document.xml")
		@@ -2758,3 +2835,5 @@ def update_format_styles_cli():
		finally:
		if os.path.exists(tmp_path):
		os.remove(tmp_path)
		else:
		print('Skipped "sanitize_document_xml" (disabled in style config)')
		No newline at end of file