# Diff-view hunk of generateBaseline/postprocessing.py (+121 -42), @@ -19,6 +19,67 @@
# Preceding context lines shown in the hunk:
#   from docx.shared import Cm
#   #from file_helper import get_all_files_from_dir

# Every toggleable step of the format-style update, enabled by default.
# Keys are the step names accepted in the optional JSON style config.
DEFAULT_FORMAT_STYLE_UPDATES = {
    "update_figure_captions": True,
    "update_heading_styles": True,
    "update_figure_style": True,
    "update_table_captions": True,
    "update_abbreviations": True,
    "update_table_rows": True,
    "update_notes": True,
    "update_references": True,
    "update_lists": True,
    "update_body_text_style": True,
    "add_no_break_hyphens": True,
    "update_references_style": True,
    "update_source_code_style": True,
    "add_break_after_code_blocks_and_tables": True,
    "update_equation_style": True,
    "correct_quotes_docx": True,
    "sanitize_document_xml": True
}


def load_format_style_updates_config(config_path=None):
    """
    Load optional JSON config for update_format_styles step toggles.

    Expected JSON format:
    {
        "updates": {
            "update_figure_captions": true,
            "update_heading_styles": false
        }
    }

    The root object can also directly contain step names.
    Unknown keys are ignored (their names are printed once).

    Args:
        config_path: Path to the JSON config file. When falsy (None or ""),
            no file is read and a copy of the defaults is returned.

    Returns:
        A new dict mapping every step name in DEFAULT_FORMAT_STYLE_UPDATES
        to a bool, with the values from the config file applied on top.

    Raises:
        ValueError: If the JSON root (or its "updates" value) is not an
            object, or if a known step is given a non-boolean value.
        OSError / json.JSONDecodeError: Propagated from opening or parsing
            the file.
    """
    config = DEFAULT_FORMAT_STYLE_UPDATES.copy()
    if not config_path:
        return config

    with open(config_path, "r", encoding="utf-8") as config_file:
        raw_config = json.load(config_file)

    shape_error = "Style config must be a JSON object or contain an 'updates' object."

    # Valid JSON may have a list/string/number at the root; check before
    # calling .get() so the caller gets the documented ValueError rather than
    # an AttributeError.
    if not isinstance(raw_config, dict):
        raise ValueError(shape_error)

    update_toggles = raw_config.get("updates", raw_config)
    if not isinstance(update_toggles, dict):
        raise ValueError(shape_error)

    unknown_keys = []
    for key, value in update_toggles.items():
        if key not in config:
            unknown_keys.append(key)
            continue
        # Strict bool check: 0/1 or "true" are rejected instead of being
        # silently interpreted by truthiness.
        if not isinstance(value, bool):
            raise ValueError(f"Config value for '{key}' must be a boolean.")
        config[key] = value

    if unknown_keys:
        print(f"Ignoring unknown update toggle(s): {', '.join(sorted(unknown_keys))}")

    return config


# Following context in this hunk (truncated in the diff view):
#   def sanitize_xml_text(text):
#       """ Remove invalid XML control characters from text. ...
Loading Loading @@ -2711,26 +2772,42 @@ def update_format_styles_cli(): parser = argparse.ArgumentParser(description="Update format styles in a DOCX file.") parser.add_argument("docx_input", help="Path to input DOCX file") parser.add_argument("docx_output", help="Path to output DOCX file") parser.add_argument( "--style-config", help="Path to JSON config file with format-style update toggles", default=None ) args = parser.parse_args() update_figure_captions(args.docx_input, args.docx_output) update_heading_styles(args.docx_input, args.docx_output) update_figure_style(args.docx_input, args.docx_output) update_table_captions(args.docx_input, args.docx_output) update_abbreviations(args.docx_input, args.docx_output) update_table_rows(args.docx_input, args.docx_output) update_notes(args.docx_input, args.docx_output) update_references(args.docx_input, args.docx_output) update_lists(args.docx_input, args.docx_output) update_body_text_style(args.docx_input, args.docx_output) add_no_break_hyphens(args.docx_input, args.docx_output) update_references_style(args.docx_input, args.docx_output) update_source_code_style(args.docx_input, args.docx_output) add_break_after_code_blocks_and_tables(args.docx_input, args.docx_output) update_equation_style(args.docx_input, args.docx_output) correct_quotes_docx(args.docx_input, args.docx_output) style_updates_config = load_format_style_updates_config(args.style_config) format_style_steps = [ ("update_figure_captions", update_figure_captions), ("update_heading_styles", update_heading_styles), ("update_figure_style", update_figure_style), ("update_table_captions", update_table_captions), ("update_abbreviations", update_abbreviations), ("update_table_rows", update_table_rows), ("update_notes", update_notes), ("update_references", update_references), ("update_lists", update_lists), ("update_body_text_style", update_body_text_style), ("add_no_break_hyphens", add_no_break_hyphens), ("update_references_style", update_references_style), 
("update_source_code_style", update_source_code_style), ("add_break_after_code_blocks_and_tables", add_break_after_code_blocks_and_tables), ("update_equation_style", update_equation_style), ("correct_quotes_docx", correct_quotes_docx) ] for step_name, step_func in format_style_steps: if style_updates_config[step_name]: step_func(args.docx_input, args.docx_output) else: print(f'Skipped "{step_name}" (disabled in style config)') # Final sanitization: remove invalid XML characters if style_updates_config["sanitize_document_xml"]: ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"} with zipfile.ZipFile(args.docx_output, 'r') as zin: xml_data = zin.read("word/document.xml") Loading Loading @@ -2758,3 +2835,5 @@ def update_format_styles_cli(): finally: if os.path.exists(tmp_path): os.remove(tmp_path) else: print('Skipped "sanitize_document_xml" (disabled in style config)') No newline at end of file Loading
# Diff-view hunk of generateBaseline/postprocessing.py (+121 -42), @@ -19,6 +19,67 @@
# Preceding context lines shown in the hunk:
#   from docx.shared import Cm
#   #from file_helper import get_all_files_from_dir

# Every toggleable step of the format-style update, enabled by default.
# Keys are the step names accepted in the optional JSON style config.
DEFAULT_FORMAT_STYLE_UPDATES = {
    "update_figure_captions": True,
    "update_heading_styles": True,
    "update_figure_style": True,
    "update_table_captions": True,
    "update_abbreviations": True,
    "update_table_rows": True,
    "update_notes": True,
    "update_references": True,
    "update_lists": True,
    "update_body_text_style": True,
    "add_no_break_hyphens": True,
    "update_references_style": True,
    "update_source_code_style": True,
    "add_break_after_code_blocks_and_tables": True,
    "update_equation_style": True,
    "correct_quotes_docx": True,
    "sanitize_document_xml": True
}


def load_format_style_updates_config(config_path=None):
    """
    Load optional JSON config for update_format_styles step toggles.

    Expected JSON format:
    {
        "updates": {
            "update_figure_captions": true,
            "update_heading_styles": false
        }
    }

    The root object can also directly contain step names.
    Unknown keys are ignored (their names are printed once).

    Args:
        config_path: Path to the JSON config file. When falsy (None or ""),
            no file is read and a copy of the defaults is returned.

    Returns:
        A new dict mapping every step name in DEFAULT_FORMAT_STYLE_UPDATES
        to a bool, with the values from the config file applied on top.

    Raises:
        ValueError: If the JSON root (or its "updates" value) is not an
            object, or if a known step is given a non-boolean value.
        OSError / json.JSONDecodeError: Propagated from opening or parsing
            the file.
    """
    config = DEFAULT_FORMAT_STYLE_UPDATES.copy()
    if not config_path:
        return config

    with open(config_path, "r", encoding="utf-8") as config_file:
        raw_config = json.load(config_file)

    shape_error = "Style config must be a JSON object or contain an 'updates' object."

    # Valid JSON may have a list/string/number at the root; check before
    # calling .get() so the caller gets the documented ValueError rather than
    # an AttributeError.
    if not isinstance(raw_config, dict):
        raise ValueError(shape_error)

    update_toggles = raw_config.get("updates", raw_config)
    if not isinstance(update_toggles, dict):
        raise ValueError(shape_error)

    unknown_keys = []
    for key, value in update_toggles.items():
        if key not in config:
            unknown_keys.append(key)
            continue
        # Strict bool check: 0/1 or "true" are rejected instead of being
        # silently interpreted by truthiness.
        if not isinstance(value, bool):
            raise ValueError(f"Config value for '{key}' must be a boolean.")
        config[key] = value

    if unknown_keys:
        print(f"Ignoring unknown update toggle(s): {', '.join(sorted(unknown_keys))}")

    return config


# Following context in this hunk (truncated in the diff view):
#   def sanitize_xml_text(text):
#       """ Remove invalid XML control characters from text. ...
Loading Loading @@ -2711,26 +2772,42 @@ def update_format_styles_cli(): parser = argparse.ArgumentParser(description="Update format styles in a DOCX file.") parser.add_argument("docx_input", help="Path to input DOCX file") parser.add_argument("docx_output", help="Path to output DOCX file") parser.add_argument( "--style-config", help="Path to JSON config file with format-style update toggles", default=None ) args = parser.parse_args() update_figure_captions(args.docx_input, args.docx_output) update_heading_styles(args.docx_input, args.docx_output) update_figure_style(args.docx_input, args.docx_output) update_table_captions(args.docx_input, args.docx_output) update_abbreviations(args.docx_input, args.docx_output) update_table_rows(args.docx_input, args.docx_output) update_notes(args.docx_input, args.docx_output) update_references(args.docx_input, args.docx_output) update_lists(args.docx_input, args.docx_output) update_body_text_style(args.docx_input, args.docx_output) add_no_break_hyphens(args.docx_input, args.docx_output) update_references_style(args.docx_input, args.docx_output) update_source_code_style(args.docx_input, args.docx_output) add_break_after_code_blocks_and_tables(args.docx_input, args.docx_output) update_equation_style(args.docx_input, args.docx_output) correct_quotes_docx(args.docx_input, args.docx_output) style_updates_config = load_format_style_updates_config(args.style_config) format_style_steps = [ ("update_figure_captions", update_figure_captions), ("update_heading_styles", update_heading_styles), ("update_figure_style", update_figure_style), ("update_table_captions", update_table_captions), ("update_abbreviations", update_abbreviations), ("update_table_rows", update_table_rows), ("update_notes", update_notes), ("update_references", update_references), ("update_lists", update_lists), ("update_body_text_style", update_body_text_style), ("add_no_break_hyphens", add_no_break_hyphens), ("update_references_style", update_references_style), 
("update_source_code_style", update_source_code_style), ("add_break_after_code_blocks_and_tables", add_break_after_code_blocks_and_tables), ("update_equation_style", update_equation_style), ("correct_quotes_docx", correct_quotes_docx) ] for step_name, step_func in format_style_steps: if style_updates_config[step_name]: step_func(args.docx_input, args.docx_output) else: print(f'Skipped "{step_name}" (disabled in style config)') # Final sanitization: remove invalid XML characters if style_updates_config["sanitize_document_xml"]: ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"} with zipfile.ZipFile(args.docx_output, 'r') as zin: xml_data = zin.read("word/document.xml") Loading Loading @@ -2758,3 +2835,5 @@ def update_format_styles_cli(): finally: if os.path.exists(tmp_path): os.remove(tmp_path) else: print('Skipped "sanitize_document_xml" (disabled in style config)') No newline at end of file