Commit 12fd2c30 authored by Miguel Angel Reina Ortega
Browse files

Adding config file for postprocessing steps execution

parent 06cbba3a
Loading
Loading
Loading
Loading
Loading
+121 −42
Original line number Diff line number Diff line
@@ -19,6 +19,67 @@ from docx.shared import Cm

#from file_helper import get_all_files_from_dir

# Default on/off switch for each format-style post-processing step.
# Keys are the step names; every step is enabled unless a config overrides it.
# The tuple order is preserved as the dict's key order.
DEFAULT_FORMAT_STYLE_UPDATES = dict.fromkeys(
    (
        "update_figure_captions",
        "update_heading_styles",
        "update_figure_style",
        "update_table_captions",
        "update_abbreviations",
        "update_table_rows",
        "update_notes",
        "update_references",
        "update_lists",
        "update_body_text_style",
        "add_no_break_hyphens",
        "update_references_style",
        "update_source_code_style",
        "add_break_after_code_blocks_and_tables",
        "update_equation_style",
        "correct_quotes_docx",
        "sanitize_document_xml",
    ),
    True,
)


def load_format_style_updates_config(config_path=None):
    """
    Load optional JSON config for update_format_styles step toggles.

    Expected JSON format:
    {
      "updates": {
        "update_figure_captions": true,
        "update_heading_styles": false
      }
    }

    The root object can also directly contain step names.
    Unknown keys are reported on stdout and otherwise ignored.

    Args:
        config_path: Path to the JSON config file. When falsy (None or an
            empty string) no file is read and the defaults are returned.

    Returns:
        A new dict with the same keys as DEFAULT_FORMAT_STYLE_UPDATES, where
        each value is the default overridden by the config file, if any.

    Raises:
        ValueError: If the JSON root is not an object, if "updates" is not an
            object, or if a known toggle has a non-boolean value.
        OSError: If the config file cannot be opened (propagated from open()).
        json.JSONDecodeError: If the file is not valid JSON (propagated).
    """
    # Work on a copy so the module-level defaults are never mutated.
    config = DEFAULT_FORMAT_STYLE_UPDATES.copy()
    if not config_path:
        return config

    with open(config_path, "r", encoding="utf-8") as config_file:
        raw_config = json.load(config_file)

    # json.load may return a list, string, number, bool or None for valid
    # JSON documents; check the root type before calling .get() so callers
    # receive the documented ValueError rather than an AttributeError.
    if not isinstance(raw_config, dict):
        raise ValueError("Style config must be a JSON object or contain an 'updates' object.")

    # Toggles live under "updates" or, failing that, directly in the root.
    update_toggles = raw_config.get("updates", raw_config)
    if not isinstance(update_toggles, dict):
        raise ValueError("Style config must be a JSON object or contain an 'updates' object.")

    unknown_keys = []
    for key, value in update_toggles.items():
        if key not in config:
            unknown_keys.append(key)
            continue
        # Strict bool check: values such as 0/1 or "true" are rejected.
        if not isinstance(value, bool):
            raise ValueError(f"Config value for '{key}' must be a boolean.")
        config[key] = value

    if unknown_keys:
        print(f"Ignoring unknown update toggle(s): {', '.join(sorted(unknown_keys))}")

    return config

def sanitize_xml_text(text):
    """
    Remove invalid XML control characters from text.
@@ -2711,26 +2772,42 @@ def update_format_styles_cli():
    parser = argparse.ArgumentParser(description="Update format styles in a DOCX file.")
    parser.add_argument("docx_input", help="Path to input DOCX file")
    parser.add_argument("docx_output", help="Path to output DOCX file")
    parser.add_argument(
        "--style-config",
        help="Path to JSON config file with format-style update toggles",
        default=None
    )
    args = parser.parse_args()

    update_figure_captions(args.docx_input, args.docx_output)
    update_heading_styles(args.docx_input, args.docx_output)
    update_figure_style(args.docx_input, args.docx_output)
    update_table_captions(args.docx_input, args.docx_output)
    update_abbreviations(args.docx_input, args.docx_output)
    update_table_rows(args.docx_input, args.docx_output)
    update_notes(args.docx_input, args.docx_output)
    update_references(args.docx_input, args.docx_output)
    update_lists(args.docx_input, args.docx_output)
    update_body_text_style(args.docx_input, args.docx_output)
    add_no_break_hyphens(args.docx_input, args.docx_output)
    update_references_style(args.docx_input, args.docx_output)
    update_source_code_style(args.docx_input, args.docx_output)
    add_break_after_code_blocks_and_tables(args.docx_input, args.docx_output)
    update_equation_style(args.docx_input, args.docx_output)
    correct_quotes_docx(args.docx_input, args.docx_output)
    style_updates_config = load_format_style_updates_config(args.style_config)

    format_style_steps = [
        ("update_figure_captions", update_figure_captions),
        ("update_heading_styles", update_heading_styles),
        ("update_figure_style", update_figure_style),
        ("update_table_captions", update_table_captions),
        ("update_abbreviations", update_abbreviations),
        ("update_table_rows", update_table_rows),
        ("update_notes", update_notes),
        ("update_references", update_references),
        ("update_lists", update_lists),
        ("update_body_text_style", update_body_text_style),
        ("add_no_break_hyphens", add_no_break_hyphens),
        ("update_references_style", update_references_style),
        ("update_source_code_style", update_source_code_style),
        ("add_break_after_code_blocks_and_tables", add_break_after_code_blocks_and_tables),
        ("update_equation_style", update_equation_style),
        ("correct_quotes_docx", correct_quotes_docx)
    ]

    for step_name, step_func in format_style_steps:
        if style_updates_config[step_name]:
            step_func(args.docx_input, args.docx_output)
        else:
            print(f'Skipped "{step_name}" (disabled in style config)')
    
    # Final sanitization: remove invalid XML characters
    if style_updates_config["sanitize_document_xml"]:
        ns = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
        with zipfile.ZipFile(args.docx_output, 'r') as zin:
            xml_data = zin.read("word/document.xml")
@@ -2758,3 +2835,5 @@ def update_format_styles_cli():
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
    else:
        print('Skipped "sanitize_document_xml" (disabled in style config)')
 No newline at end of file