Commit 501186aa authored by Marco Cavalli
Browse files

feat: enhance preprocessing and postprocessing with timing functionality and improved file handling

parent 2fdbd6c5
Loading
Loading
Loading
Loading
+86 −71
Original line number Diff line number Diff line
import os, re, html, json
from bs4 import BeautifulSoup, Tag, NavigableString
from ..time_book import get_timer

from src.utils import (
    apply_renaming_logic,
@@ -342,6 +343,7 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
            date_text["data-replace"] = "DATE"
            date_text.string = text
            close_bracket = NavigableString(")")
            date.clear()
            date.append(open_bracket)
            date.append(date_text)
            date.append(close_bracket)
@@ -624,21 +626,21 @@ def shorten_toc_text(soup: BeautifulSoup):
    """
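    Remove the informative/normative markers from the TOC only; every
    remaining `.norm` / `.inform` element has its text set to "Normative" / "Informative".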
    """
    tocTexts = soup.select("#TOC .norm")
    for tocText in tocTexts:
        tocText.decompose()
    # Find TOC element once to avoid repeated document searches
    toc = soup.select_one("#TOC")

    tocTexts = soup.select(".norm")
    for tocText in tocTexts:
        tocText.string = "Normative"
    if toc:
        # Remove norm and inform elements from TOC first (reduces DOM size for subsequent searches)
        for elem in toc.find_all(class_=["norm", "inform"]):
            elem.decompose()

    tocTexts = soup.select("#TOC .inform")
    for tocText in tocTexts:
        tocText.decompose()
    # Update remaining norm elements (now only outside TOC)
    for elem in soup.find_all(class_="norm"):
        elem.string = "Normative"

    tocTexts = soup.select(".inform")
    for tocText in tocTexts:
        tocText.string = "Informative"
    # Update remaining inform elements (now only outside TOC)
    for elem in soup.find_all(class_="inform"):
        elem.string = "Informative"

    return soup

@@ -917,68 +919,81 @@ def postprocess(html_dir: str):
    ### Arguments
    - `html_dir`: Directory containing the HTML files to be processed
    """
    filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(html_dir)
    with get_timer().section("Postprocessing: 1 - Get dirty filenames mapping"):
        filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(
            html_dir
        )
    images_mapping = {}
    html_files = []
    processed_soups = []

    # Read and rename all HTML files
    with get_timer().section("Postprocessing: 2 - Read and Rename HTML Files"):
        for filename in os.listdir(html_dir):
            if filename.endswith(".html"):
            with open(os.path.join(html_dir, filename), "r", encoding="utf-8") as file:
                html = file.read()
                with open(
                    os.path.join(html_dir, filename), "r", encoding="utf-8"
                ) as file:
                    html_content = file.read()

                if filename == "index.html":
                    new_filename = filename
                else:
                new_filename = apply_renaming_logic(html, filename, "html")
                    new_filename = apply_renaming_logic(html_content, filename, "html")

                os.rename(
                os.path.join(html_dir, filename), os.path.join(html_dir, new_filename)
                    os.path.join(html_dir, filename),
                    os.path.join(html_dir, new_filename),
                )
            file_path = os.path.join(html_dir, new_filename)

        with open(file_path, "r", encoding="utf-8") as html:
            soup = BeautifulSoup(html, "html.parser")
                html_files.append((new_filename, html_content))

    # First pass: process all HTML files while keeping soup in memory
    with get_timer().section("Postprocessing: 3 - First Pass - Process HTML Files"):
        for new_filename, html_content in html_files:
            with get_timer().section(
                f"Postprocessing: 3.a - Processing {new_filename}"
            ):
                soup = BeautifulSoup(html_content, "html.parser")
                soup = shorten_toc_text(soup)
                soup = remove_code_blocks_with_only_images(soup)
                soup = format_examples_and_notes(soup)
                soup = format_tables(soup)

                if new_filename == "front-page.html":
                    soup = format_front_page(soup)

                if (
                    new_filename.replace(".html", "") in files_with_references
                ):  # Reference-specific formatting
                    soup = format_references(soup)
                else:
                    soup = add_links_to_references_in_text(soup)

                soup = fix_toc_links(soup, filenames_mapping)
                soup = move_dangling_brackets_out_of_links(soup)
                soup = fix_ex_json_spacing(soup)
                soup = unwrap_gt_lt_code_tags(soup)

                soup = handle_ew_div(soup)

                soup = remove_links_from_labels(soup)
                soup = add_ids_to_labels(soup)
                soup = replace_dash_characters(soup)
                soup = move_figure_id_to_FL_elements(soup)
                soup = fix_custom_tags(soup)
                soup = fix_lists(soup)

                images, soup = extract_images_from_html(soup)
                for image_id, image_src in images.items():
            images_mapping[image_src] = {"id": image_id, "file": new_filename}

        contents = soup.decode_contents()
                    images_mapping[image_src] = {
                        "id": image_id,
                        "file": new_filename,
                    }

        with open(file_path, "w", encoding="utf-8") as html:
            html.write(contents)
                # Keep soup in memory instead of writing and re-reading
                processed_soups.append((new_filename, soup))

    for filename in os.listdir(html_dir):
        if filename.endswith(".html"):
    # Second pass: reuse soup objects already in memory
    with get_timer().section(
        "Postprocessing: 4 - Second Pass - Add Links and Write Files"
    ):
        for filename, soup in processed_soups:
            file_path = os.path.join(html_dir, filename)
            with open(file_path, "r", encoding="utf-8") as html:
                soup = BeautifulSoup(html, "html.parser")

            try:
                soup = add_custom_link_to_images(soup, images_mapping)
@@ -988,8 +1003,8 @@ def postprocess(html_dir: str):
                print(p_error(str(e)))
                os._exit(1)

            # Write the final file only once
            contents = soup.decode_contents()

            with open(file_path, "w", encoding="utf-8") as html:
                html.write(contents)

+88 −77
Original line number Diff line number Diff line
import os, re, os, json
import os, re, json
import sys
from typing_extensions import Literal
from ..time_book import get_timer

from src.constants import (
    NORMATIVE_REF_FILE,
@@ -28,8 +29,6 @@ files_with_references = [NORMATIVE_REF_FILE, INFORMATIVE_REF_FILE]


# region Helpers


def undo_prettier_formatting(text: str) -> str:
    """Undo any formatting changes made by Prettier to ensure the Markdown is in a more raw format for processing."""

@@ -649,10 +648,16 @@ def add_ids_to_references(file_contents: str, filename: str):
                return f'<span id="{new_ref}" />[{new_ref}]'

            file_contents = re.sub(
                REF_REGEX_I, replace_informative_ref, file_contents, flags=re.MULTILINE
                REF_REGEX_I,
                replace_informative_ref,
                file_contents,
                flags=re.MULTILINE,
            )
            file_contents = re.sub(
                REF_REGEX_N, replace_normative_ref, file_contents, flags=re.MULTILINE
                REF_REGEX_N,
                replace_normative_ref,
                file_contents,
                flags=re.MULTILINE,
            )

            with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file:
@@ -666,8 +671,6 @@ def add_ids_to_references(file_contents: str, filename: str):


# endregion


def preprocess(
    src: str, src_type: str, consolidated_md_path: str, file_order_json: str
):
@@ -699,27 +702,32 @@ def preprocess(
    annexes = DEFAULT_ANNEXES

    # create REFERENCE_MAPPING_MD_TO_HTML file locally if it doesn't exist
    with get_timer().section("Preprocessing: 1 - Creating reference mapping file"):
        if not os.path.exists(REFERENCE_MAPPING_MD_TO_HTML):
            with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file:
                json.dump({}, ref_file, indent=4)

    with get_timer().section("Preprocessing: 2 - Preprocessing Markdown files"):
        if file_order_json:
            with open(file_order_json, "r") as file:
                json_data = json.load(file)
                clauses = json_data.get("clauses")
                annexes = json_data.get("annexes")

    files, clauses_filenames, annexes_filenames = get_file_order(src, clauses, annexes)
        files, clauses_filenames, annexes_filenames = get_file_order(
            src, clauses, annexes
        )
        files = [f"{filename}.md" for filename in files]
        clauses_filenames = [f"{filename}.md" for filename in clauses_filenames]
        annexes_filenames = [f"{filename}.md" for filename in annexes_filenames]
        preprocessed_filenames = []
    with get_timer().section("Preprocessing: 3 - Processing individual files"):
        for filename in files:
            filename_without_extension = filename[:-3]  # Remove .md extension
            if filename.endswith(src_type) and filename != "consolidated.md":
                input_path = os.path.join(src, filename)
                try:
                text = open(input_path, "r", encoding="utf-8").read()
                    with open(input_path, "r", encoding="utf-8") as file:
                        text = file.read()

                    text = undo_prettier_formatting(text)
                    run_format_checks(filename, text.splitlines())
@@ -749,7 +757,8 @@ def preprocess(
                        r"([\w-]+?).md", r"--preprocessed--\1.md", filename
                    )  # Ensure file order is preserved by keeping the number in front
                    output_path = os.path.join(src, new_filename)
                open(output_path, "w", encoding="utf-8").write(text)
                    with open(output_path, "w", encoding="utf-8") as file:
                        file.write(text)
                    preprocessed_filenames.append(new_filename)
                except Exception as e:
                    # print(f"Error: {e}")
@@ -766,8 +775,10 @@ def preprocess(
                            if f.startswith("--preprocessed--"):
                                os.remove(os.path.join(src, f))
                        sys.exit(1)
                pass

    handle_consolidated_md("create", src, consolidated_md_path, preprocessed_filenames)
                    p_warning(f"Warning: Could not preprocess {input_path}: {e}")
    with get_timer().section("Preprocessing: 4 - Creating consolidated Markdown file"):
        handle_consolidated_md(
            "create", src, consolidated_md_path, preprocessed_filenames
        )

    return filename_numbers_mapping