Commit 2904106d authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

Adding debug for release notes diff + renaming html files

parent 620f1aaa
Loading
Loading
Loading
Loading
Loading
+22 −4
Original line number Diff line number Diff line
@@ -55,11 +55,29 @@ function getIdFromURL() {
      }
    }
  } else {
    // no anchor, deduct from filename
    const filename = window.location.pathname.split("/").pop().split(".")[0];
    const topHeadingId = filename.split("-")[1];
    // no anchor — match the current page filename against TOC hrefs
    const filename = window.location.pathname.split("/").pop();
    if (filename) {
      const tocLink = Array.from(document.querySelectorAll("nav#TOC a")).find(
        (a) => {
          const href = a.getAttribute("href") || "";
          const hrefFile = href.split("#")[0].replace(/^\.\//, "");
          return hrefFile === filename;
        }
      );
      if (tocLink && tocLink.id) {
        tocId = tocLink.id;
      }
    }
    if (!tocId && filename) {
      // fallback for pandoc-style {number}-{slug}.html names
      const basename = filename.split(".")[0];
      const parts = basename.split("-");
      const topHeadingId =
        parts.length > 1 ? parts.slice(1).join("-") : basename;
      tocId = `toc-${topHeadingId}`;
    }
  }
  return tocId;
}

+125 −6
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@ from src.utils import (
    get_output_doc_path,
    p_error,
    p_label,
    p_warning,
    validate_src_directory,
    validate_type,
    validate_conversion,
@@ -35,7 +36,10 @@ from src.to_md.preprocessing import preprocess as preprocess_md
from src.to_md.postprocessing import postprocess as postprocess_md

from src.to_html.preprocessing import preprocess as preprocess_html
from src.to_html.postprocessing import postprocess as postprocess_html
from src.to_html.postprocessing import (
    postprocess as postprocess_html,
    rename_html_files_by_heading,
)
from src.to_html.postprocessing import clone_toc_in_file as clone_toc_in_file_html

from src.to_docx.preprocessing import preprocess as preprocess_docx
@@ -383,6 +387,9 @@ def convert(conversion_args=None):
                shutil.copy2(css_file, os.path.join(FILEGEN_DIR, FOLDER))
            shutil.copy("advancedTOCLogic.js", DEST)

            with get_timer().section("Rename HTML files to heading-based names"):
                rename_html_files_by_heading(DEST)

            # Cleanup the consolidated Markdown
            handle_consolidated_md("delete", SRC, CONSOLIDATED_MD_PATH)

@@ -492,6 +499,56 @@ def convert(conversion_args=None):
        convert_md_to_html()


def _editor_text_length(html_path: str) -> int | None:
    """Return #editor text length, -1 if no editor div, None if file missing."""
    if not os.path.isfile(html_path):
        return None
    try:
        with open(html_path, encoding="utf-8") as html_file:
            soup = BeautifulSoup(html_file.read(), "html.parser")
    except OSError:
        return None
    editor = soup.find("div", id="editor")
    if editor is None:
        return -1
    return len(editor.get_text(strip=True))


def _log_diff_html_inventory(diff_source_dir: str, dest_dir: str) -> None:
    """Log base vs new HTML filenames to spot pairing mismatches."""
    if not os.path.isdir(diff_source_dir):
        print(
            p_warning(
                f"[diff-debug] Base HTML directory does not exist: {diff_source_dir}"
            )
        )
        return
    base_files = {
        name
        for name in os.listdir(diff_source_dir)
        if name.endswith(".html")
    }
    new_files = {name for name in os.listdir(dest_dir) if name.endswith(".html")}
    only_in_new = sorted(new_files - base_files)
    only_in_base = sorted(base_files - new_files)
    print(
        f"[diff-debug] HTML inventory: base={len(base_files)}, new={len(new_files)}, "
        f"paired={len(base_files & new_files)}"
    )
    if only_in_new:
        print(
            p_warning(
                f"[diff-debug] New HTML without base counterpart ({len(only_in_new)}): "
                f"{', '.join(only_in_new)}"
            )
        )
    if only_in_base:
        print(
            f"[diff-debug] Base HTML without new counterpart ({len(only_in_base)}): "
            f"{', '.join(only_in_base)}"
        )


def get_comparing_folder_from_git():
    # Check if the DIFF_PATH is a git repo
    if not git_helper.is_git_repo(SRC):
@@ -557,9 +614,27 @@ def get_comparing_folder_from_git():
            sys.exit(1)
    else:
        branch_target = None
    cached_base_commit, cached_rename_scheme = load_target_commit(
        os.path.join(FILEGEN_DIR, f"{FOLDER}-base")
    )
    print(
        f"[diff-debug] Diff git target: requested={p_label(GIT_CHECKOUT_NAME)}, "
        f"resolved={p_label(target_to_checkout)}"
        f"{f', branch={p_label(branch_target)}' if branch_target else ''}"
    )
    print(
        f"[diff-debug] Current repo state: branch={p_label(current_branch)}, "
        f"commit={p_label(current_commit_hash)}"
    )
    print(
        f"[diff-debug] Cached base commit: {p_label(cached_base_commit or '(none)')}, "
        f"rename scheme: {p_label(cached_rename_scheme or '(none)')}, "
        f"matches resolved={cached_base_commit == target_to_checkout}, "
        f"matches scheme={cached_rename_scheme == HTML_RENAME_SCHEME}"
    )
    if (
        load_target_commit(os.path.join(FILEGEN_DIR, f"{FOLDER}-base"))
        == target_to_checkout
        cached_base_commit == target_to_checkout
        and cached_rename_scheme == HTML_RENAME_SCHEME
    ):
        print(
            f"The base HTML files for commit {p_label(target_to_checkout)} {f'(branch: {p_label(branch_target)}) ' if branch_target else ''}have already been generated. Using existing files for diff..."
@@ -677,17 +752,25 @@ def restore_original_state(original_branch, changes_stashed):
        apply_stash(changes_stashed)


# Identifier of the HTML file-naming scheme used when the base files were
# generated; stored next to the commit so a scheme change invalidates the cache.
HTML_RENAME_SCHEME = "heading-title-v2"


def save_target_commit(target_commit, path):
    """Record which commit and rename scheme the base HTML in ``path`` came from.

    Writes ``<path>/.gittargetcommit`` with exactly two lines: the commit
    identifier, then ``HTML_RENAME_SCHEME``. Any previous content is replaced.

    Args:
        target_commit: Commit identifier the base files were generated from.
        path: Existing directory that receives the marker file.
    """
    marker_path = os.path.join(path, ".gittargetcommit")
    with open(marker_path, "w", encoding="utf-8") as f:
        # Single write: the commit must appear once, on its own first line,
        # otherwise the cached value never matches the resolved target again.
        f.write(f"{target_commit}\n{HTML_RENAME_SCHEME}\n")


def load_target_commit(path):
    """Read the cached base commit and rename scheme stored in ``path``.

    Args:
        path: Directory expected to contain a ``.gittargetcommit`` marker.

    Returns:
        A ``(commit, scheme)`` tuple. Both items are ``None`` when the marker
        file is missing or empty. ``scheme`` alone is ``None`` when the file
        holds only a commit line (a marker without a second line).
    """
    target_commit_path = os.path.join(path, ".gittargetcommit")
    if not os.path.exists(target_commit_path):
        return None, None
    with open(target_commit_path, "r", encoding="utf-8") as f:
        lines = f.read().strip().splitlines()
    if not lines:
        return None, None
    commit = lines[0]
    scheme = lines[1] if len(lines) > 1 else None
    return commit, scheme


### Run script
@@ -727,18 +810,54 @@ if SRC_TYPE == "md" and DEST_TYPE == "html":
            else:
                diff_source_dir = os.path.join(FILEGEN_DIR, f"{FOLDER}-base", "html")
            print(f"Applying diff from source directory: {diff_source_dir}...")
            _log_diff_html_inventory(diff_source_dir, DEST)
            # Iterate through new HTML files and create diffs
            filename_count_mapping = {}
            missing_base_files: list[str] = []
            empty_base_editors: list[str] = []
            for filename in os.listdir(DEST):
                if filename.endswith(".html"):
                    new_file_path = os.path.join(DEST, filename)
                    comparing_file_path = os.path.join(diff_source_dir, filename)
                    old_editor_len = _editor_text_length(comparing_file_path)
                    new_editor_len = _editor_text_length(new_file_path)
                    if old_editor_len is None:
                        missing_base_files.append(filename)
                        print(
                            p_warning(
                                f"[diff-debug] No base HTML for '{filename}' "
                                f"(expected: {comparing_file_path}) — page will diff as all-new"
                            )
                        )
                    elif old_editor_len == 0:
                        empty_base_editors.append(filename)
                        print(
                            p_warning(
                                f"[diff-debug] Base HTML for '{filename}' has empty #editor "
                                f"(new #editor={new_editor_len} chars)"
                            )
                        )
                    # Create diff and overwrite the new file
                    count = make_trackchanges_diff(
                        comparing_file_path, new_file_path, new_file_path
                    )
                    print(
                        f"[diff-debug] {filename}: old_editor="
                        f"{old_editor_len if old_editor_len is not None else 'missing'} chars, "
                        f"new_editor={new_editor_len} chars, changes={count or 0}"
                    )
                    if count and count > 0:
                        filename_count_mapping[filename] = count
            print(
                f"[diff-debug] Diff summary: pages_with_changes={len(filename_count_mapping)}, "
                f"missing_base={len(missing_base_files)}, empty_base_editor={len(empty_base_editors)}"
            )
            if missing_base_files:
                print(
                    p_warning(
                        f"[diff-debug] Pages missing base HTML: {', '.join(missing_base_files)}"
                    )
                )
            shutil.copy("diffVisualizer.js", DEST)
            toc_soup = None
            if os.path.exists(toc_path):
+11 −5
Original line number Diff line number Diff line
@@ -764,14 +764,20 @@ def get_element_repr(elem):
def make_trackchanges_diff(
    old_path: str, new_path: str, out_path: str = "diff.html"
) -> int | None:
    old_path_obj = Path(old_path)
    new_path_obj = Path(new_path)
    try:
        old_html = Path(old_path).read_text(encoding="utf-8")
    except Exception as e:
        old_html = old_path_obj.read_text(encoding="utf-8")
    except FileNotFoundError:
        old_html = "<html><body><div id='editor'></div></body></html>"
    except OSError as exc:
        print(f"[diff-debug] Failed to read old HTML '{old_path}': {exc}")
        old_html = "<html><body><div id='editor'></div></body></html>"

    try:
        new_html = Path(new_path).read_text(encoding="utf-8")
    except Exception as e:
        new_html = new_path_obj.read_text(encoding="utf-8")
    except OSError as exc:
        print(f"[diff-debug] Failed to read new HTML '{new_path}': {exc}")
        new_html = "<html><body><div id='editor'></div></body></html>"

    # Parse with BeautifulSoup
+77 −26
Original line number Diff line number Diff line
@@ -4,7 +4,7 @@ from bs4 import BeautifulSoup, Tag, NavigableString
from ..time_book import get_timer

from src.utils import (
    apply_renaming_logic,
    apply_filename_mapping_to_html,
    get_dirty_filenames_mapping_with_expected_filenames,
    p_error,
    p_warning,
@@ -45,15 +45,80 @@ def fix_toc_links(soup: BeautifulSoup, filenames_mapping: dict):
    Fixes the table of contents links in the HTML by updating their href attributes
    based on the provided filenames mapping.
    """
    toc_links = soup.select("#TOC a")
    return apply_filename_mapping_to_html(soup, filenames_mapping)

    for link in toc_links:
        href = link.get("href", "")
        before_ash, after_ash = href.split("#", 1) if "#" in href else (href, "")
        if before_ash in filenames_mapping:
            link["href"] = f"{filenames_mapping[before_ash]}#{after_ash}"

    return soup
def apply_filename_mapping_to_html_dir(
    html_dir: str, filenames_mapping: dict[str, str]
) -> None:
    """Apply a filename mapping to the links of every ``.html`` file in a directory.

    Each page is parsed, passed through ``apply_filename_mapping_to_html`` and
    written back in place. Nothing is read or written when the mapping holds
    no entry whose new name differs from its old name.
    """
    nothing_renamed = all(
        old_name == new_name for old_name, new_name in filenames_mapping.items()
    )
    if nothing_renamed:
        return

    for entry in os.listdir(html_dir):
        if entry.endswith(".html"):
            page_path = os.path.join(html_dir, entry)
            with open(page_path, encoding="utf-8") as source:
                page = BeautifulSoup(source.read(), "html.parser")
            page = apply_filename_mapping_to_html(page, filenames_mapping)
            with open(page_path, "w", encoding="utf-8") as target:
                target.write(str(page))


def rename_html_files_by_heading(html_dir: str) -> dict[str, str]:
    """
    Rename pandoc sequential chunked HTML files to stable names derived from each
    page's H1 heading (e.g. 5-conventions.html -> 4-conventions.html when the
    heading is "4 Conventions").

    The renames run in two phases so that a target name which is also the
    current name of another pending file does not clash: every pending file is
    first moved to a temporary ``<old_name>.heading-rename`` name, then moved
    to its final name. A file whose final name already exists is moved back to
    its original name and a warning is printed.

    After renaming, links inside every HTML file in ``html_dir`` are rewritten
    through ``apply_filename_mapping_to_html_dir``.

    Returns the old-name -> new-name mapping, with entries that could not be
    renamed reset to map to themselves.
    """
    # presumably maps each current filename to its heading-derived name;
    # verify against get_dirty_filenames_mapping_with_expected_filenames
    filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(html_dir)
    # Only entries whose name actually changes need to touch the filesystem.
    pending = {
        old_name: new_name
        for old_name, new_name in filenames_mapping.items()
        if old_name != new_name
    }
    if not pending:
        return filenames_mapping

    # Phase 1: park every pending file under a temporary name so that all of
    # the original names are free before any final name is claimed.
    temp_suffix = ".heading-rename"
    for old_name in pending:
        os.rename(
            os.path.join(html_dir, old_name),
            os.path.join(html_dir, old_name + temp_suffix),
        )

    # Phase 2: move each parked file to its final name, first come first served.
    restored: list[str] = []
    for old_name, new_name in pending.items():
        src = os.path.join(html_dir, old_name + temp_suffix)
        dst = os.path.join(html_dir, new_name)
        if os.path.exists(dst):
            print(
                p_warning(
                    f"HTML rename collision: '{new_name}' already exists; "
                    f"keeping pandoc name '{old_name}'"
                )
            )
            # NOTE(review): if an earlier iteration already renamed another
            # file to `old_name`, this restore targets an existing path
            # (os.rename replaces it silently on Unix and raises on Windows)
            # -- confirm whether such chains can occur in practice.
            os.rename(src, os.path.join(html_dir, old_name))
            restored.append(old_name)
            continue
        os.rename(src, dst)

    # Entries that kept their original name must not be used to rewrite links.
    for old_name in restored:
        filenames_mapping[old_name] = old_name

    apply_filename_mapping_to_html_dir(html_dir, filenames_mapping)

    renamed_count = len(pending) - len(restored)
    print(
        f"[diff-debug] Renamed {renamed_count} HTML file(s) to heading-based names"
    )
    # List only the renames that actually happened, in sorted order.
    for old_name, new_name in sorted(pending.items()):
        if old_name in restored:
            continue
        print(f"[diff-debug]   {old_name} -> {new_name}")

    return filenames_mapping


def get_document_title_from_html(html_dir: str, front_page: str = "front-page.html"):
@@ -1072,34 +1137,20 @@ def postprocess(html_dir: str, no_lazy_toc: bool = False):
    ### Arguments
    - `html_dir`: Directory containing the HTML files to be processed
    """
    with get_timer().section("Postprocessing: 1 - Get dirty filenames mapping"):
        filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(
            html_dir
        )
    with get_timer().section("Postprocessing: 1-2 - Rename HTML files by heading"):
        filenames_mapping = rename_html_files_by_heading(html_dir)
    images_mapping = {}
    html_files = []
    processed_soups = []

    # Read and rename all HTML files
    with get_timer().section("Postprocessing: 2 - Read and Rename HTML Files"):
    with get_timer().section("Postprocessing: 2 - Read renamed HTML files"):
        for filename in os.listdir(html_dir):
            if filename.endswith(".html"):
                with open(
                    os.path.join(html_dir, filename), "r", encoding="utf-8"
                ) as file:
                    html_content = file.read()

                if filename == "index.html":
                    new_filename = filename
                else:
                    new_filename = apply_renaming_logic(html_content, filename, "html")

                os.rename(
                    os.path.join(html_dir, filename),
                    os.path.join(html_dir, new_filename),
                )

                html_files.append((new_filename, html_content))
                html_files.append((filename, html_content))

    # First pass: process all HTML files while keeping soup in memory

+86 −18
Original line number Diff line number Diff line
@@ -488,6 +488,61 @@ def get_output_doc_path(dest: str):
    return f"{dest}/{OUTPUT_DOC_NAME}"


def slugify_heading_text(text: str) -> str:
    """Convert heading text into a lowercase, hyphen-separated filename slug.

    Parenthesised remarks are dropped, characters other than word characters,
    whitespace and hyphens are removed, runs of whitespace/underscores become
    a single hyphen, and leading/trailing hyphens are trimmed. The result may
    be an empty string.
    """
    substitutions = (
        (r"\([^)]*\)", ""),  # drop "(...)" remarks
        (r"[^\w\s-]", ""),  # drop punctuation
        (r"[\s_]+", "-"),  # whitespace and underscores become hyphens
        (r"-+", "-"),  # collapse repeated hyphens
    )
    slug = text.lower().strip()
    for pattern, replacement in substitutions:
        slug = re.sub(pattern, replacement, slug)
    return slug.strip("-")


def find_page_h1(soup: BeautifulSoup):
    """Locate the page's H1, searching inside ``<div id="editor">`` when present.

    An ``<h1>`` carrying an ``id`` attribute is preferred; otherwise the first
    ``<h1>`` in the search scope is returned, or ``None`` if there is none.
    """
    scope = soup.find("div", id="editor") or soup
    heading_with_id = scope.find("h1", id=True)
    if heading_with_id:
        return heading_with_id
    return scope.find("h1")


def split_html_href(href: str) -> tuple[str, str, str]:
    """Break an href into ``(path_prefix, filename, fragment)``.

    ``path_prefix`` is ``"./"`` when the href starts with it, else ``""``.
    ``fragment`` is everything after the first ``#`` (without the ``#``), or
    ``""`` when the href has no fragment.
    """
    prefix = "./" if href.startswith("./") else ""
    remainder = href[len(prefix):]
    # partition yields (remainder, "", "") when no "#" is present.
    filename, _, fragment = remainder.partition("#")
    return prefix, filename, fragment


def remap_html_href(href: str, filenames_mapping: dict[str, str]) -> str | None:
    """Return an updated href when filename is in the mapping, else None."""
    if not href or href.startswith(("http://", "https://", "mailto:", "#")):
        return None
    prefix, filename, fragment = split_html_href(href)
    if not filename or filename not in filenames_mapping:
        return None
    new_name = filenames_mapping[filename]
    if new_name == filename and not fragment:
        return None
    new_href = f"{prefix}{new_name}"
    if fragment:
        new_href = f"{new_href}#{fragment}"
    return new_href


def apply_filename_mapping_to_html(
    soup: BeautifulSoup, filenames_mapping: dict[str, str]
) -> BeautifulSoup:
    """Rewrite ``<a href>`` targets in ``soup`` according to ``filenames_mapping``.

    Every anchor with an ``href`` is passed through ``remap_html_href``; links
    for which it yields no replacement are left untouched. The same ``soup``
    object is modified in place and returned.
    """
    for anchor in soup.find_all("a", href=True):
        replacement = remap_html_href(anchor["href"], filenames_mapping)
        if not replacement:
            continue
        anchor["href"] = replacement
    return soup


def apply_renaming_logic(text: str, filename: str, postfix: str) -> str:
    new_filename = filename
    if postfix == "md":
@@ -514,26 +569,39 @@ def apply_renaming_logic(text: str, filename: str, postfix: str) -> str:
        if soup.find("div", class_="ZA"):
            new_filename = f"front-page.{postfix}"
        else:
            title_tag = soup.find("h1", id=True)
            title = title_tag.get_text()
            if title.startswith("Annex"):
                annex_number = title.split()[1].lower().replace(":", "")
                new_filename = f"annex-{annex_number}.{postfix}"
            title_tag = find_page_h1(soup)
            if title_tag is None:
                return re.sub(r"^\d+-", "", filename).lower()

            title = " ".join(title_tag.get_text().split())
            annex_match = re.match(r"^Annex\s+([A-Za-z])", title, re.IGNORECASE)
            if annex_match:
                new_filename = f"annex-{annex_match.group(1).lower()}.{postfix}"
            else:
                header_regex = r"^(\d+)\s"
                match = re.match(header_regex, title)
                annex_sub_match = re.match(
                    r"^([A-Z]\.\d+(?:\.\d+)*)\s+(.+)", title
                )
                if annex_sub_match:
                    clause_id = annex_sub_match.group(1).lower()
                    slug = slugify_heading_text(annex_sub_match.group(2))
                    if slug:
                        new_filename = f"{clause_id}-{slug}.{postfix}"
                    else:
                        new_filename = f"{clause_id}.{postfix}"
                else:
                    match = re.match(r"^(\d+)\s+(.+)", title)
                    if match:
                        chapter_number = match.group(1)
                    if postfix == "md" and chapter_number == "1":
                        new_filename = f"{SCOPE}.{postfix}"
                    elif postfix == "md" and chapter_number == "2":
                        new_filename = f"{REFS}.{postfix}"
                    elif postfix == "md" and chapter_number == "3":
                        new_filename = f"{DEFS}.{postfix}"
                        slug = slugify_heading_text(match.group(2))
                        if slug:
                            new_filename = f"{chapter_number}-{slug}.{postfix}"
                        else:
                            new_filename = f"clause-{chapter_number}.{postfix}"
                    else:
                    # it is a clause without a number, just use the filename without the leading number
                        slug = slugify_heading_text(title)
                        if slug:
                            new_filename = f"{slug}.{postfix}"
                        else:
                            new_filename = re.sub(r"^\d+-", "", filename).lower()
    return new_filename