Loading generateSpecWebSite/md_to_docx_converter/advancedTOCLogic.js +22 −4 Original line number Diff line number Diff line Loading @@ -55,11 +55,29 @@ function getIdFromURL() { } } } else { // no anchor, deduct from filename const filename = window.location.pathname.split("/").pop().split(".")[0]; const topHeadingId = filename.split("-")[1]; // no anchor — match the current page filename against TOC hrefs const filename = window.location.pathname.split("/").pop(); if (filename) { const tocLink = Array.from(document.querySelectorAll("nav#TOC a")).find( (a) => { const href = a.getAttribute("href") || ""; const hrefFile = href.split("#")[0].replace(/^\.\//, ""); return hrefFile === filename; } ); if (tocLink && tocLink.id) { tocId = tocLink.id; } } if (!tocId && filename) { // fallback for pandoc-style {number}-{slug}.html names const basename = filename.split(".")[0]; const parts = basename.split("-"); const topHeadingId = parts.length > 1 ? parts.slice(1).join("-") : basename; tocId = `toc-${topHeadingId}`; } } return tocId; } Loading generateSpecWebSite/md_to_docx_converter/convert.py +125 −6 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ from src.utils import ( get_output_doc_path, p_error, p_label, p_warning, validate_src_directory, validate_type, validate_conversion, Loading @@ -35,7 +36,10 @@ from src.to_md.preprocessing import preprocess as preprocess_md from src.to_md.postprocessing import postprocess as postprocess_md from src.to_html.preprocessing import preprocess as preprocess_html from src.to_html.postprocessing import postprocess as postprocess_html from src.to_html.postprocessing import ( postprocess as postprocess_html, rename_html_files_by_heading, ) from src.to_html.postprocessing import clone_toc_in_file as clone_toc_in_file_html from src.to_docx.preprocessing import preprocess as preprocess_docx Loading Loading @@ -383,6 +387,9 @@ def convert(conversion_args=None): shutil.copy2(css_file, os.path.join(FILEGEN_DIR, FOLDER)) shutil.copy("advancedTOCLogic.js", DEST) with get_timer().section("Rename HTML files to heading-based names"): rename_html_files_by_heading(DEST) # Cleanup the consolidated Markdown handle_consolidated_md("delete", SRC, CONSOLIDATED_MD_PATH) Loading Loading @@ -492,6 +499,56 @@ def convert(conversion_args=None): convert_md_to_html() def _editor_text_length(html_path: str) -> int | None: """Return #editor text length, -1 if no editor div, None if file missing.""" if not os.path.isfile(html_path): return None try: with open(html_path, encoding="utf-8") as html_file: soup = BeautifulSoup(html_file.read(), "html.parser") except OSError: return None editor = soup.find("div", id="editor") if editor is None: return -1 return len(editor.get_text(strip=True)) def _log_diff_html_inventory(diff_source_dir: str, dest_dir: str) -> None: """Log base vs new HTML filenames to spot pairing mismatches.""" if not os.path.isdir(diff_source_dir): print( p_warning( f"[diff-debug] Base HTML directory does not exist: {diff_source_dir}" ) ) return base_files = { name for name in os.listdir(diff_source_dir) if name.endswith(".html") } new_files = {name for name in os.listdir(dest_dir) if name.endswith(".html")} only_in_new = sorted(new_files - base_files) only_in_base = sorted(base_files - new_files) print( f"[diff-debug] HTML inventory: base={len(base_files)}, new={len(new_files)}, " f"paired={len(base_files & new_files)}" ) if only_in_new: print( p_warning( f"[diff-debug] New HTML without base counterpart ({len(only_in_new)}): " f"{', '.join(only_in_new)}" ) ) if only_in_base: print( f"[diff-debug] Base HTML without new counterpart ({len(only_in_base)}): " f"{', '.join(only_in_base)}" ) def get_comparing_folder_from_git(): # Check if the DIFF_PATH is a git repo if not git_helper.is_git_repo(SRC): Loading Loading @@ -557,9 +614,27 @@ def get_comparing_folder_from_git(): sys.exit(1) else: branch_target = None cached_base_commit, cached_rename_scheme = load_target_commit( os.path.join(FILEGEN_DIR, f"{FOLDER}-base") ) print( f"[diff-debug] Diff git target: requested={p_label(GIT_CHECKOUT_NAME)}, " f"resolved={p_label(target_to_checkout)}" f"{f', branch={p_label(branch_target)}' if branch_target else ''}" ) print( f"[diff-debug] Current repo state: branch={p_label(current_branch)}, " f"commit={p_label(current_commit_hash)}" ) print( f"[diff-debug] Cached base commit: {p_label(cached_base_commit or '(none)')}, " f"rename scheme: {p_label(cached_rename_scheme or '(none)')}, " f"matches resolved={cached_base_commit == target_to_checkout}, " f"matches scheme={cached_rename_scheme == HTML_RENAME_SCHEME}" ) if ( load_target_commit(os.path.join(FILEGEN_DIR, f"{FOLDER}-base")) == target_to_checkout cached_base_commit == target_to_checkout and cached_rename_scheme == HTML_RENAME_SCHEME ): print( f"The base HTML files for commit {p_label(target_to_checkout)} {f'(branch: {p_label(branch_target)}) ' if branch_target else ''}have already been generated. Using existing files for diff..." Loading Loading @@ -677,17 +752,25 @@ def restore_original_state(original_branch, changes_stashed): apply_stash(changes_stashed) HTML_RENAME_SCHEME = "heading-title-v2" def save_target_commit(target_commit, path): with open(os.path.join(path, ".gittargetcommit"), "w") as f: f.write(target_commit) f.write(f"{target_commit}\n{HTML_RENAME_SCHEME}\n") def load_target_commit(path): target_commit_path = os.path.join(path, ".gittargetcommit") if os.path.exists(target_commit_path): with open(target_commit_path, "r") as f: return f.read().strip() return None lines = f.read().strip().splitlines() if not lines: return None, None commit = lines[0] scheme = lines[1] if len(lines) > 1 else None return commit, scheme return None, None ### Run script Loading Loading @@ -727,18 +810,54 @@ if SRC_TYPE == "md" and DEST_TYPE == "html": else: diff_source_dir = os.path.join(FILEGEN_DIR, f"{FOLDER}-base", "html") print(f"Applying diff from source directory: {diff_source_dir}...") _log_diff_html_inventory(diff_source_dir, DEST) # Iterate through new HTML files and create diffs filename_count_mapping = {} missing_base_files: list[str] = [] empty_base_editors: list[str] = [] for filename in os.listdir(DEST): if filename.endswith(".html"): new_file_path = os.path.join(DEST, filename) comparing_file_path = os.path.join(diff_source_dir, filename) old_editor_len = _editor_text_length(comparing_file_path) new_editor_len = _editor_text_length(new_file_path) if old_editor_len is None: missing_base_files.append(filename) print( p_warning( f"[diff-debug] No base HTML for '{filename}' " f"(expected: {comparing_file_path}) — page will diff as all-new" ) ) elif old_editor_len == 0: empty_base_editors.append(filename) print( p_warning( f"[diff-debug] Base HTML for '{filename}' has empty #editor " f"(new #editor={new_editor_len} chars)" ) ) # Create diff and overwrite the new file count = make_trackchanges_diff( comparing_file_path, new_file_path, new_file_path ) print( f"[diff-debug] {filename}: old_editor=" f"{old_editor_len if old_editor_len is not None else 'missing'} chars, " f"new_editor={new_editor_len} chars, changes={count or 0}" ) if count and count > 0: filename_count_mapping[filename] = count print( f"[diff-debug] Diff summary: pages_with_changes={len(filename_count_mapping)}, " f"missing_base={len(missing_base_files)}, empty_base_editor={len(empty_base_editors)}" ) if missing_base_files: print( p_warning( f"[diff-debug] Pages missing base HTML: {', '.join(missing_base_files)}" ) ) shutil.copy("diffVisualizer.js", DEST) toc_soup = None if os.path.exists(toc_path): Loading generateSpecWebSite/md_to_docx_converter/html_diff.py +11 −5 Original line number Diff line number Diff line Loading @@ -764,14 +764,20 @@ def get_element_repr(elem): def make_trackchanges_diff( old_path: str, new_path: str, out_path: str = "diff.html" ) -> int | None: old_path_obj = Path(old_path) new_path_obj = Path(new_path) try: old_html = Path(old_path).read_text(encoding="utf-8") except Exception as e: old_html = old_path_obj.read_text(encoding="utf-8") except FileNotFoundError: old_html = "<html><body><div id='editor'></div></body></html>" except OSError as exc: print(f"[diff-debug] Failed to read old HTML '{old_path}': {exc}") old_html = "<html><body><div id='editor'></div></body></html>" try: new_html = Path(new_path).read_text(encoding="utf-8") except Exception as e: new_html = new_path_obj.read_text(encoding="utf-8") except OSError as exc: print(f"[diff-debug] Failed to read new HTML '{new_path}': {exc}") new_html = "<html><body><div id='editor'></div></body></html>" # Parse with BeautifulSoup Loading generateSpecWebSite/md_to_docx_converter/src/to_html/postprocessing.py +77 −26 Original line number Diff line number Diff line Loading @@ -4,7 +4,7 @@ from bs4 import BeautifulSoup, Tag, NavigableString from ..time_book import get_timer from src.utils import ( apply_renaming_logic, apply_filename_mapping_to_html, get_dirty_filenames_mapping_with_expected_filenames, p_error, p_warning, Loading Loading @@ -45,15 +45,80 @@ def fix_toc_links(soup: BeautifulSoup, filenames_mapping: dict): Fixes the table of contents links in the HTML by updating their href attributes based on the provided filenames mapping. """ toc_links = soup.select("#TOC a") return apply_filename_mapping_to_html(soup, filenames_mapping) for link in toc_links: href = link.get("href", "") before_ash, after_ash = href.split("#", 1) if "#" in href else (href, "") if before_ash in filenames_mapping: link["href"] = f"{filenames_mapping[before_ash]}#{after_ash}" return soup def apply_filename_mapping_to_html_dir( html_dir: str, filenames_mapping: dict[str, str] ) -> None: """Rewrite internal links in every HTML file after chunk renaming.""" if not any(old != new for old, new in filenames_mapping.items()): return for filename in os.listdir(html_dir): if not filename.endswith(".html"): continue file_path = os.path.join(html_dir, filename) with open(file_path, encoding="utf-8") as html_file: soup = BeautifulSoup(html_file.read(), "html.parser") soup = apply_filename_mapping_to_html(soup, filenames_mapping) with open(file_path, "w", encoding="utf-8") as html_file: html_file.write(str(soup)) def rename_html_files_by_heading(html_dir: str) -> dict[str, str]: """ Rename pandoc sequential chunked HTML files to stable names derived from each page's H1 heading (e.g. 5-conventions.html -> 4-conventions.html when the heading is "4 Conventions"). """ filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(html_dir) pending = { old_name: new_name for old_name, new_name in filenames_mapping.items() if old_name != new_name } if not pending: return filenames_mapping temp_suffix = ".heading-rename" for old_name in pending: os.rename( os.path.join(html_dir, old_name), os.path.join(html_dir, old_name + temp_suffix), ) restored: list[str] = [] for old_name, new_name in pending.items(): src = os.path.join(html_dir, old_name + temp_suffix) dst = os.path.join(html_dir, new_name) if os.path.exists(dst): print( p_warning( f"HTML rename collision: '{new_name}' already exists; " f"keeping pandoc name '{old_name}'" ) ) os.rename(src, os.path.join(html_dir, old_name)) restored.append(old_name) continue os.rename(src, dst) for old_name in restored: filenames_mapping[old_name] = old_name apply_filename_mapping_to_html_dir(html_dir, filenames_mapping) renamed_count = len(pending) - len(restored) print( f"[diff-debug] Renamed {renamed_count} HTML file(s) to heading-based names" ) for old_name, new_name in sorted(pending.items()): if old_name in restored: continue print(f"[diff-debug] {old_name} -> {new_name}") return filenames_mapping def get_document_title_from_html(html_dir: str, front_page: str = "front-page.html"): Loading Loading @@ -1072,34 +1137,20 @@ def postprocess(html_dir: str, no_lazy_toc: bool = False): ### Arguments - `html_dir`: Directory containing the HTML files to be processed """ with get_timer().section("Postprocessing: 1 - Get dirty filenames mapping"): filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames( html_dir ) with get_timer().section("Postprocessing: 1-2 - Rename HTML files by heading"): filenames_mapping = rename_html_files_by_heading(html_dir) images_mapping = {} html_files = [] processed_soups = [] # Read and rename all HTML files with get_timer().section("Postprocessing: 2 - Read and Rename HTML Files"): with get_timer().section("Postprocessing: 2 - Read renamed HTML files"): for filename in os.listdir(html_dir): if filename.endswith(".html"): with open( os.path.join(html_dir, filename), "r", encoding="utf-8" ) as file: html_content = file.read() if filename == "index.html": new_filename = filename else: new_filename = apply_renaming_logic(html_content, filename, "html") os.rename( os.path.join(html_dir, filename), os.path.join(html_dir, new_filename), ) html_files.append((new_filename, html_content)) html_files.append((filename, html_content)) # First pass: process all HTML files while keeping soup in memory Loading generateSpecWebSite/md_to_docx_converter/src/utils.py +86 −18 Original line number Diff line number Diff line Loading @@ -488,6 +488,61 @@ def get_output_doc_path(dest: str): return f"{dest}/{OUTPUT_DOC_NAME}" def slugify_heading_text(text: str) -> str: """Turn heading words into a stable filename slug.""" text = text.lower().strip() text = re.sub(r"\([^)]*\)", "", text) text = re.sub(r"[^\w\s-]", "", text) text = re.sub(r"[\s_]+", "-", text) text = re.sub(r"-+", "-", text) return text.strip("-") def find_page_h1(soup: BeautifulSoup): """Return the page H1, preferring the main #editor content area.""" editor = soup.find("div", id="editor") search_root = editor if editor else soup return search_root.find("h1", id=True) or search_root.find("h1") def split_html_href(href: str) -> tuple[str, str, str]: """Split href into (path_prefix, filename, fragment). path_prefix is '' or './'.""" prefix = "./" if href.startswith("./") else "" path = href[2:] if prefix else href if "#" in path: filename, fragment = path.split("#", 1) else: filename, fragment = path, "" return prefix, filename, fragment def remap_html_href(href: str, filenames_mapping: dict[str, str]) -> str | None: """Return an updated href when filename is in the mapping, else None.""" if not href or href.startswith(("http://", "https://", "mailto:", "#")): return None prefix, filename, fragment = split_html_href(href) if not filename or filename not in filenames_mapping: return None new_name = filenames_mapping[filename] if new_name == filename and not fragment: return None new_href = f"{prefix}{new_name}" if fragment: new_href = f"{new_href}#{fragment}" return new_href def apply_filename_mapping_to_html( soup: BeautifulSoup, filenames_mapping: dict[str, str] ) -> BeautifulSoup: """Update internal page links after HTML files are renamed.""" for link in soup.find_all("a", href=True): new_href = remap_html_href(link["href"], filenames_mapping) if new_href: link["href"] = new_href return soup def apply_renaming_logic(text: str, filename: str, postfix: str) -> str: new_filename = filename if postfix == "md": Loading @@ -514,26 +569,39 @@ def apply_renaming_logic(text: str, filename: str, postfix: str) -> str: if soup.find("div", class_="ZA"): new_filename = f"front-page.{postfix}" else: title_tag = soup.find("h1", id=True) title = title_tag.get_text() if title.startswith("Annex"): annex_number = title.split()[1].lower().replace(":", "") new_filename = f"annex-{annex_number}.{postfix}" title_tag = find_page_h1(soup) if title_tag is None: return re.sub(r"^\d+-", "", filename).lower() title = " ".join(title_tag.get_text().split()) annex_match = re.match(r"^Annex\s+([A-Za-z])", title, re.IGNORECASE) if annex_match: new_filename = f"annex-{annex_match.group(1).lower()}.{postfix}" else: header_regex = r"^(\d+)\s" match = re.match(header_regex, title) annex_sub_match = re.match( r"^([A-Z]\.\d+(?:\.\d+)*)\s+(.+)", title ) if annex_sub_match: clause_id = annex_sub_match.group(1).lower() slug = slugify_heading_text(annex_sub_match.group(2)) if slug: new_filename = f"{clause_id}-{slug}.{postfix}" else: new_filename = f"{clause_id}.{postfix}" else: match = re.match(r"^(\d+)\s+(.+)", title) if match: chapter_number = match.group(1) if postfix == "md" and chapter_number == "1": new_filename = f"{SCOPE}.{postfix}" elif postfix == "md" and chapter_number == "2": new_filename = f"{REFS}.{postfix}" elif postfix == "md" and chapter_number == "3": new_filename = f"{DEFS}.{postfix}" slug = slugify_heading_text(match.group(2)) if slug: new_filename = f"{chapter_number}-{slug}.{postfix}" else: new_filename = f"clause-{chapter_number}.{postfix}" else: # it is a clause without a number, just use the filename without the leading number slug = slugify_heading_text(title) if slug: new_filename = f"{slug}.{postfix}" else: new_filename = re.sub(r"^\d+-", "", filename).lower() return new_filename Loading Loading
generateSpecWebSite/md_to_docx_converter/advancedTOCLogic.js +22 −4 Original line number Diff line number Diff line Loading @@ -55,11 +55,29 @@ function getIdFromURL() { } } } else { // no anchor, deduct from filename const filename = window.location.pathname.split("/").pop().split(".")[0]; const topHeadingId = filename.split("-")[1]; // no anchor — match the current page filename against TOC hrefs const filename = window.location.pathname.split("/").pop(); if (filename) { const tocLink = Array.from(document.querySelectorAll("nav#TOC a")).find( (a) => { const href = a.getAttribute("href") || ""; const hrefFile = href.split("#")[0].replace(/^\.\//, ""); return hrefFile === filename; } ); if (tocLink && tocLink.id) { tocId = tocLink.id; } } if (!tocId && filename) { // fallback for pandoc-style {number}-{slug}.html names const basename = filename.split(".")[0]; const parts = basename.split("-"); const topHeadingId = parts.length > 1 ? parts.slice(1).join("-") : basename; tocId = `toc-${topHeadingId}`; } } return tocId; } Loading
generateSpecWebSite/md_to_docx_converter/convert.py +125 −6 Original line number Diff line number Diff line Loading @@ -18,6 +18,7 @@ from src.utils import ( get_output_doc_path, p_error, p_label, p_warning, validate_src_directory, validate_type, validate_conversion, Loading @@ -35,7 +36,10 @@ from src.to_md.preprocessing import preprocess as preprocess_md from src.to_md.postprocessing import postprocess as postprocess_md from src.to_html.preprocessing import preprocess as preprocess_html from src.to_html.postprocessing import postprocess as postprocess_html from src.to_html.postprocessing import ( postprocess as postprocess_html, rename_html_files_by_heading, ) from src.to_html.postprocessing import clone_toc_in_file as clone_toc_in_file_html from src.to_docx.preprocessing import preprocess as preprocess_docx Loading Loading @@ -383,6 +387,9 @@ def convert(conversion_args=None): shutil.copy2(css_file, os.path.join(FILEGEN_DIR, FOLDER)) shutil.copy("advancedTOCLogic.js", DEST) with get_timer().section("Rename HTML files to heading-based names"): rename_html_files_by_heading(DEST) # Cleanup the consolidated Markdown handle_consolidated_md("delete", SRC, CONSOLIDATED_MD_PATH) Loading Loading @@ -492,6 +499,56 @@ def convert(conversion_args=None): convert_md_to_html() def _editor_text_length(html_path: str) -> int | None: """Return #editor text length, -1 if no editor div, None if file missing.""" if not os.path.isfile(html_path): return None try: with open(html_path, encoding="utf-8") as html_file: soup = BeautifulSoup(html_file.read(), "html.parser") except OSError: return None editor = soup.find("div", id="editor") if editor is None: return -1 return len(editor.get_text(strip=True)) def _log_diff_html_inventory(diff_source_dir: str, dest_dir: str) -> None: """Log base vs new HTML filenames to spot pairing mismatches.""" if not os.path.isdir(diff_source_dir): print( p_warning( f"[diff-debug] Base HTML directory does not exist: {diff_source_dir}" ) ) return base_files = { name for name in os.listdir(diff_source_dir) if name.endswith(".html") } new_files = {name for name in os.listdir(dest_dir) if name.endswith(".html")} only_in_new = sorted(new_files - base_files) only_in_base = sorted(base_files - new_files) print( f"[diff-debug] HTML inventory: base={len(base_files)}, new={len(new_files)}, " f"paired={len(base_files & new_files)}" ) if only_in_new: print( p_warning( f"[diff-debug] New HTML without base counterpart ({len(only_in_new)}): " f"{', '.join(only_in_new)}" ) ) if only_in_base: print( f"[diff-debug] Base HTML without new counterpart ({len(only_in_base)}): " f"{', '.join(only_in_base)}" ) def get_comparing_folder_from_git(): # Check if the DIFF_PATH is a git repo if not git_helper.is_git_repo(SRC): Loading Loading @@ -557,9 +614,27 @@ def get_comparing_folder_from_git(): sys.exit(1) else: branch_target = None cached_base_commit, cached_rename_scheme = load_target_commit( os.path.join(FILEGEN_DIR, f"{FOLDER}-base") ) print( f"[diff-debug] Diff git target: requested={p_label(GIT_CHECKOUT_NAME)}, " f"resolved={p_label(target_to_checkout)}" f"{f', branch={p_label(branch_target)}' if branch_target else ''}" ) print( f"[diff-debug] Current repo state: branch={p_label(current_branch)}, " f"commit={p_label(current_commit_hash)}" ) print( f"[diff-debug] Cached base commit: {p_label(cached_base_commit or '(none)')}, " f"rename scheme: {p_label(cached_rename_scheme or '(none)')}, " f"matches resolved={cached_base_commit == target_to_checkout}, " f"matches scheme={cached_rename_scheme == HTML_RENAME_SCHEME}" ) if ( load_target_commit(os.path.join(FILEGEN_DIR, f"{FOLDER}-base")) == target_to_checkout cached_base_commit == target_to_checkout and cached_rename_scheme == HTML_RENAME_SCHEME ): print( f"The base HTML files for commit {p_label(target_to_checkout)} {f'(branch: {p_label(branch_target)}) ' if branch_target else ''}have already been generated. Using existing files for diff..." Loading Loading @@ -677,17 +752,25 @@ def restore_original_state(original_branch, changes_stashed): apply_stash(changes_stashed) HTML_RENAME_SCHEME = "heading-title-v2" def save_target_commit(target_commit, path): with open(os.path.join(path, ".gittargetcommit"), "w") as f: f.write(target_commit) f.write(f"{target_commit}\n{HTML_RENAME_SCHEME}\n") def load_target_commit(path): target_commit_path = os.path.join(path, ".gittargetcommit") if os.path.exists(target_commit_path): with open(target_commit_path, "r") as f: return f.read().strip() return None lines = f.read().strip().splitlines() if not lines: return None, None commit = lines[0] scheme = lines[1] if len(lines) > 1 else None return commit, scheme return None, None ### Run script Loading Loading @@ -727,18 +810,54 @@ if SRC_TYPE == "md" and DEST_TYPE == "html": else: diff_source_dir = os.path.join(FILEGEN_DIR, f"{FOLDER}-base", "html") print(f"Applying diff from source directory: {diff_source_dir}...") _log_diff_html_inventory(diff_source_dir, DEST) # Iterate through new HTML files and create diffs filename_count_mapping = {} missing_base_files: list[str] = [] empty_base_editors: list[str] = [] for filename in os.listdir(DEST): if filename.endswith(".html"): new_file_path = os.path.join(DEST, filename) comparing_file_path = os.path.join(diff_source_dir, filename) old_editor_len = _editor_text_length(comparing_file_path) new_editor_len = _editor_text_length(new_file_path) if old_editor_len is None: missing_base_files.append(filename) print( p_warning( f"[diff-debug] No base HTML for '{filename}' " f"(expected: {comparing_file_path}) — page will diff as all-new" ) ) elif old_editor_len == 0: empty_base_editors.append(filename) print( p_warning( f"[diff-debug] Base HTML for '{filename}' has empty #editor " f"(new #editor={new_editor_len} chars)" ) ) # Create diff and overwrite the new file count = make_trackchanges_diff( comparing_file_path, new_file_path, new_file_path ) print( f"[diff-debug] {filename}: old_editor=" f"{old_editor_len if old_editor_len is not None else 'missing'} chars, " f"new_editor={new_editor_len} chars, changes={count or 0}" ) if count and count > 0: filename_count_mapping[filename] = count print( f"[diff-debug] Diff summary: pages_with_changes={len(filename_count_mapping)}, " f"missing_base={len(missing_base_files)}, empty_base_editor={len(empty_base_editors)}" ) if missing_base_files: print( p_warning( f"[diff-debug] Pages missing base HTML: {', '.join(missing_base_files)}" ) ) shutil.copy("diffVisualizer.js", DEST) toc_soup = None if os.path.exists(toc_path): Loading
generateSpecWebSite/md_to_docx_converter/html_diff.py +11 −5 Original line number Diff line number Diff line Loading @@ -764,14 +764,20 @@ def get_element_repr(elem): def make_trackchanges_diff( old_path: str, new_path: str, out_path: str = "diff.html" ) -> int | None: old_path_obj = Path(old_path) new_path_obj = Path(new_path) try: old_html = Path(old_path).read_text(encoding="utf-8") except Exception as e: old_html = old_path_obj.read_text(encoding="utf-8") except FileNotFoundError: old_html = "<html><body><div id='editor'></div></body></html>" except OSError as exc: print(f"[diff-debug] Failed to read old HTML '{old_path}': {exc}") old_html = "<html><body><div id='editor'></div></body></html>" try: new_html = Path(new_path).read_text(encoding="utf-8") except Exception as e: new_html = new_path_obj.read_text(encoding="utf-8") except OSError as exc: print(f"[diff-debug] Failed to read new HTML '{new_path}': {exc}") new_html = "<html><body><div id='editor'></div></body></html>" # Parse with BeautifulSoup Loading
generateSpecWebSite/md_to_docx_converter/src/to_html/postprocessing.py +77 −26 Original line number Diff line number Diff line Loading @@ -4,7 +4,7 @@ from bs4 import BeautifulSoup, Tag, NavigableString from ..time_book import get_timer from src.utils import ( apply_renaming_logic, apply_filename_mapping_to_html, get_dirty_filenames_mapping_with_expected_filenames, p_error, p_warning, Loading Loading @@ -45,15 +45,80 @@ def fix_toc_links(soup: BeautifulSoup, filenames_mapping: dict): Fixes the table of contents links in the HTML by updating their href attributes based on the provided filenames mapping. """ toc_links = soup.select("#TOC a") return apply_filename_mapping_to_html(soup, filenames_mapping) for link in toc_links: href = link.get("href", "") before_ash, after_ash = href.split("#", 1) if "#" in href else (href, "") if before_ash in filenames_mapping: link["href"] = f"{filenames_mapping[before_ash]}#{after_ash}" return soup def apply_filename_mapping_to_html_dir( html_dir: str, filenames_mapping: dict[str, str] ) -> None: """Rewrite internal links in every HTML file after chunk renaming.""" if not any(old != new for old, new in filenames_mapping.items()): return for filename in os.listdir(html_dir): if not filename.endswith(".html"): continue file_path = os.path.join(html_dir, filename) with open(file_path, encoding="utf-8") as html_file: soup = BeautifulSoup(html_file.read(), "html.parser") soup = apply_filename_mapping_to_html(soup, filenames_mapping) with open(file_path, "w", encoding="utf-8") as html_file: html_file.write(str(soup)) def rename_html_files_by_heading(html_dir: str) -> dict[str, str]: """ Rename pandoc sequential chunked HTML files to stable names derived from each page's H1 heading (e.g. 5-conventions.html -> 4-conventions.html when the heading is "4 Conventions"). """ filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(html_dir) pending = { old_name: new_name for old_name, new_name in filenames_mapping.items() if old_name != new_name } if not pending: return filenames_mapping temp_suffix = ".heading-rename" for old_name in pending: os.rename( os.path.join(html_dir, old_name), os.path.join(html_dir, old_name + temp_suffix), ) restored: list[str] = [] for old_name, new_name in pending.items(): src = os.path.join(html_dir, old_name + temp_suffix) dst = os.path.join(html_dir, new_name) if os.path.exists(dst): print( p_warning( f"HTML rename collision: '{new_name}' already exists; " f"keeping pandoc name '{old_name}'" ) ) os.rename(src, os.path.join(html_dir, old_name)) restored.append(old_name) continue os.rename(src, dst) for old_name in restored: filenames_mapping[old_name] = old_name apply_filename_mapping_to_html_dir(html_dir, filenames_mapping) renamed_count = len(pending) - len(restored) print( f"[diff-debug] Renamed {renamed_count} HTML file(s) to heading-based names" ) for old_name, new_name in sorted(pending.items()): if old_name in restored: continue print(f"[diff-debug] {old_name} -> {new_name}") return filenames_mapping def get_document_title_from_html(html_dir: str, front_page: str = "front-page.html"): Loading Loading @@ -1072,34 +1137,20 @@ def postprocess(html_dir: str, no_lazy_toc: bool = False): ### Arguments - `html_dir`: Directory containing the HTML files to be processed """ with get_timer().section("Postprocessing: 1 - Get dirty filenames mapping"): filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames( html_dir ) with get_timer().section("Postprocessing: 1-2 - Rename HTML files by heading"): filenames_mapping = rename_html_files_by_heading(html_dir) images_mapping = {} html_files = [] processed_soups = [] # Read and rename all HTML files with get_timer().section("Postprocessing: 2 - Read and Rename HTML Files"): with get_timer().section("Postprocessing: 2 - Read renamed HTML files"): for filename in os.listdir(html_dir): if filename.endswith(".html"): with open( os.path.join(html_dir, filename), "r", encoding="utf-8" ) as file: html_content = file.read() if filename == "index.html": new_filename = filename else: new_filename = apply_renaming_logic(html_content, filename, "html") os.rename( os.path.join(html_dir, filename), os.path.join(html_dir, new_filename), ) html_files.append((new_filename, html_content)) html_files.append((filename, html_content)) # First pass: process all HTML files while keeping soup in memory Loading
generateSpecWebSite/md_to_docx_converter/src/utils.py +86 −18 Original line number Diff line number Diff line Loading @@ -488,6 +488,61 @@ def get_output_doc_path(dest: str): return f"{dest}/{OUTPUT_DOC_NAME}" def slugify_heading_text(text: str) -> str: """Turn heading words into a stable filename slug.""" text = text.lower().strip() text = re.sub(r"\([^)]*\)", "", text) text = re.sub(r"[^\w\s-]", "", text) text = re.sub(r"[\s_]+", "-", text) text = re.sub(r"-+", "-", text) return text.strip("-") def find_page_h1(soup: BeautifulSoup): """Return the page H1, preferring the main #editor content area.""" editor = soup.find("div", id="editor") search_root = editor if editor else soup return search_root.find("h1", id=True) or search_root.find("h1") def split_html_href(href: str) -> tuple[str, str, str]: """Split href into (path_prefix, filename, fragment). path_prefix is '' or './'.""" prefix = "./" if href.startswith("./") else "" path = href[2:] if prefix else href if "#" in path: filename, fragment = path.split("#", 1) else: filename, fragment = path, "" return prefix, filename, fragment def remap_html_href(href: str, filenames_mapping: dict[str, str]) -> str | None: """Return an updated href when filename is in the mapping, else None.""" if not href or href.startswith(("http://", "https://", "mailto:", "#")): return None prefix, filename, fragment = split_html_href(href) if not filename or filename not in filenames_mapping: return None new_name = filenames_mapping[filename] if new_name == filename and not fragment: return None new_href = f"{prefix}{new_name}" if fragment: new_href = f"{new_href}#{fragment}" return new_href def apply_filename_mapping_to_html( soup: BeautifulSoup, filenames_mapping: dict[str, str] ) -> BeautifulSoup: """Update internal page links after HTML files are renamed.""" for link in soup.find_all("a", href=True): new_href = remap_html_href(link["href"], filenames_mapping) if new_href: link["href"] = new_href return soup def apply_renaming_logic(text: str, filename: str, postfix: str) -> str: new_filename = filename if postfix == "md": Loading @@ -514,26 +569,39 @@ def apply_renaming_logic(text: str, filename: str, postfix: str) -> str: if soup.find("div", class_="ZA"): new_filename = f"front-page.{postfix}" else: title_tag = soup.find("h1", id=True) title = title_tag.get_text() if title.startswith("Annex"): annex_number = title.split()[1].lower().replace(":", "") new_filename = f"annex-{annex_number}.{postfix}" title_tag = find_page_h1(soup) if title_tag is None: return re.sub(r"^\d+-", "", filename).lower() title = " ".join(title_tag.get_text().split()) annex_match = re.match(r"^Annex\s+([A-Za-z])", title, re.IGNORECASE) if annex_match: new_filename = f"annex-{annex_match.group(1).lower()}.{postfix}" else: header_regex = r"^(\d+)\s" match = re.match(header_regex, title) annex_sub_match = re.match( r"^([A-Z]\.\d+(?:\.\d+)*)\s+(.+)", title ) if annex_sub_match: clause_id = annex_sub_match.group(1).lower() slug = slugify_heading_text(annex_sub_match.group(2)) if slug: new_filename = f"{clause_id}-{slug}.{postfix}" else: new_filename = f"{clause_id}.{postfix}" else: match = re.match(r"^(\d+)\s+(.+)", title) if match: chapter_number = match.group(1) if postfix == "md" and chapter_number == "1": new_filename = f"{SCOPE}.{postfix}" elif postfix == "md" and chapter_number == "2": new_filename = f"{REFS}.{postfix}" elif postfix == "md" and chapter_number == "3": new_filename = f"{DEFS}.{postfix}" slug = slugify_heading_text(match.group(2)) if slug: new_filename = f"{chapter_number}-{slug}.{postfix}" else: new_filename = f"clause-{chapter_number}.{postfix}" else: # it is a clause without a number, just use the filename without the leading number slug = slugify_heading_text(title) if slug: new_filename = f"{slug}.{postfix}" else: new_filename = re.sub(r"^\d+-", "", filename).lower() return new_filename Loading