Adding debug for release notes diff + renaming html files (2904106d) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

generateSpecWebSite/md_to_docx_converter/advancedTOCLogic.js

+22 −4

Original line number	Diff line number	Diff line
		@@ -55,11 +55,29 @@ function getIdFromURL() {
		}
		}
		} else {
		// no anchor, deduct from filename
		const filename = window.location.pathname.split("/").pop().split(".")[0];
		const topHeadingId = filename.split("-")[1];
		// no anchor — match the current page filename against TOC hrefs
		const filename = window.location.pathname.split("/").pop();
		if (filename) {
		const tocLink = Array.from(document.querySelectorAll("nav#TOC a")).find(
		(a) => {
		const href = a.getAttribute("href") \|\| "";
		const hrefFile = href.split("#")[0].replace(/^\.\//, "");
		return hrefFile === filename;
		}
		);
		if (tocLink && tocLink.id) {
		tocId = tocLink.id;
		}
		}
		if (!tocId && filename) {
		// fallback for pandoc-style {number}-{slug}.html names
		const basename = filename.split(".")[0];
		const parts = basename.split("-");
		const topHeadingId =
		parts.length > 1 ? parts.slice(1).join("-") : basename;
		tocId = `toc-${topHeadingId}`;
		}
		}
		return tocId;
		}

generateSpecWebSite/md_to_docx_converter/convert.py

+125 −6

Original line number	Diff line number	Diff line
		@@ -18,6 +18,7 @@ from src.utils import (
		get_output_doc_path,
		p_error,
		p_label,
		p_warning,
		validate_src_directory,
		validate_type,
		validate_conversion,
		@@ -35,7 +36,10 @@ from src.to_md.preprocessing import preprocess as preprocess_md
		from src.to_md.postprocessing import postprocess as postprocess_md

		from src.to_html.preprocessing import preprocess as preprocess_html
		from src.to_html.postprocessing import postprocess as postprocess_html
		from src.to_html.postprocessing import (
		postprocess as postprocess_html,
		rename_html_files_by_heading,
		)
		from src.to_html.postprocessing import clone_toc_in_file as clone_toc_in_file_html

		from src.to_docx.preprocessing import preprocess as preprocess_docx
		@@ -383,6 +387,9 @@ def convert(conversion_args=None):
		shutil.copy2(css_file, os.path.join(FILEGEN_DIR, FOLDER))
		shutil.copy("advancedTOCLogic.js", DEST)

		with get_timer().section("Rename HTML files to heading-based names"):
		rename_html_files_by_heading(DEST)

		# Cleanup the consolidated Markdown
		handle_consolidated_md("delete", SRC, CONSOLIDATED_MD_PATH)

		@@ -492,6 +499,56 @@ def convert(conversion_args=None):
		convert_md_to_html()


		def _editor_text_length(html_path: str) -> int \| None:
		"""Return #editor text length, -1 if no editor div, None if file missing."""
		if not os.path.isfile(html_path):
		return None
		try:
		with open(html_path, encoding="utf-8") as html_file:
		soup = BeautifulSoup(html_file.read(), "html.parser")
		except OSError:
		return None
		editor = soup.find("div", id="editor")
		if editor is None:
		return -1
		return len(editor.get_text(strip=True))


		def _log_diff_html_inventory(diff_source_dir: str, dest_dir: str) -> None:
		"""Log base vs new HTML filenames to spot pairing mismatches."""
		if not os.path.isdir(diff_source_dir):
		print(
		p_warning(
		f"[diff-debug] Base HTML directory does not exist: {diff_source_dir}"
		)
		)
		return
		base_files = {
		name
		for name in os.listdir(diff_source_dir)
		if name.endswith(".html")
		}
		new_files = {name for name in os.listdir(dest_dir) if name.endswith(".html")}
		only_in_new = sorted(new_files - base_files)
		only_in_base = sorted(base_files - new_files)
		print(
		f"[diff-debug] HTML inventory: base={len(base_files)}, new={len(new_files)}, "
		f"paired={len(base_files & new_files)}"
		)
		if only_in_new:
		print(
		p_warning(
		f"[diff-debug] New HTML without base counterpart ({len(only_in_new)}): "
		f"{', '.join(only_in_new)}"
		)
		)
		if only_in_base:
		print(
		f"[diff-debug] Base HTML without new counterpart ({len(only_in_base)}): "
		f"{', '.join(only_in_base)}"
		)


		def get_comparing_folder_from_git():
		# Check if the DIFF_PATH is a git repo
		if not git_helper.is_git_repo(SRC):
		@@ -557,9 +614,27 @@ def get_comparing_folder_from_git():
		sys.exit(1)
		else:
		branch_target = None
		cached_base_commit, cached_rename_scheme = load_target_commit(
		os.path.join(FILEGEN_DIR, f"{FOLDER}-base")
		)
		print(
		f"[diff-debug] Diff git target: requested={p_label(GIT_CHECKOUT_NAME)}, "
		f"resolved={p_label(target_to_checkout)}"
		f"{f', branch={p_label(branch_target)}' if branch_target else ''}"
		)
		print(
		f"[diff-debug] Current repo state: branch={p_label(current_branch)}, "
		f"commit={p_label(current_commit_hash)}"
		)
		print(
		f"[diff-debug] Cached base commit: {p_label(cached_base_commit or '(none)')}, "
		f"rename scheme: {p_label(cached_rename_scheme or '(none)')}, "
		f"matches resolved={cached_base_commit == target_to_checkout}, "
		f"matches scheme={cached_rename_scheme == HTML_RENAME_SCHEME}"
		)
		if (
		load_target_commit(os.path.join(FILEGEN_DIR, f"{FOLDER}-base"))
		== target_to_checkout
		cached_base_commit == target_to_checkout
		and cached_rename_scheme == HTML_RENAME_SCHEME
		):
		print(
		f"The base HTML files for commit {p_label(target_to_checkout)} {f'(branch: {p_label(branch_target)}) ' if branch_target else ''}have already been generated. Using existing files for diff..."
		@@ -677,17 +752,25 @@ def restore_original_state(original_branch, changes_stashed):
		apply_stash(changes_stashed)


		HTML_RENAME_SCHEME = "heading-title-v2"


		def save_target_commit(target_commit, path):
		    """Record which commit and rename scheme produced the cached base HTML.

		    Writes ``<path>/.gittargetcommit`` with two lines: the commit identifier
		    first, then the current ``HTML_RENAME_SCHEME``. The commit is written
		    exactly once so that the first line of the file is the bare identifier.

		    Args:
		        target_commit: Commit identifier to store.
		        path: Existing directory that receives the marker file.
		    """
		    marker_path = os.path.join(path, ".gittargetcommit")
		    with open(marker_path, "w", encoding="utf-8") as f:
		        f.write(f"{target_commit}\n{HTML_RENAME_SCHEME}\n")


		def load_target_commit(path):
		    """Read the cached commit marker stored in ``<path>/.gittargetcommit``.

		    Args:
		        path: Directory expected to contain the marker file.

		    Returns:
		        A ``(commit, scheme)`` tuple. ``commit`` is the first line of the
		        file and ``scheme`` the second; ``scheme`` is ``None`` when the file
		        has a single line. ``(None, None)`` is returned when the file does
		        not exist or is blank.
		    """
		    target_commit_path = os.path.join(path, ".gittargetcommit")
		    if not os.path.exists(target_commit_path):
		        return None, None

		    with open(target_commit_path, "r", encoding="utf-8") as f:
		        lines = f.read().strip().splitlines()
		    if not lines:
		        return None, None

		    commit = lines[0]
		    # A one-line marker carries no scheme, so it is reported as None.
		    scheme = lines[1] if len(lines) > 1 else None
		    return commit, scheme


		### Run script
		@@ -727,18 +810,54 @@ if SRC_TYPE == "md" and DEST_TYPE == "html":
		else:
		diff_source_dir = os.path.join(FILEGEN_DIR, f"{FOLDER}-base", "html")
		print(f"Applying diff from source directory: {diff_source_dir}...")
		_log_diff_html_inventory(diff_source_dir, DEST)
		# Iterate through new HTML files and create diffs
		filename_count_mapping = {}
		missing_base_files: list[str] = []
		empty_base_editors: list[str] = []
		for filename in os.listdir(DEST):
		if filename.endswith(".html"):
		new_file_path = os.path.join(DEST, filename)
		comparing_file_path = os.path.join(diff_source_dir, filename)
		old_editor_len = _editor_text_length(comparing_file_path)
		new_editor_len = _editor_text_length(new_file_path)
		if old_editor_len is None:
		missing_base_files.append(filename)
		print(
		p_warning(
		f"[diff-debug] No base HTML for '{filename}' "
		f"(expected: {comparing_file_path}) — page will diff as all-new"
		)
		)
		elif old_editor_len == 0:
		empty_base_editors.append(filename)
		print(
		p_warning(
		f"[diff-debug] Base HTML for '{filename}' has empty #editor "
		f"(new #editor={new_editor_len} chars)"
		)
		)
		# Create diff and overwrite the new file
		count = make_trackchanges_diff(
		comparing_file_path, new_file_path, new_file_path
		)
		print(
		f"[diff-debug] {filename}: old_editor="
		f"{old_editor_len if old_editor_len is not None else 'missing'} chars, "
		f"new_editor={new_editor_len} chars, changes={count or 0}"
		)
		if count and count > 0:
		filename_count_mapping[filename] = count
		print(
		f"[diff-debug] Diff summary: pages_with_changes={len(filename_count_mapping)}, "
		f"missing_base={len(missing_base_files)}, empty_base_editor={len(empty_base_editors)}"
		)
		if missing_base_files:
		print(
		p_warning(
		f"[diff-debug] Pages missing base HTML: {', '.join(missing_base_files)}"
		)
		)
		shutil.copy("diffVisualizer.js", DEST)
		toc_soup = None
		if os.path.exists(toc_path):

generateSpecWebSite/md_to_docx_converter/html_diff.py

+11 −5

Original line number	Diff line number	Diff line
		@@ -764,14 +764,20 @@ def get_element_repr(elem):
		def make_trackchanges_diff(
		old_path: str, new_path: str, out_path: str = "diff.html"
		) -> int \| None:
		old_path_obj = Path(old_path)
		new_path_obj = Path(new_path)
		try:
		old_html = Path(old_path).read_text(encoding="utf-8")
		except Exception as e:
		old_html = old_path_obj.read_text(encoding="utf-8")
		except FileNotFoundError:
		old_html = "<html><body><div id='editor'></div></body></html>"
		except OSError as exc:
		print(f"[diff-debug] Failed to read old HTML '{old_path}': {exc}")
		old_html = "<html><body><div id='editor'></div></body></html>"

		try:
		new_html = Path(new_path).read_text(encoding="utf-8")
		except Exception as e:
		new_html = new_path_obj.read_text(encoding="utf-8")
		except OSError as exc:
		print(f"[diff-debug] Failed to read new HTML '{new_path}': {exc}")
		new_html = "<html><body><div id='editor'></div></body></html>"

		# Parse with BeautifulSoup

generateSpecWebSite/md_to_docx_converter/src/to_html/postprocessing.py

+77 −26

Original line number	Diff line number	Diff line
		@@ -4,7 +4,7 @@ from bs4 import BeautifulSoup, Tag, NavigableString
		from ..time_book import get_timer

		from src.utils import (
		apply_renaming_logic,
		apply_filename_mapping_to_html,
		get_dirty_filenames_mapping_with_expected_filenames,
		p_error,
		p_warning,
		@@ -45,15 +45,80 @@ def fix_toc_links(soup: BeautifulSoup, filenames_mapping: dict):
		Fixes the table of contents links in the HTML by updating their href attributes
		based on the provided filenames mapping.
		"""
		toc_links = soup.select("#TOC a")
		return apply_filename_mapping_to_html(soup, filenames_mapping)

		for link in toc_links:
		href = link.get("href", "")
		before_ash, after_ash = href.split("#", 1) if "#" in href else (href, "")
		if before_ash in filenames_mapping:
		link["href"] = f"{filenames_mapping[before_ash]}#{after_ash}"

		return soup
		def apply_filename_mapping_to_html_dir(
		    html_dir: str, filenames_mapping: dict[str, str]
		) -> None:
		    """Rewrite internal links in every HTML file after chunk renaming.

		    Does nothing when no entry of ``filenames_mapping`` actually changes a
		    name. Otherwise every ``*.html`` file directly inside ``html_dir`` is
		    parsed, passed through ``apply_filename_mapping_to_html`` and written
		    back in place as UTF-8.
		    """
		    nothing_renamed = all(
		        source == target for source, target in filenames_mapping.items()
		    )
		    if nothing_renamed:
		        return

		    for entry in os.listdir(html_dir):
		        if entry.endswith(".html"):
		            page_path = os.path.join(html_dir, entry)
		            with open(page_path, encoding="utf-8") as reader:
		                page = BeautifulSoup(reader.read(), "html.parser")
		            page = apply_filename_mapping_to_html(page, filenames_mapping)
		            with open(page_path, "w", encoding="utf-8") as writer:
		                writer.write(str(page))


		def rename_html_files_by_heading(html_dir: str) -> dict[str, str]:
		    """
		    Rename pandoc sequential chunked HTML files to stable names derived from each
		    page's H1 heading (e.g. 5-conventions.html -> 4-conventions.html when the
		    heading is "4 Conventions").

		    All collisions are resolved before any file is touched. A rename is
		    dropped, with a warning, when its target name is kept by a file that is
		    not moving or is already claimed by an earlier rename; that page keeps
		    its pandoc name. Links inside the HTML files are then rewritten with
		    the final mapping.

		    Args:
		        html_dir: Directory containing the chunked HTML files.

		    Returns:
		        The mapping of original file name to final file name; dropped
		        renames map a name to itself.
		    """
		    filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(html_dir)
		    pending = {
		        old_name: new_name
		        for old_name, new_name in filenames_mapping.items()
		        if old_name != new_name
		    }
		    if not pending:
		        return filenames_mapping

		    # Planning pass. os.rename() silently replaces an existing destination on
		    # POSIX, so a collision discovered halfway through the renames could
		    # overwrite a page that was already moved. Dropping a rename frees
		    # nothing but re-occupies its old name, hence the re-check after each drop.
		    on_disk = set(os.listdir(html_dir))
		    restored: list[str] = []
		    while True:
		        # Names that stay occupied: everything on disk that is not moving away.
		        taken = on_disk - pending.keys()
		        blocked = None
		        for old_name, new_name in pending.items():
		            if new_name in taken:
		                blocked = old_name
		                break
		            taken.add(new_name)
		        if blocked is None:
		            break
		        print(
		            p_warning(
		                f"HTML rename collision: '{pending[blocked]}' already exists; "
		                f"keeping pandoc name '{blocked}'"
		            )
		        )
		        del pending[blocked]
		        restored.append(blocked)
		        filenames_mapping[blocked] = blocked

		    # Two-phase rename through a temporary suffix so that chains and swaps
		    # (a -> b while b -> c) never step on each other.
		    temp_suffix = ".heading-rename"
		    for old_name in pending:
		        os.rename(
		            os.path.join(html_dir, old_name),
		            os.path.join(html_dir, old_name + temp_suffix),
		        )
		    for old_name, new_name in pending.items():
		        os.rename(
		            os.path.join(html_dir, old_name + temp_suffix),
		            os.path.join(html_dir, new_name),
		        )

		    apply_filename_mapping_to_html_dir(html_dir, filenames_mapping)

		    print(
		        f"[diff-debug] Renamed {len(pending)} HTML file(s) to heading-based names"
		    )
		    for old_name, new_name in sorted(pending.items()):
		        print(f"[diff-debug] {old_name} -> {new_name}")

		    return filenames_mapping


		def get_document_title_from_html(html_dir: str, front_page: str = "front-page.html"):
		@@ -1072,34 +1137,20 @@ def postprocess(html_dir: str, no_lazy_toc: bool = False):
		### Arguments
		- `html_dir`: Directory containing the HTML files to be processed
		"""
		with get_timer().section("Postprocessing: 1 - Get dirty filenames mapping"):
		filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(
		html_dir
		)
		with get_timer().section("Postprocessing: 1-2 - Rename HTML files by heading"):
		filenames_mapping = rename_html_files_by_heading(html_dir)
		images_mapping = {}
		html_files = []
		processed_soups = []

		# Read and rename all HTML files
		with get_timer().section("Postprocessing: 2 - Read and Rename HTML Files"):
		with get_timer().section("Postprocessing: 2 - Read renamed HTML files"):
		for filename in os.listdir(html_dir):
		if filename.endswith(".html"):
		with open(
		os.path.join(html_dir, filename), "r", encoding="utf-8"
		) as file:
		html_content = file.read()

		if filename == "index.html":
		new_filename = filename
		else:
		new_filename = apply_renaming_logic(html_content, filename, "html")

		os.rename(
		os.path.join(html_dir, filename),
		os.path.join(html_dir, new_filename),
		)

		html_files.append((new_filename, html_content))
		html_files.append((filename, html_content))

		# First pass: process all HTML files while keeping soup in memory

generateSpecWebSite/md_to_docx_converter/src/utils.py

+86 −18

Original line number	Diff line number	Diff line
		@@ -488,6 +488,61 @@ def get_output_doc_path(dest: str):
		return f"{dest}/{OUTPUT_DOC_NAME}"


		def slugify_heading_text(text: str) -> str:
		    """Turn heading words into a stable filename slug.

		    The text is lower-cased, parenthesised remarks such as "(informative)"
		    are dropped, characters other than word characters, whitespace and
		    hyphens are removed, and runs of whitespace, underscores or hyphens
		    collapse into a single hyphen.

		    Args:
		        text: Heading text without its clause number.

		    Returns:
		        The slug, with no leading or trailing hyphen; may be empty.
		    """
		    text = text.lower().strip()
		    # Remove "(...)" groups; the parentheses must be escaped to be literal.
		    text = re.sub(r"\([^)]*\)", "", text)
		    text = re.sub(r"[^\w\s-]", "", text)
		    text = re.sub(r"[\s_]+", "-", text)
		    text = re.sub(r"-+", "-", text)
		    return text.strip("-")


		def find_page_h1(soup: BeautifulSoup):
		    """Return the page H1, preferring the main #editor content area.

		    The lookup is limited to ``div#editor`` when that element is present and
		    covers the whole document otherwise. Within that scope an ``h1`` that
		    carries an ``id`` attribute wins over a plain ``h1``.
		    """
		    editor_div = soup.find("div", id="editor")
		    scope = editor_div or soup
		    heading_with_id = scope.find("h1", id=True)
		    if heading_with_id:
		        return heading_with_id
		    return scope.find("h1")


		def split_html_href(href: str) -> tuple[str, str, str]:
		    """Break an href into ``(path_prefix, filename, fragment)``.

		    ``path_prefix`` is ``"./"`` when the href starts with it and ``""``
		    otherwise. The remainder is cut at the first ``#``: the text before it is
		    the filename, the text after it the fragment (``""`` when absent).
		    """
		    if href.startswith("./"):
		        prefix, remainder = "./", href[2:]
		    else:
		        prefix, remainder = "", href
		    filename, _, fragment = remainder.partition("#")
		    return prefix, filename, fragment


		def remap_html_href(href: str, filenames_mapping: dict[str, str]) -> str \| None:
		"""Return an updated href when filename is in the mapping, else None."""
		if not href or href.startswith(("http://", "https://", "mailto:", "#")):
		return None
		prefix, filename, fragment = split_html_href(href)
		if not filename or filename not in filenames_mapping:
		return None
		new_name = filenames_mapping[filename]
		if new_name == filename and not fragment:
		return None
		new_href = f"{prefix}{new_name}"
		if fragment:
		new_href = f"{new_href}#{fragment}"
		return new_href


		def apply_filename_mapping_to_html(
		    soup: BeautifulSoup, filenames_mapping: dict[str, str]
		) -> BeautifulSoup:
		    """Update internal page links after HTML files are renamed.

		    Every ``<a>`` element that has an ``href`` is offered to
		    ``remap_html_href``; the attribute is replaced only when a non-empty
		    replacement comes back. The soup is modified in place and returned.
		    """
		    anchors = soup.find_all("a", href=True)
		    for anchor in anchors:
		        replacement = remap_html_href(anchor["href"], filenames_mapping)
		        if not replacement:
		            continue
		        anchor["href"] = replacement
		    return soup


		def apply_renaming_logic(text: str, filename: str, postfix: str) -> str:
		new_filename = filename
		if postfix == "md":
		@@ -514,26 +569,39 @@ def apply_renaming_logic(text: str, filename: str, postfix: str) -> str:
		if soup.find("div", class_="ZA"):
		new_filename = f"front-page.{postfix}"
		else:
		title_tag = soup.find("h1", id=True)
		title = title_tag.get_text()
		if title.startswith("Annex"):
		annex_number = title.split()[1].lower().replace(":", "")
		new_filename = f"annex-{annex_number}.{postfix}"
		title_tag = find_page_h1(soup)
		if title_tag is None:
		return re.sub(r"^\d+-", "", filename).lower()

		title = " ".join(title_tag.get_text().split())
		annex_match = re.match(r"^Annex\s+([A-Za-z])", title, re.IGNORECASE)
		if annex_match:
		new_filename = f"annex-{annex_match.group(1).lower()}.{postfix}"
		else:
		header_regex = r"^(\d+)\s"
		match = re.match(header_regex, title)
		annex_sub_match = re.match(
		r"^([A-Z]\.\d+(?:\.\d+)*)\s+(.+)", title
		)
		if annex_sub_match:
		clause_id = annex_sub_match.group(1).lower()
		slug = slugify_heading_text(annex_sub_match.group(2))
		if slug:
		new_filename = f"{clause_id}-{slug}.{postfix}"
		else:
		new_filename = f"{clause_id}.{postfix}"
		else:
		match = re.match(r"^(\d+)\s+(.+)", title)
		if match:
		chapter_number = match.group(1)
		if postfix == "md" and chapter_number == "1":
		new_filename = f"{SCOPE}.{postfix}"
		elif postfix == "md" and chapter_number == "2":
		new_filename = f"{REFS}.{postfix}"
		elif postfix == "md" and chapter_number == "3":
		new_filename = f"{DEFS}.{postfix}"
		slug = slugify_heading_text(match.group(2))
		if slug:
		new_filename = f"{chapter_number}-{slug}.{postfix}"
		else:
		new_filename = f"clause-{chapter_number}.{postfix}"
		else:
		# it is a clause without a number, just use the filename without the leading number
		slug = slugify_heading_text(title)
		if slug:
		new_filename = f"{slug}.{postfix}"
		else:
		new_filename = re.sub(r"^\d+-", "", filename).lower()
		return new_filename