feat: enhance preprocessing and postprocessing with timing functionality and improved file handling (501186aa) · Commits · CIM - Context Information Management / NGSI-LD API

md_to_docx_converter/src/to_html/postprocessing.py

+86 −71

Original line number	Diff line number	Diff line
		import os, re, html, json
		from bs4 import BeautifulSoup, Tag, NavigableString
		from ..time_book import get_timer

		from src.utils import (
		apply_renaming_logic,
		@@ -342,6 +343,7 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
		date_text["data-replace"] = "DATE"
		date_text.string = text
		close_bracket = NavigableString(")")
		date.clear()
		date.append(open_bracket)
		date.append(date_text)
		date.append(close_bracket)
		@@ -624,21 +626,21 @@ def shorten_toc_text(soup: BeautifulSoup):
		"""
		Remove informative/normative from TOC only
		"""
		tocTexts = soup.select("#TOC .norm")
		for tocText in tocTexts:
		tocText.decompose()
		# Find TOC element once to avoid repeated document searches
		toc = soup.select_one("#TOC")

		tocTexts = soup.select(".norm")
		for tocText in tocTexts:
		tocText.string = "Normative"
		if toc:
		# Remove norm and inform elements from TOC first (reduces DOM size for subsequent searches)
		for elem in toc.find_all(class_=["norm", "inform"]):
		elem.decompose()

		tocTexts = soup.select("#TOC .inform")
		for tocText in tocTexts:
		tocText.decompose()
		# Update remaining norm elements (now only outside TOC)
		for elem in soup.find_all(class_="norm"):
		elem.string = "Normative"

		tocTexts = soup.select(".inform")
		for tocText in tocTexts:
		tocText.string = "Informative"
		# Update remaining inform elements (now only outside TOC)
		for elem in soup.find_all(class_="inform"):
		elem.string = "Informative"

		return soup

		@@ -917,68 +919,81 @@ def postprocess(html_dir: str):
		### Arguments
		- `html_dir`: Directory containing the HTML files to be processed
		"""
		filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(html_dir)
		with get_timer().section("Postprocessing: 1 - Get dirty filenames mapping"):
		filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(
		html_dir
		)
		images_mapping = {}
		html_files = []
		processed_soups = []

		# Read and rename all HTML files
		with get_timer().section("Postprocessing: 2 - Read and Rename HTML Files"):
		for filename in os.listdir(html_dir):
		if filename.endswith(".html"):
		with open(os.path.join(html_dir, filename), "r", encoding="utf-8") as file:
		html = file.read()
		with open(
		os.path.join(html_dir, filename), "r", encoding="utf-8"
		) as file:
		html_content = file.read()

		if filename == "index.html":
		new_filename = filename
		else:
		new_filename = apply_renaming_logic(html, filename, "html")
		new_filename = apply_renaming_logic(html_content, filename, "html")

		os.rename(
		os.path.join(html_dir, filename), os.path.join(html_dir, new_filename)
		os.path.join(html_dir, filename),
		os.path.join(html_dir, new_filename),
		)
		file_path = os.path.join(html_dir, new_filename)

		with open(file_path, "r", encoding="utf-8") as html:
		soup = BeautifulSoup(html, "html.parser")
		html_files.append((new_filename, html_content))

		# First pass: process all HTML files while keeping soup in memory
		with get_timer().section("Postprocessing: 3 - First Pass - Process HTML Files"):
		for new_filename, html_content in html_files:
		with get_timer().section(
		f"Postprocessing: 3.a - Processing {new_filename}"
		):
		soup = BeautifulSoup(html_content, "html.parser")
		soup = shorten_toc_text(soup)
		soup = remove_code_blocks_with_only_images(soup)
		soup = format_examples_and_notes(soup)
		soup = format_tables(soup)

		if new_filename == "front-page.html":
		soup = format_front_page(soup)

		if (
		new_filename.replace(".html", "") in files_with_references
		): # Reference-specific formatting
		soup = format_references(soup)
		else:
		soup = add_links_to_references_in_text(soup)

		soup = fix_toc_links(soup, filenames_mapping)
		soup = move_dangling_brackets_out_of_links(soup)
		soup = fix_ex_json_spacing(soup)
		soup = unwrap_gt_lt_code_tags(soup)

		soup = handle_ew_div(soup)

		soup = remove_links_from_labels(soup)
		soup = add_ids_to_labels(soup)
		soup = replace_dash_characters(soup)
		soup = move_figure_id_to_FL_elements(soup)
		soup = fix_custom_tags(soup)
		soup = fix_lists(soup)

		images, soup = extract_images_from_html(soup)
		for image_id, image_src in images.items():
		images_mapping[image_src] = {"id": image_id, "file": new_filename}

		contents = soup.decode_contents()
		images_mapping[image_src] = {
		"id": image_id,
		"file": new_filename,
		}

		with open(file_path, "w", encoding="utf-8") as html:
		html.write(contents)
		# Keep soup in memory instead of writing and re-reading
		processed_soups.append((new_filename, soup))

		for filename in os.listdir(html_dir):
		if filename.endswith(".html"):
		# Second pass: reuse soup objects already in memory
		with get_timer().section(
		"Postprocessing: 4 - Second Pass - Add Links and Write Files"
		):
		for filename, soup in processed_soups:
		file_path = os.path.join(html_dir, filename)
		with open(file_path, "r", encoding="utf-8") as html:
		soup = BeautifulSoup(html, "html.parser")

		try:
		soup = add_custom_link_to_images(soup, images_mapping)
		@@ -988,8 +1003,8 @@ def postprocess(html_dir: str):
		print(p_error(str(e)))
		os._exit(1)

		# Write the final file only once
		contents = soup.decode_contents()

		with open(file_path, "w", encoding="utf-8") as html:
		html.write(contents)

md_to_docx_converter/src/to_html/preprocessing.py

+88 −77

Original line number	Diff line number	Diff line
		import os, re, os, json
		import os, re, json
		import sys
		from typing_extensions import Literal
		from ..time_book import get_timer

		from src.constants import (
		NORMATIVE_REF_FILE,
		@@ -28,8 +29,6 @@ files_with_references = [NORMATIVE_REF_FILE, INFORMATIVE_REF_FILE]


		# region Helpers


		def undo_prettier_formatting(text: str) -> str:
		"""Undo any formatting changes made by Prettier to ensure the Markdown is in a more raw format for processing."""

		@@ -649,10 +648,16 @@ def add_ids_to_references(file_contents: str, filename: str):
		return f'<span id="{new_ref}" />[{new_ref}]'

		file_contents = re.sub(
		REF_REGEX_I, replace_informative_ref, file_contents, flags=re.MULTILINE
		REF_REGEX_I,
		replace_informative_ref,
		file_contents,
		flags=re.MULTILINE,
		)
		file_contents = re.sub(
		REF_REGEX_N, replace_normative_ref, file_contents, flags=re.MULTILINE
		REF_REGEX_N,
		replace_normative_ref,
		file_contents,
		flags=re.MULTILINE,
		)

		with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file:
		@@ -666,8 +671,6 @@ def add_ids_to_references(file_contents: str, filename: str):


		# endregion


		def preprocess(
		src: str, src_type: str, consolidated_md_path: str, file_order_json: str
		):
		@@ -699,27 +702,32 @@ def preprocess(
		annexes = DEFAULT_ANNEXES

		# create REFERENCE_MAPPING_MD_TO_HTML file locally if it doesn't exist
		with get_timer().section("Preprocessing: 1 - Creating reference mapping file"):
		if not os.path.exists(REFERENCE_MAPPING_MD_TO_HTML):
		with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file:
		json.dump({}, ref_file, indent=4)

		with get_timer().section("Preprocessing: 2 - Preprocessing Markdown files"):
		if file_order_json:
		with open(file_order_json, "r") as file:
		json_data = json.load(file)
		clauses = json_data.get("clauses")
		annexes = json_data.get("annexes")

		files, clauses_filenames, annexes_filenames = get_file_order(src, clauses, annexes)
		files, clauses_filenames, annexes_filenames = get_file_order(
		src, clauses, annexes
		)
		files = [f"{filename}.md" for filename in files]
		clauses_filenames = [f"{filename}.md" for filename in clauses_filenames]
		annexes_filenames = [f"{filename}.md" for filename in annexes_filenames]
		preprocessed_filenames = []
		with get_timer().section("Preprocessing: 3 - Processing individual files"):
		for filename in files:
		filename_without_extension = filename[:-3] # Remove .md extension
		if filename.endswith(src_type) and filename != "consolidated.md":
		input_path = os.path.join(src, filename)
		try:
		text = open(input_path, "r", encoding="utf-8").read()
		with open(input_path, "r", encoding="utf-8") as file:
		text = file.read()

		text = undo_prettier_formatting(text)
		run_format_checks(filename, text.splitlines())
		@@ -749,7 +757,8 @@ def preprocess(
		r"([\w-]+?).md", r"--preprocessed--\1.md", filename
		) # Ensure file order is preserved by keeping the number in front
		output_path = os.path.join(src, new_filename)
		open(output_path, "w", encoding="utf-8").write(text)
		with open(output_path, "w", encoding="utf-8") as file:
		file.write(text)
		preprocessed_filenames.append(new_filename)
		except Exception as e:
		# print(f"Error: {e}")
		@@ -766,8 +775,10 @@ def preprocess(
		if f.startswith("--preprocessed--"):
		os.remove(os.path.join(src, f))
		sys.exit(1)
		pass

		handle_consolidated_md("create", src, consolidated_md_path, preprocessed_filenames)
		p_warning(f"Warning: Could not preprocess {input_path}: {e}")
		with get_timer().section("Preprocessing: 4 - Creating consolidated Markdown file"):
		handle_consolidated_md(
		"create", src, consolidated_md_path, preprocessed_filenames
		)

		return filename_numbers_mapping