Loading md_to_docx_converter/src/to_html/postprocessing.py +86 −71 Original line number Diff line number Diff line import os, re, html, json from bs4 import BeautifulSoup, Tag, NavigableString from ..time_book import get_timer from src.utils import ( apply_renaming_logic, Loading Loading @@ -342,6 +343,7 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup: date_text["data-replace"] = "DATE" date_text.string = text close_bracket = NavigableString(")") date.clear() date.append(open_bracket) date.append(date_text) date.append(close_bracket) Loading Loading @@ -624,21 +626,21 @@ def shorten_toc_text(soup: BeautifulSoup): """ Remove informative/normative from TOC only """ tocTexts = soup.select("#TOC .norm") for tocText in tocTexts: tocText.decompose() # Find TOC element once to avoid repeated document searches toc = soup.select_one("#TOC") tocTexts = soup.select(".norm") for tocText in tocTexts: tocText.string = "Normative" if toc: # Remove norm and inform elements from TOC first (reduces DOM size for subsequent searches) for elem in toc.find_all(class_=["norm", "inform"]): elem.decompose() tocTexts = soup.select("#TOC .inform") for tocText in tocTexts: tocText.decompose() # Update remaining norm elements (now only outside TOC) for elem in soup.find_all(class_="norm"): elem.string = "Normative" tocTexts = soup.select(".inform") for tocText in tocTexts: tocText.string = "Informative" # Update remaining inform elements (now only outside TOC) for elem in soup.find_all(class_="inform"): elem.string = "Informative" return soup Loading Loading @@ -917,68 +919,81 @@ def postprocess(html_dir: str): ### Arguments - `html_dir`: Directory containing the HTML files to be processed """ filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(html_dir) with get_timer().section("Postprocessing: 1 - Get dirty filenames mapping"): filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames( html_dir ) images_mapping = {} html_files = [] processed_soups = [] # Read and rename all HTML files with get_timer().section("Postprocessing: 2 - Read and Rename HTML Files"): for filename in os.listdir(html_dir): if filename.endswith(".html"): with open(os.path.join(html_dir, filename), "r", encoding="utf-8") as file: html = file.read() with open( os.path.join(html_dir, filename), "r", encoding="utf-8" ) as file: html_content = file.read() if filename == "index.html": new_filename = filename else: new_filename = apply_renaming_logic(html, filename, "html") new_filename = apply_renaming_logic(html_content, filename, "html") os.rename( os.path.join(html_dir, filename), os.path.join(html_dir, new_filename) os.path.join(html_dir, filename), os.path.join(html_dir, new_filename), ) file_path = os.path.join(html_dir, new_filename) with open(file_path, "r", encoding="utf-8") as html: soup = BeautifulSoup(html, "html.parser") html_files.append((new_filename, html_content)) # First pass: process all HTML files while keeping soup in memory with get_timer().section("Postprocessing: 3 - First Pass - Process HTML Files"): for new_filename, html_content in html_files: with get_timer().section( f"Postprocessing: 3.a - Processing {new_filename}" ): soup = BeautifulSoup(html_content, "html.parser") soup = shorten_toc_text(soup) soup = remove_code_blocks_with_only_images(soup) soup = format_examples_and_notes(soup) soup = format_tables(soup) if new_filename == "front-page.html": soup = format_front_page(soup) if ( new_filename.replace(".html", "") in files_with_references ): # Reference-specific formatting soup = format_references(soup) else: soup = add_links_to_references_in_text(soup) soup = fix_toc_links(soup, filenames_mapping) soup = move_dangling_brackets_out_of_links(soup) soup = fix_ex_json_spacing(soup) soup = unwrap_gt_lt_code_tags(soup) soup = handle_ew_div(soup) soup = remove_links_from_labels(soup) soup = add_ids_to_labels(soup) soup = replace_dash_characters(soup) soup = move_figure_id_to_FL_elements(soup) soup = fix_custom_tags(soup) soup = fix_lists(soup) images, soup = extract_images_from_html(soup) for image_id, image_src in images.items(): images_mapping[image_src] = {"id": image_id, "file": new_filename} contents = soup.decode_contents() images_mapping[image_src] = { "id": image_id, "file": new_filename, } with open(file_path, "w", encoding="utf-8") as html: html.write(contents) # Keep soup in memory instead of writing and re-reading processed_soups.append((new_filename, soup)) for filename in os.listdir(html_dir): if filename.endswith(".html"): # Second pass: reuse soup objects already in memory with get_timer().section( "Postprocessing: 4 - Second Pass - Add Links and Write Files" ): for filename, soup in processed_soups: file_path = os.path.join(html_dir, filename) with open(file_path, "r", encoding="utf-8") as html: soup = BeautifulSoup(html, "html.parser") try: soup = add_custom_link_to_images(soup, images_mapping) Loading @@ -988,8 +1003,8 @@ def postprocess(html_dir: str): print(p_error(str(e))) os._exit(1) # Write the final file only once contents = soup.decode_contents() with open(file_path, "w", encoding="utf-8") as html: html.write(contents) Loading md_to_docx_converter/src/to_html/preprocessing.py +88 −77 Original line number Diff line number Diff line import os, re, os, json import os, re, json import sys from typing_extensions import Literal from ..time_book import get_timer from src.constants import ( NORMATIVE_REF_FILE, Loading Loading @@ -28,8 +29,6 @@ files_with_references = [NORMATIVE_REF_FILE, INFORMATIVE_REF_FILE] # region Helpers def undo_prettier_formatting(text: str) -> str: """Undo any formatting changes made by Prettier to ensure the Markdown is in a more raw format for processing.""" Loading Loading @@ -649,10 +648,16 @@ def add_ids_to_references(file_contents: str, filename: str): return f'<span id="{new_ref}" />[{new_ref}]' file_contents = re.sub( REF_REGEX_I, replace_informative_ref, file_contents, flags=re.MULTILINE REF_REGEX_I, replace_informative_ref, file_contents, flags=re.MULTILINE, ) file_contents = re.sub( REF_REGEX_N, replace_normative_ref, file_contents, flags=re.MULTILINE REF_REGEX_N, replace_normative_ref, file_contents, flags=re.MULTILINE, ) with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file: Loading @@ -666,8 +671,6 @@ def add_ids_to_references(file_contents: str, filename: str): # endregion def preprocess( src: str, src_type: str, consolidated_md_path: str, file_order_json: str ): Loading Loading @@ -699,27 +702,32 @@ def preprocess( annexes = DEFAULT_ANNEXES # create REFERENCE_MAPPING_MD_TO_HTML file locally if it doesn't exist with get_timer().section("Preprocessing: 1 - Creating reference mapping file"): if not os.path.exists(REFERENCE_MAPPING_MD_TO_HTML): with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file: json.dump({}, ref_file, indent=4) with get_timer().section("Preprocessing: 2 - Preprocessing Markdown files"): if file_order_json: with open(file_order_json, "r") as file: json_data = json.load(file) clauses = json_data.get("clauses") annexes = json_data.get("annexes") files, clauses_filenames, annexes_filenames = get_file_order(src, clauses, annexes) files, clauses_filenames, annexes_filenames = get_file_order( src, clauses, annexes ) files = [f"{filename}.md" for filename in files] clauses_filenames = [f"{filename}.md" for filename in clauses_filenames] annexes_filenames = [f"{filename}.md" for filename in annexes_filenames] preprocessed_filenames = [] with get_timer().section("Preprocessing: 3 - Processing individual files"): for filename in files: filename_without_extension = filename[:-3] # Remove .md extension if filename.endswith(src_type) and filename != "consolidated.md": input_path = os.path.join(src, filename) try: text = open(input_path, "r", encoding="utf-8").read() with open(input_path, "r", encoding="utf-8") as file: text = file.read() text = undo_prettier_formatting(text) run_format_checks(filename, text.splitlines()) Loading Loading @@ -749,7 +757,8 @@ def preprocess( r"([\w-]+?).md", r"--preprocessed--\1.md", filename ) # Ensure file order is preserved by keeping the number in front output_path = os.path.join(src, new_filename) open(output_path, "w", encoding="utf-8").write(text) with open(output_path, "w", encoding="utf-8") as file: file.write(text) preprocessed_filenames.append(new_filename) except Exception as e: # print(f"Error: {e}") Loading @@ -766,8 +775,10 @@ def preprocess( if f.startswith("--preprocessed--"): os.remove(os.path.join(src, f)) sys.exit(1) pass handle_consolidated_md("create", src, consolidated_md_path, preprocessed_filenames) p_warning(f"Warning: Could not preprocess {input_path}: {e}") with get_timer().section("Preprocessing: 4 - Creating consolidated Markdown file"): handle_consolidated_md( "create", src, consolidated_md_path, preprocessed_filenames ) return filename_numbers_mapping Loading
md_to_docx_converter/src/to_html/postprocessing.py +86 −71 Original line number Diff line number Diff line import os, re, html, json from bs4 import BeautifulSoup, Tag, NavigableString from ..time_book import get_timer from src.utils import ( apply_renaming_logic, Loading Loading @@ -342,6 +343,7 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup: date_text["data-replace"] = "DATE" date_text.string = text close_bracket = NavigableString(")") date.clear() date.append(open_bracket) date.append(date_text) date.append(close_bracket) Loading Loading @@ -624,21 +626,21 @@ def shorten_toc_text(soup: BeautifulSoup): """ Remove informative/normative from TOC only """ tocTexts = soup.select("#TOC .norm") for tocText in tocTexts: tocText.decompose() # Find TOC element once to avoid repeated document searches toc = soup.select_one("#TOC") tocTexts = soup.select(".norm") for tocText in tocTexts: tocText.string = "Normative" if toc: # Remove norm and inform elements from TOC first (reduces DOM size for subsequent searches) for elem in toc.find_all(class_=["norm", "inform"]): elem.decompose() tocTexts = soup.select("#TOC .inform") for tocText in tocTexts: tocText.decompose() # Update remaining norm elements (now only outside TOC) for elem in soup.find_all(class_="norm"): elem.string = "Normative" tocTexts = soup.select(".inform") for tocText in tocTexts: tocText.string = "Informative" # Update remaining inform elements (now only outside TOC) for elem in soup.find_all(class_="inform"): elem.string = "Informative" return soup Loading Loading @@ -917,68 +919,81 @@ def postprocess(html_dir: str): ### Arguments - `html_dir`: Directory containing the HTML files to be processed """ filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(html_dir) with get_timer().section("Postprocessing: 1 - Get dirty filenames mapping"): filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames( html_dir ) images_mapping = {} html_files = [] processed_soups = [] # Read and rename all HTML files with get_timer().section("Postprocessing: 2 - Read and Rename HTML Files"): for filename in os.listdir(html_dir): if filename.endswith(".html"): with open(os.path.join(html_dir, filename), "r", encoding="utf-8") as file: html = file.read() with open( os.path.join(html_dir, filename), "r", encoding="utf-8" ) as file: html_content = file.read() if filename == "index.html": new_filename = filename else: new_filename = apply_renaming_logic(html, filename, "html") new_filename = apply_renaming_logic(html_content, filename, "html") os.rename( os.path.join(html_dir, filename), os.path.join(html_dir, new_filename) os.path.join(html_dir, filename), os.path.join(html_dir, new_filename), ) file_path = os.path.join(html_dir, new_filename) with open(file_path, "r", encoding="utf-8") as html: soup = BeautifulSoup(html, "html.parser") html_files.append((new_filename, html_content)) # First pass: process all HTML files while keeping soup in memory with get_timer().section("Postprocessing: 3 - First Pass - Process HTML Files"): for new_filename, html_content in html_files: with get_timer().section( f"Postprocessing: 3.a - Processing {new_filename}" ): soup = BeautifulSoup(html_content, "html.parser") soup = shorten_toc_text(soup) soup = remove_code_blocks_with_only_images(soup) soup = format_examples_and_notes(soup) soup = format_tables(soup) if new_filename == "front-page.html": soup = format_front_page(soup) if ( new_filename.replace(".html", "") in files_with_references ): # Reference-specific formatting soup = format_references(soup) else: soup = add_links_to_references_in_text(soup) soup = fix_toc_links(soup, filenames_mapping) soup = move_dangling_brackets_out_of_links(soup) soup = fix_ex_json_spacing(soup) soup = unwrap_gt_lt_code_tags(soup) soup = handle_ew_div(soup) soup = remove_links_from_labels(soup) soup = add_ids_to_labels(soup) soup = replace_dash_characters(soup) soup = move_figure_id_to_FL_elements(soup) soup = fix_custom_tags(soup) soup = fix_lists(soup) images, soup = extract_images_from_html(soup) for image_id, image_src in images.items(): images_mapping[image_src] = {"id": image_id, "file": new_filename} contents = soup.decode_contents() images_mapping[image_src] = { "id": image_id, "file": new_filename, } with open(file_path, "w", encoding="utf-8") as html: html.write(contents) # Keep soup in memory instead of writing and re-reading processed_soups.append((new_filename, soup)) for filename in os.listdir(html_dir): if filename.endswith(".html"): # Second pass: reuse soup objects already in memory with get_timer().section( "Postprocessing: 4 - Second Pass - Add Links and Write Files" ): for filename, soup in processed_soups: file_path = os.path.join(html_dir, filename) with open(file_path, "r", encoding="utf-8") as html: soup = BeautifulSoup(html, "html.parser") try: soup = add_custom_link_to_images(soup, images_mapping) Loading @@ -988,8 +1003,8 @@ def postprocess(html_dir: str): print(p_error(str(e))) os._exit(1) # Write the final file only once contents = soup.decode_contents() with open(file_path, "w", encoding="utf-8") as html: html.write(contents) Loading
md_to_docx_converter/src/to_html/preprocessing.py +88 −77 Original line number Diff line number Diff line import os, re, os, json import os, re, json import sys from typing_extensions import Literal from ..time_book import get_timer from src.constants import ( NORMATIVE_REF_FILE, Loading Loading @@ -28,8 +29,6 @@ files_with_references = [NORMATIVE_REF_FILE, INFORMATIVE_REF_FILE] # region Helpers def undo_prettier_formatting(text: str) -> str: """Undo any formatting changes made by Prettier to ensure the Markdown is in a more raw format for processing.""" Loading Loading @@ -649,10 +648,16 @@ def add_ids_to_references(file_contents: str, filename: str): return f'<span id="{new_ref}" />[{new_ref}]' file_contents = re.sub( REF_REGEX_I, replace_informative_ref, file_contents, flags=re.MULTILINE REF_REGEX_I, replace_informative_ref, file_contents, flags=re.MULTILINE, ) file_contents = re.sub( REF_REGEX_N, replace_normative_ref, file_contents, flags=re.MULTILINE REF_REGEX_N, replace_normative_ref, file_contents, flags=re.MULTILINE, ) with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file: Loading @@ -666,8 +671,6 @@ def add_ids_to_references(file_contents: str, filename: str): # endregion def preprocess( src: str, src_type: str, consolidated_md_path: str, file_order_json: str ): Loading Loading @@ -699,27 +702,32 @@ def preprocess( annexes = DEFAULT_ANNEXES # create REFERENCE_MAPPING_MD_TO_HTML file locally if it doesn't exist with get_timer().section("Preprocessing: 1 - Creating reference mapping file"): if not os.path.exists(REFERENCE_MAPPING_MD_TO_HTML): with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file: json.dump({}, ref_file, indent=4) with get_timer().section("Preprocessing: 2 - Preprocessing Markdown files"): if file_order_json: with open(file_order_json, "r") as file: json_data = json.load(file) clauses = json_data.get("clauses") annexes = json_data.get("annexes") files, clauses_filenames, annexes_filenames = get_file_order(src, clauses, annexes) files, clauses_filenames, annexes_filenames = get_file_order( src, clauses, annexes ) files = [f"{filename}.md" for filename in files] clauses_filenames = [f"{filename}.md" for filename in clauses_filenames] annexes_filenames = [f"{filename}.md" for filename in annexes_filenames] preprocessed_filenames = [] with get_timer().section("Preprocessing: 3 - Processing individual files"): for filename in files: filename_without_extension = filename[:-3] # Remove .md extension if filename.endswith(src_type) and filename != "consolidated.md": input_path = os.path.join(src, filename) try: text = open(input_path, "r", encoding="utf-8").read() with open(input_path, "r", encoding="utf-8") as file: text = file.read() text = undo_prettier_formatting(text) run_format_checks(filename, text.splitlines()) Loading Loading @@ -749,7 +757,8 @@ def preprocess( r"([\w-]+?).md", r"--preprocessed--\1.md", filename ) # Ensure file order is preserved by keeping the number in front output_path = os.path.join(src, new_filename) open(output_path, "w", encoding="utf-8").write(text) with open(output_path, "w", encoding="utf-8") as file: file.write(text) preprocessed_filenames.append(new_filename) except Exception as e: # print(f"Error: {e}") Loading @@ -766,8 +775,10 @@ def preprocess( if f.startswith("--preprocessed--"): os.remove(os.path.join(src, f)) sys.exit(1) pass handle_consolidated_md("create", src, consolidated_md_path, preprocessed_filenames) p_warning(f"Warning: Could not preprocess {input_path}: {e}") with get_timer().section("Preprocessing: 4 - Creating consolidated Markdown file"): handle_consolidated_md( "create", src, consolidated_md_path, preprocessed_filenames ) return filename_numbers_mapping