fix: prevent bs4 to create html tags out of <text> (5008002f) · Commits · CIM - Context Information Management / NGSI-LD API

md_to_docx_converter/src/to_html/postprocessing.py

+100 −78

Original line number	Diff line number	Diff line
		import os, json
		import re
		import os, re, html
		from bs4 import BeautifulSoup, Tag, NavigableString

		from src.utils import (
		@@ -12,10 +11,7 @@ from src.constants import ABBREVIATION_CLASS

		normative_file = "clause-2"
		informative_file = "clause-2"
		files_with_references = [
		normative_file,
		informative_file
		]
		files_with_references = [normative_file, informative_file]


		# region Helpers
		@@ -80,7 +76,9 @@ def unwrap_gt_lt_code_tags(soup: BeautifulSoup):
		codes = soup.select("code:not(pre > code):not(em > code)")

		for code in codes:
		code.unwrap()
		text = NavigableString(html.unescape(code.get_text()))
		code.insert_before(text)
		code.decompose()

		return soup

		@@ -197,7 +195,9 @@ def format_examples_and_notes(soup: BeautifulSoup):
		def get_label_text_and_class(para: Tag):
		"""Get the label text from the paragraph and determine the class to assign to the div"""
		text = para.contents[0].split(":")[0] + ":"
		remaining_text = para.contents[0].split(": ")[1] if ": " in para.contents[0] else ""
		remaining_text = (
		para.contents[0].split(": ")[1] if ": " in para.contents[0] else ""
		)
		cls = ""

		if "[!tip]" in text:
		@@ -223,7 +223,9 @@ def format_examples_and_notes(soup: BeautifulSoup):
		if not label_para:
		continue

		label_text, label_class, remaining_contents = get_label_text_and_class(label_para)
		label_text, label_class, remaining_contents = get_label_text_and_class(
		label_para
		)

		new_parent_div = soup.new_tag("div", attrs={"class": label_class})

		@@ -238,7 +240,9 @@ def format_examples_and_notes(soup: BeautifulSoup):

		# Process body
		body_div = soup.new_tag("div")
		if remaining_contents: # this happens when there is not an empty line after the [!note] or [!tip], better to do here because in md we have gridtables to take care of
		if (
		remaining_contents
		): # this happens when there is not an empty line after the [!note] or [!tip], better to do here because in md we have gridtables to take care of
		para_container = soup.new_tag("p")
		for content in remaining_contents:
		para_container.append(content)
		@@ -312,31 +316,11 @@ def add_links_to_references_in_text(soup):

		return soup

		# Pattern for informative references with "i." prefix
		REF_REGEX_I = r"(?<!\[)\[(i\.[A-Za-z0-9]+)\]"

		# Pattern for normative references without "i." prefix
		REF_REGEX_N = r"(?<!\[)\[(n\.[A-Za-z0-9]+)\]"

		def insert_link_with_reference(
		content, is_informative
		):
		if content.parent is None:
		return
		REG_REGEX = r"(?<!\[)\[(i\.|n\.)?[A-Za-z0-9]+\]"

		def insert_link_with_reference(content, is_informative):
		opening_bracket_index = content.find("[")
		closing_bracket_index = content.find("]") + 1

		if opening_bracket_index > 0:
		before_text = content[:opening_bracket_index]
		else:
		before_text = ""

		if closing_bracket_index < len(content):
		after_text = content[closing_bracket_index:]
		else:
		after_text = ""

		internal_text = content[opening_bracket_index + 1 : closing_bracket_index - 1]

		# prepare the new <a> tag
		@@ -347,27 +331,31 @@ def add_links_to_references_in_text(soup):
		)
		a = soup.new_tag("a", attrs={"href": link})
		a.append(f"[{internal_text.replace('n.', '')}]")
		content.replace_with(a)

		# Add any remaining text after the <a> tag
		a.insert_before(NavigableString(before_text))
		a.insert_after(NavigableString(after_text))
		return a

		def process_text_nodes(element):
		for content in list(element.contents):
		if isinstance(content, NavigableString):
		split_content = content.split(" ")
		for part in split_content:
		element = NavigableString(part + " ")
		content.insert_before(element)
		if re.match(REF_REGEX_I, part):
		insert_link_with_reference(
		element, is_informative=True
		)
		elif re.match(REF_REGEX_N, part):
		insert_link_with_reference(
		element, is_informative=False
		)
		before_text = ""
		after_text = content
		while True:
		match = re.search(REG_REGEX, after_text)
		if match:
		before_text = after_text[: match.start()]
		after_text = after_text[match.end() :]
		if before_text:
		content.insert_before(NavigableString(before_text))

		is_informative = match.group(1) == "i."
		# replace content with the <a> tag
		match_text = match.group(0)
		a = insert_link_with_reference(match_text, is_informative)
		content.insert_before(a)
		else:
		if after_text:
		content.insert_before(NavigableString(after_text))
		break

		content.extract()

		elif isinstance(content, Tag) and not content.name in ["a", "code"]:
		@@ -432,6 +420,7 @@ def remove_links_from_labels(soup: BeautifulSoup):
		a_tag.unwrap()
		return soup


		def add_ids_to_labels(soup: BeautifulSoup):
		"""
		Add ids to label elements if they don't have one.
		@@ -447,6 +436,7 @@ def add_ids_to_labels(soup: BeautifulSoup):
		label.attrs["id"] = f"Table_{id}"
		return soup


		def replace_dash_characters(soup: BeautifulSoup):
		"""
		Replace dash characters in the a_tags and ids with the correct ones.
		@@ -465,6 +455,7 @@ def replace_dash_characters(soup: BeautifulSoup):
		element["id"] = id.replace("‑", "-").replace("—", "-")
		return soup


		def move_figure_id_to_FL_elements(soup: BeautifulSoup):
		"""
		Move the id attributes from figure elements to their parent FL elements.
		@@ -486,6 +477,7 @@ def move_figure_id_to_FL_elements(soup: BeautifulSoup):

		return soup


		def fix_custom_tags(soup: BeautifulSoup):
		"""
		Fix custom tags in the HTML.
		@@ -507,37 +499,52 @@ def fix_custom_tags(soup: BeautifulSoup):
		class_name = "TH" if is_table else "FL"
		next_element = a.find_next("div", class_=class_name, id=True)
		if next_element:
		prefix = 'Table_' if is_table else 'Figure_'
		prefix = "Table_" if is_table else "Figure_"
		string_to_be_replaced = f"{prefix}below"
		new_a_text = next_element['id'].replace(prefix, "")
		new_a_text = next_element["id"].replace(prefix, "")
		a["href"] = href.replace(string_to_be_replaced, next_element["id"])
		a.string = a.string.replace("below", new_a_text)
		else:
		# flash an error
		print(p_error(f"Error: Found a broken custom tag in file {h1_tag.string}"))
		print(p_error(f"Error: No next element found for '{a.string}'. There are not any figures/tables above this tag."))
		print(
		p_error(f"Error: Found a broken custom tag in file {h1_tag.string}")
		)
		print(
		p_error(
		f"Error: No next element found for '{a.string}'. There are not any figures/tables above this tag."
		)
		)
		os._exit(1)
		elif href.endswith("above"):
		is_table = "Table" in href
		class_name = "TH" if is_table else "FL"
		previous_element = a.find_previous("div", class_=class_name, id=True)
		if previous_element:
		prefix = 'Table_' if is_table else 'Figure_'
		prefix = "Table_" if is_table else "Figure_"
		string_to_be_replaced = f"{prefix}above"
		new_a_text = previous_element['id'].replace(prefix, "")
		new_a_text = previous_element["id"].replace(prefix, "")
		a["href"] = href.replace(string_to_be_replaced, previous_element["id"])
		a.string = a.string.replace("above", new_a_text)
		else:
		# flash an error
		print(p_error(f"Error: Found a broken custom tag in file {h1_tag.string}"))
		print(p_error(f"Error: No previous element found for '{a.string}'. There are not any figures/tables above this tag."))
		print(
		p_error(f"Error: Found a broken custom tag in file {h1_tag.string}")
		)
		print(
		p_error(
		f"Error: No previous element found for '{a.string}'. There are not any figures/tables above this tag."
		)
		)
		os._exit(1)
		elif href.find("#") != -1 and href.find("root") != -1 and notAnImage(href): # when root is used in md
		elif (
		href.find("#") != -1 and href.find("root") != -1 and notAnImage(href)
		): # when root is used in md
		new_id_prefix = f"{h1_tag['id']}"
		a["href"] = href.replace("root", new_id_prefix)
		a.string = a.string.replace("root", new_id_prefix)
		return soup


		def extract_images_from_html(soup: BeautifulSoup) -> dict:
		"""
		Extracts image sources from the given HTML content.
		@@ -559,12 +566,17 @@ def extract_images_from_html(soup: BeautifulSoup) -> dict:
		src = img.get("src", "").replace("media/", "")
		images_mapping[id] = src
		figure_caption = fig.find("figcaption")
		if figure_caption: # TODO: check if we might want to keep the caption instead of removing it
		if (
		figure_caption
		): # TODO: check if we might want to keep the caption instead of removing it
		figure_caption.decompose()

		return images_mapping, soup

		def add_custom_link_to_images(soup: BeautifulSoup, images_mapping: dict) -> BeautifulSoup:

		def add_custom_link_to_images(
		soup: BeautifulSoup, images_mapping: dict
		) -> BeautifulSoup:
		"""
		Adds a custom link to images in the HTML content based on the provided images mapping.

		@@ -587,10 +599,13 @@ def add_custom_link_to_images(soup: BeautifulSoup, images_mapping: dict) -> Beau
		a["href"] = f"{image_info['file']}#{image_info['id']}"
		a.string = f"figure {image_info['id'].split('_')[1]}"
		else:
		raise ValueError(f"ERROR: Image '{filename}' not found in images mapping. Are you sure it exists in the media folder and is used in the document?")
		raise ValueError(
		f"ERROR: Image '{filename}' not found in images mapping. Are you sure it exists in the media folder and is used in the document?"
		)

		return soup


		def fix_capitalization_in_links(soup: BeautifulSoup) -> BeautifulSoup:
		"""
		Ensures that the capitalization in the link text matches the capitalization in the href attribute.
		@@ -613,7 +628,11 @@ def fix_capitalization_in_links(soup: BeautifulSoup) -> BeautifulSoup:
		# Second case: it is after a period
		elif a.previous_sibling and isinstance(a.previous_sibling, NavigableString):
		prev_text = a.previous_sibling.strip()
		if prev_text.endswith(".") or prev_text.endswith("!") or prev_text.endswith("?"):
		if (
		prev_text.endswith(".")
		or prev_text.endswith("!")
		or prev_text.endswith("?")
		):
		capitalized_text = text.capitalize()
		a.string = capitalized_text
		for span in span_clauses_tags:
		@@ -623,9 +642,15 @@ def fix_capitalization_in_links(soup: BeautifulSoup) -> BeautifulSoup:
		if span.parent and span.parent.contents[0] == span:
		capitalized_text = text.capitalize()
		span.string = capitalized_text
		elif span.previous_sibling and isinstance(span.previous_sibling, NavigableString):
		elif span.previous_sibling and isinstance(
		span.previous_sibling, NavigableString
		):
		prev_text = span.previous_sibling.strip()
		if prev_text.endswith(".") or prev_text.endswith("!") or prev_text.endswith("?"):
		if (
		prev_text.endswith(".")
		or prev_text.endswith("!")
		or prev_text.endswith("?")
		):
		capitalized_text = text.capitalize()
		span.string = capitalized_text
		return soup
		@@ -699,12 +724,9 @@ def postprocess(html_dir: str):
		soup = fix_custom_tags(soup)
		images, soup = extract_images_from_html(soup)
		for image_id, image_src in images.items():
		images_mapping[image_src] = {
		"id": image_id,
		"file": new_filename
		}
		images_mapping[image_src] = {"id": image_id, "file": new_filename}

		contents = soup.decode_contents(formatter=None)
		contents = soup.decode_contents()

		with open(file_path, "w", encoding="utf-8") as html:
		html.write(contents)
		@@ -723,7 +745,7 @@ def postprocess(html_dir: str):
		print(p_error(str(e)))
		os._exit(1)

		contents = soup.decode_contents(formatter=None)
		contents = soup.decode_contents()

		with open(file_path, "w", encoding="utf-8") as html:
		html.write(contents)