Commit 5008002f authored by Marco Cavalli's avatar Marco Cavalli
Browse files

fix: prevent bs4 from creating HTML tags out of <text>

chore: refactor generation of links to references
chore: apply prettier
parent a6ac7df8
Loading
Loading
Loading
Loading
+100 −78
Original line number Diff line number Diff line
import os, json
import re
import os, re, html
from bs4 import BeautifulSoup, Tag, NavigableString

from src.utils import (
@@ -12,10 +11,7 @@ from src.constants import ABBREVIATION_CLASS

normative_file = "clause-2"
informative_file = "clause-2"
files_with_references = [
    normative_file,
    informative_file
]
files_with_references = [normative_file, informative_file]


# region Helpers
@@ -80,7 +76,9 @@ def unwrap_gt_lt_code_tags(soup: BeautifulSoup):
    codes = soup.select("code:not(pre > code):not(em > code)")

    for code in codes:
        code.unwrap()
        text = NavigableString(html.unescape(code.get_text()))
        code.insert_before(text)
        code.decompose()

    return soup

@@ -197,7 +195,9 @@ def format_examples_and_notes(soup: BeautifulSoup):
    def get_label_text_and_class(para: Tag):
        """Get the label text from the paragraph and determine the class to assign to the div"""
        text = para.contents[0].split(":")[0] + ":"
        remaining_text = para.contents[0].split(": ")[1] if ": " in para.contents[0] else ""
        remaining_text = (
            para.contents[0].split(": ")[1] if ": " in para.contents[0] else ""
        )
        cls = ""

        if "[!tip]" in text:
@@ -223,7 +223,9 @@ def format_examples_and_notes(soup: BeautifulSoup):
        if not label_para:
            continue

        label_text, label_class, remaining_contents = get_label_text_and_class(label_para)
        label_text, label_class, remaining_contents = get_label_text_and_class(
            label_para
        )

        new_parent_div = soup.new_tag("div", attrs={"class": label_class})

@@ -238,7 +240,9 @@ def format_examples_and_notes(soup: BeautifulSoup):

        # Process body
        body_div = soup.new_tag("div")
        if remaining_contents: # this happens when there is not an empty line after the [!note] or [!tip], better to do here because in md we have gridtables to take care of
        if (
            remaining_contents
        ):  # this happens when there is not an empty line after the [!note] or [!tip], better to do here because in md we have gridtables to take care of
            para_container = soup.new_tag("p")
            for content in remaining_contents:
                para_container.append(content)
@@ -312,31 +316,11 @@ def add_links_to_references_in_text(soup):

        return soup

    # Pattern for informative references with "i." prefix
    REF_REGEX_I = r"(?<!\[)\[(i\.[A-Za-z0-9]+)\]"

    # Pattern for normative references without "i." prefix
    REF_REGEX_N = r"(?<!\[)\[(n\.[A-Za-z0-9]+)\]"

    def insert_link_with_reference(
        content, is_informative
    ):
        if content.parent is None:
            return
    REG_REGEX = r"(?<!\[)\[(i\.|n\.)?[A-Za-z0-9]+\]"

    def insert_link_with_reference(content, is_informative):
        opening_bracket_index = content.find("[")
        closing_bracket_index = content.find("]") + 1

        if opening_bracket_index > 0:
            before_text = content[:opening_bracket_index]
        else:
            before_text = ""
        
        if closing_bracket_index < len(content):
            after_text = content[closing_bracket_index:]
        else:
            after_text = ""
        
        internal_text = content[opening_bracket_index + 1 : closing_bracket_index - 1]

        # prepare the new <a> tag
@@ -347,27 +331,31 @@ def add_links_to_references_in_text(soup):
        )
        a = soup.new_tag("a", attrs={"href": link})
        a.append(f"[{internal_text.replace('n.', '')}]")
        content.replace_with(a)

        # Add any remaining text after the <a> tag
        a.insert_before(NavigableString(before_text))
        a.insert_after(NavigableString(after_text))
        return a

    def process_text_nodes(element):
        for content in list(element.contents):
            if isinstance(content, NavigableString):
                split_content = content.split(" ")
                for part in split_content:
                    element = NavigableString(part + " ")
                    content.insert_before(element)
                    if re.match(REF_REGEX_I, part):
                        insert_link_with_reference(
                            element, is_informative=True
                        )
                    elif re.match(REF_REGEX_N, part):
                        insert_link_with_reference(
                            element, is_informative=False
                        )
                before_text = ""
                after_text = content
                while True:
                    match = re.search(REG_REGEX, after_text)
                    if match:
                        before_text = after_text[: match.start()]
                        after_text = after_text[match.end() :]
                        if before_text:
                            content.insert_before(NavigableString(before_text))

                        is_informative = match.group(1) == "i."
                        # replace content with the <a> tag
                        match_text = match.group(0)
                        a = insert_link_with_reference(match_text, is_informative)
                        content.insert_before(a)
                    else:
                        if after_text:
                            content.insert_before(NavigableString(after_text))
                        break

                content.extract()

            elif isinstance(content, Tag) and not content.name in ["a", "code"]:
@@ -432,6 +420,7 @@ def remove_links_from_labels(soup: BeautifulSoup):
            a_tag.unwrap()
    return soup


def add_ids_to_labels(soup: BeautifulSoup):
    """
    Add ids to label elements if they don't have one.
@@ -447,6 +436,7 @@ def add_ids_to_labels(soup: BeautifulSoup):
                label.attrs["id"] = f"Table_{id}"
    return soup


def replace_dash_characters(soup: BeautifulSoup):
    """
    Replace dash characters in the a_tags and ids with the correct ones.
@@ -465,6 +455,7 @@ def replace_dash_characters(soup: BeautifulSoup):
            element["id"] = id.replace("", "-").replace("", "-")
    return soup


def move_figure_id_to_FL_elements(soup: BeautifulSoup):
    """
    Move the id attributes from figure elements to their parent FL elements.
@@ -486,6 +477,7 @@ def move_figure_id_to_FL_elements(soup: BeautifulSoup):

    return soup


def fix_custom_tags(soup: BeautifulSoup):
    """
    Fix custom tags in the HTML.
@@ -507,37 +499,52 @@ def fix_custom_tags(soup: BeautifulSoup):
            class_name = "TH" if is_table else "FL"
            next_element = a.find_next("div", class_=class_name, id=True)
            if next_element:
                prefix = 'Table_' if is_table else 'Figure_'
                prefix = "Table_" if is_table else "Figure_"
                string_to_be_replaced = f"{prefix}below"
                new_a_text = next_element['id'].replace(prefix, "")
                new_a_text = next_element["id"].replace(prefix, "")
                a["href"] = href.replace(string_to_be_replaced, next_element["id"])
                a.string = a.string.replace("below", new_a_text)
            else:
                # flash an error
                print(p_error(f"Error: Found a broken custom tag in file {h1_tag.string}"))
                print(p_error(f"Error: No next element found for '{a.string}'. There are not any figures/tables above this tag."))
                print(
                    p_error(f"Error: Found a broken custom tag in file {h1_tag.string}")
                )
                print(
                    p_error(
                        f"Error: No next element found for '{a.string}'. There are not any figures/tables above this tag."
                    )
                )
                os._exit(1)
        elif href.endswith("above"):
            is_table = "Table" in href
            class_name = "TH" if is_table else "FL"
            previous_element = a.find_previous("div", class_=class_name, id=True)
            if previous_element:
                prefix = 'Table_' if is_table else 'Figure_'
                prefix = "Table_" if is_table else "Figure_"
                string_to_be_replaced = f"{prefix}above"
                new_a_text = previous_element['id'].replace(prefix, "")
                new_a_text = previous_element["id"].replace(prefix, "")
                a["href"] = href.replace(string_to_be_replaced, previous_element["id"])
                a.string = a.string.replace("above", new_a_text)
            else:
                # flash an error
                print(p_error(f"Error: Found a broken custom tag in file {h1_tag.string}"))
                print(p_error(f"Error: No previous element found for '{a.string}'. There are not any figures/tables above this tag."))
                print(
                    p_error(f"Error: Found a broken custom tag in file {h1_tag.string}")
                )
                print(
                    p_error(
                        f"Error: No previous element found for '{a.string}'. There are not any figures/tables above this tag."
                    )
                )
                os._exit(1)
        elif href.find("#") != -1 and href.find("root") != -1 and notAnImage(href): # when root is used in md
        elif (
            href.find("#") != -1 and href.find("root") != -1 and notAnImage(href)
        ):  # when root is used in md
            new_id_prefix = f"{h1_tag['id']}"
            a["href"] = href.replace("root", new_id_prefix)
            a.string = a.string.replace("root", new_id_prefix)
    return soup


def extract_images_from_html(soup: BeautifulSoup) -> dict:
    """
    Extracts image sources from the given HTML content.
@@ -559,12 +566,17 @@ def extract_images_from_html(soup: BeautifulSoup) -> dict:
                src = img.get("src", "").replace("media/", "")
                images_mapping[id] = src
            figure_caption = fig.find("figcaption")
            if figure_caption: # TODO: check if we might want to keep the caption instead of removing it
            if (
                figure_caption
            ):  # TODO: check if we might want to keep the caption instead of removing it
                figure_caption.decompose()

    return images_mapping, soup

def add_custom_link_to_images(soup: BeautifulSoup, images_mapping: dict) -> BeautifulSoup:

def add_custom_link_to_images(
    soup: BeautifulSoup, images_mapping: dict
) -> BeautifulSoup:
    """
    Adds a custom link to images in the HTML content based on the provided images mapping.

@@ -587,10 +599,13 @@ def add_custom_link_to_images(soup: BeautifulSoup, images_mapping: dict) -> Beau
                a["href"] = f"{image_info['file']}#{image_info['id']}"
                a.string = f"figure {image_info['id'].split('_')[1]}"
            else:
                raise ValueError(f"ERROR: Image '{filename}' not found in images mapping. Are you sure it exists in the media folder and is used in the document?")
                raise ValueError(
                    f"ERROR: Image '{filename}' not found in images mapping. Are you sure it exists in the media folder and is used in the document?"
                )

    return soup


def fix_capitalization_in_links(soup: BeautifulSoup) -> BeautifulSoup:
    """
    Ensures that the capitalization in the link text matches the capitalization in the href attribute.
@@ -613,7 +628,11 @@ def fix_capitalization_in_links(soup: BeautifulSoup) -> BeautifulSoup:
        # Second case: it is after a period
        elif a.previous_sibling and isinstance(a.previous_sibling, NavigableString):
            prev_text = a.previous_sibling.strip()
            if prev_text.endswith(".") or prev_text.endswith("!") or prev_text.endswith("?"):
            if (
                prev_text.endswith(".")
                or prev_text.endswith("!")
                or prev_text.endswith("?")
            ):
                capitalized_text = text.capitalize()
                a.string = capitalized_text
    for span in span_clauses_tags:
@@ -623,9 +642,15 @@ def fix_capitalization_in_links(soup: BeautifulSoup) -> BeautifulSoup:
        if span.parent and span.parent.contents[0] == span:
            capitalized_text = text.capitalize()
            span.string = capitalized_text
        elif span.previous_sibling and isinstance(span.previous_sibling, NavigableString):
        elif span.previous_sibling and isinstance(
            span.previous_sibling, NavigableString
        ):
            prev_text = span.previous_sibling.strip()
            if prev_text.endswith(".") or prev_text.endswith("!") or prev_text.endswith("?"):
            if (
                prev_text.endswith(".")
                or prev_text.endswith("!")
                or prev_text.endswith("?")
            ):
                capitalized_text = text.capitalize()
                span.string = capitalized_text
    return soup
@@ -699,12 +724,9 @@ def postprocess(html_dir: str):
        soup = fix_custom_tags(soup)
        images, soup = extract_images_from_html(soup)
        for image_id, image_src in images.items():
            images_mapping[image_src] = {
                "id": image_id,
                "file": new_filename
            }
            images_mapping[image_src] = {"id": image_id, "file": new_filename}

        contents = soup.decode_contents(formatter=None)
        contents = soup.decode_contents()

        with open(file_path, "w", encoding="utf-8") as html:
            html.write(contents)
@@ -723,7 +745,7 @@ def postprocess(html_dir: str):
                print(p_error(str(e)))
                os._exit(1)

            contents = soup.decode_contents(formatter=None)
            contents = soup.decode_contents()

            with open(file_path, "w", encoding="utf-8") as html:
                html.write(contents)