Commit 803b02dc authored by Marco Cavalli
Browse files

feat: re-number references when converting to html

fix: remove empty spaces after links
fix: remove leading/trailing punctuation from above and below links
parent 29d024df
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -22,6 +22,8 @@ HANDLE_UNDERSCORE_CLASSES = [
    "HTML_Definition",
]

REFERENCE_MAPPING_MD_TO_HTML = "reference_mapping_md_to_html.json"


# Consolidated Markdown file and path
CONSOLIDATED_MD_NAME = "consolidated.md"
+92 −46
Original line number Diff line number Diff line
import os, re, html
import os, re, html, json
from bs4 import BeautifulSoup, Tag, NavigableString

from src.utils import (
@@ -8,7 +8,7 @@ from src.utils import (
    p_warning,
)

from src.constants import ABBREVIATION_CLASS
from src.constants import ABBREVIATION_CLASS, REFERENCE_MAPPING_MD_TO_HTML

normative_file = "clause-2"
informative_file = "clause-2"
@@ -79,10 +79,15 @@ def unwrap_gt_lt_code_tags(soup: BeautifulSoup):
    for code in codes:
        if code.parent and code.parent.name == "pre":
            span_text_only_children = code.find_all(
                lambda tag: isinstance(tag, Tag) and tag.name == "span" and len(tag.contents) == 1 and isinstance(tag.contents[0], NavigableString)
                lambda tag: isinstance(tag, Tag)
                and tag.name == "span"
                and len(tag.contents) == 1
                and isinstance(tag.contents[0], NavigableString)
            )
            for child in span_text_only_children:
                raw_text = child.get_text().replace("`<", "<").replace(">`", ">")
                raw_text = (
                    child.get_text().replace("`<", "<").replace(">`", ">")
                )
                text = NavigableString(html.unescape(raw_text))
                child.contents[0].replace_with(text)
        else:
@@ -150,7 +155,6 @@ def format_references(soup: BeautifulSoup):
            # Add body contents
            for contents in list(paragraph.contents):
                body_span.append(contents)
                body_span.append(NavigableString("\n"))

            # Append spans to div, and div to references list
            parent_div.append(tag_span)
@@ -303,11 +307,12 @@ def format_tables(soup: BeautifulSoup) -> BeautifulSoup:
            if len(tds) == len(tdsFirstRow):
                isNewRow = not isNewRow
            if isNewRow:
                existing_classes = tr.get('class', [])
                existing_classes = tr.get("class", [])
                if "bg-striped-row" not in existing_classes:
                    tr['class'] = existing_classes + ['bg-striped-row']
                    tr["class"] = existing_classes + ["bg-striped-row"]
    return soup


def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
    ZA = soup.find_all("div", class_="ZA")
    try:
@@ -316,7 +321,7 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
        try:
            # TITLE IN HEADER
            header = children[0]
            header['data-replace'] = 'WORKITEMNAME'
            header["data-replace"] = "WORKITEMNAME"
        except IndexError:
            print(p_warning("front-page is missing WORKITEMNAME information."))
        # Version in HEADER
@@ -324,7 +329,7 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
            version = children[1]
            new_span = soup.new_tag("span")
            version.wrap(new_span)
            new_span['data-replace'] = 'VERSION_NO'
            new_span["data-replace"] = "VERSION_NO"
        except IndexError:
            print(p_warning("front-page is missing VERSION_NO information."))
        # DATE IN HEADER
@@ -334,7 +339,7 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
            text = text.replace("(", "").replace(")", "")
            open_bracket = NavigableString("(")
            date_text = new_span = soup.new_tag("span")
            date_text['data-replace'] = 'DATE'
            date_text["data-replace"] = "DATE"
            date_text.string = text
            close_bracket = NavigableString(")")
            date.append(open_bracket)
@@ -343,34 +348,37 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
        except IndexError:
            print(p_warning("front-page is missing DATE information."))
    except IndexError:
        print(p_warning("front-page is missing the section with WORKITEMNAME, VERSION_NO, and DATE information."))
        print(
            p_warning(
                "front-page is missing the section with WORKITEMNAME, VERSION_NO, and DATE information."
            )
        )

    ZT = soup.find_all("div", class_="ZT")
    try:
        # first is title
        ZT[0]['data-replace'] = '{{TITLE}}'
        ZT[0]["data-replace"] = "{{TITLE}}"
    except IndexError:
        print(p_warning("front-page is missing TITLE information."))
    try:
        # second is part
        ZT[1]['data-replace'] = '{{PART}}'
        ZT[1]["data-replace"] = "{{PART}}"
    except IndexError:
        print(p_warning("front-page is missing PART information."))
    try:
        # third is subpart
        ZT[2]['data-replace'] = '{{SUBPART}}'
        ZT[2]["data-replace"] = "{{SUBPART}}"
    except IndexError:
        print(p_warning("front-page is missing SUBPART information."))
    try:
        # fourth is release
        ZT[3]['data-replace'] = '{{RELEASE}}'
        ZT[3]["data-replace"] = "{{RELEASE}}"
    except IndexError:
        print(p_warning("front-page is missing RELEASE information."))


    ZB = soup.find_all("div", class_="ZB")
    try:
        ZB[0]['data-replace'] = '{{TYPEDOCUMENT}}'
        ZB[0]["data-replace"] = "{{TYPEDOCUMENT}}"
    except IndexError:
        print(p_warning("front-page is missing TYPEDOCUMENT information."))

@@ -380,17 +388,21 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
        try:
            # SECOND is WORKITEM
            workitem = children[3]
            workitem['data-replace'] = '{{WORKITEM}}'
            workitem["data-replace"] = "{{WORKITEM}}"
        except IndexError:
            print(p_warning("front-page is missing WORKITEM information."))
        # FOURTH is KEYWORDS
        try:
            keywords = children[7]
            keywords['data-replace'] = '{{KEYWORDS}}'
            keywords["data-replace"] = "{{KEYWORDS}}"
        except IndexError:
            print(p_warning("front-page is missing KEYWORDS information."))
    except IndexError:
        print(p_warning("front-page is missing the section with WORKITEM and KEYWORDS information."))
        print(
            p_warning(
                "front-page is missing the section with WORKITEM and KEYWORDS information."
            )
        )

    return soup

@@ -445,9 +457,13 @@ def add_links_to_references_in_text(soup):
    REG_REGEX = r"(?<!\[)\[(i\.|n\.)?[A-Za-z0-9]+\]"

    def insert_link_with_reference(content, is_informative):
        with open(REFERENCE_MAPPING_MD_TO_HTML, "r") as ref_file:
            reference_mapping = json.load(ref_file)

        opening_bracket_index = content.find("[")
        closing_bracket_index = content.find("]") + 1
        internal_text = content[opening_bracket_index + 1 : closing_bracket_index - 1]
        internal_text = reference_mapping.get(internal_text, internal_text)

        # prepare the new <a> tag
        link = (
@@ -603,6 +619,7 @@ def move_figure_id_to_FL_elements(soup: BeautifulSoup):

    return soup


def shorten_toc_text(soup: BeautifulSoup):
    """
    Remove informative/normative from TOC only
@@ -625,6 +642,7 @@ def shorten_toc_text(soup: BeautifulSoup):

    return soup


def fix_custom_tags(soup: BeautifulSoup):
    """
    Fix custom tags in the HTML.
@@ -634,20 +652,40 @@ def fix_custom_tags(soup: BeautifulSoup):
        image_extensions = [".png", ".jpg", ".jpeg", ".svg"]
        return not any(href.endswith(ext) for ext in image_extensions)

    def remove_trailing_punctuation(a: Tag, needle: str) -> str:
    def remove_leading_punctuation(a: Tag, needle: str) -> None:
        """Strip any text that precedes ``needle`` from a link's href and label.

        The href is rewritten to ``#<needle>...``. Characters found in front of
        ``needle`` in the link text are moved out of the ``<a>`` element and
        re-inserted as plain text immediately before it, so no visible text is
        lost.

        Args:
            a: Anchor tag to clean up in place; it must carry an ``href``.
            needle: Marker the link target is expected to start with, e.g.
                ``"Table+++"`` or ``"Figure+++"``.
        """
        href = a["href"]
        if not href.startswith("#" + needle):
            index = href.find(needle)
            if index > 1:  # to account for the leading #
                a["href"] = f"#{href[index:]}"

        label = a.string
        if label is None:
            # No single text child to trim (e.g. the anchor wraps other tags).
            return

        index = label.find(needle)
        if index > 0:
            # The prefix is taken from the label itself: the href may already
            # have been rewritten above, and the label has no leading "#", so
            # even a single leading character must be moved out.
            leading_text = label[:index]
            a.string = label[index:]
            a.insert_before(NavigableString(leading_text))

    def remove_trailing_punctuation(a: Tag, needle: str) -> None:
        """Strip any text that follows the last ``needle`` in a link's href and label.

        The href is cut right after the last occurrence of ``needle``.
        Characters found after ``needle`` in the link text are moved out of the
        ``<a>`` element and re-inserted as plain text immediately after it, so
        no visible text is lost.

        Args:
            a: Anchor tag to clean up in place; it must carry an ``href``.
            needle: Marker the link target is expected to end with, e.g.
                ``"+++below"`` or ``"+++above"``.
        """
        href = a["href"]
        if not href.endswith(needle):
            # Cut right after the last occurrence of needle.
            index = href.rfind(needle)
            if index != -1:
                a["href"] = href[: index + len(needle)]

        label = a.string
        if label is None:
            # No single text child to trim (e.g. the anchor wraps other tags).
            return

        if not label.endswith(needle):
            index = label.rfind(needle)
            if index != -1:
                end = index + len(needle)
                a.string = label[:end]
                a.insert_after(NavigableString(label[end:]))

    # Example: Change <custom-tag> to <div class="custom-tag">
    h1_tag = soup.find("h1", id=True)
@@ -657,18 +695,19 @@ def fix_custom_tags(soup: BeautifulSoup):
    for a in a_tags:
        href = a.get("href", "")
        if href.find("+++below") != -1:
            new_href, new_a_text = remove_trailing_punctuation(a, "+++below")
            if new_href:
                href = new_href
            if new_a_text:
                a.string = new_a_text
            is_table = "Table" in href
            remove_leading_punctuation(a, "Table+++" if is_table else "Figure+++")
            remove_trailing_punctuation(a, "+++below")

            href = a.get("href", "")

            count_below = href.count("+++below")
            is_table = "Table" in href
            class_name = "TH" if is_table else "FL"
            next_nth_element = a
            for _ in range(count_below):
                next_nth_element = next_nth_element.find_next("div", class_=class_name, id=True)
                next_nth_element = next_nth_element.find_next(
                    "div", class_=class_name, id=True
                )
            if next_nth_element:
                prefix = "Table" if is_table else "Figure"
                postfix = "+++below" * count_below
@@ -688,17 +727,19 @@ def fix_custom_tags(soup: BeautifulSoup):
                )
                os._exit(1)
        elif href.find("+++above") != -1:
            new_href, new_a_text = remove_trailing_punctuation(a, "+++above")
            if new_href:
                href = new_href
            if new_a_text:
                a.string = new_a_text
            count_above = href.count("+++above")
            is_table = "Table" in href
            remove_leading_punctuation(a, "Table+++" if is_table else "Figure+++")
            remove_trailing_punctuation(a, "+++above")

            href = a.get("href", "")

            count_above = href.count("+++above")
            class_name = "TH" if is_table else "FL"
            next_nth_element = a
            for _ in range(count_above):
                next_nth_element = next_nth_element.find_previous("div", class_=class_name, id=True)
                next_nth_element = next_nth_element.find_previous(
                    "div", class_=class_name, id=True
                )
            if next_nth_element:
                prefix = "Table" if is_table else "Figure"
                postfix = "+++above" * count_above
@@ -725,6 +766,7 @@ def fix_custom_tags(soup: BeautifulSoup):
            a.string = a.string.replace("root", new_id_prefix)
    return soup


def fix_lists(soup: BeautifulSoup):
    """
    Fix lists that have been improperly nested due to markdown conversion.
@@ -738,6 +780,7 @@ def fix_lists(soup: BeautifulSoup):

    return soup


def extract_images_from_html(soup: BeautifulSoup) -> dict:
    """
    Extracts image sources from the given HTML content.
@@ -949,3 +992,6 @@ def postprocess(html_dir: str):

            with open(file_path, "w", encoding="utf-8") as html:
                html.write(contents)

    if os.path.exists(REFERENCE_MAPPING_MD_TO_HTML):
        os.remove(REFERENCE_MAPPING_MD_TO_HTML)
+136 −50
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@ from src.constants import (
    DIV_START_REGEX,
    DIV_END_REGEX,
    BAD_DIV_DELINEATOR_REGEX,
    REFERENCE_MAPPING_MD_TO_HTML,
)

from src.utils import (
@@ -28,6 +29,7 @@ files_with_references = [NORMATIVE_REF_FILE, INFORMATIVE_REF_FILE]

# region Helpers


def undo_prettier_formatting(text: str) -> str:
    """Undo any formatting changes made by Prettier to ensure the Markdown is in a more raw format for processing."""

@@ -52,6 +54,7 @@ def undo_prettier_formatting(text: str) -> str:

    return new_text


def run_format_checks(filename: str, file_lines: list[str]):
    """Runs various checks on the Markdown file contents to ensure they are properly formatted. If any improper formatting is detected, display any fatal errors or warnings as necessary."""

@@ -182,15 +185,20 @@ def run_format_checks(filename: str, file_lines: list[str]):
    check_divs()
    check_notes_and_examples()


def remove_ignore_prettier_statements(text: str) -> str:
    """Remove any existing <!-- prettier-ignore --> statements from the text to avoid duplication.

    A line is dropped when, ignoring surrounding whitespace, it consists solely
    of a ``prettier-ignore-start`` or ``prettier-ignore-end`` marker. Every
    other line is kept unchanged and in its original order.

    Args:
        text: Markdown contents, with lines separated by ``"\\n"``.

    Returns:
        The contents without the marker lines, joined again with ``"\\n"``.
    """
    markers = (
        "<!-- prettier-ignore-start -->",
        "<!-- prettier-ignore-end -->",
    )
    # split("\n") rather than splitlines() so a trailing newline survives the round trip
    return "\n".join(
        line for line in text.split("\n") if line.strip() not in markers
    )


def add_divs_to_images_tables(text: str) -> str:
    """Add divs around images and their captions, and tables captions to the ones defined using the ETSI guidelines."""
    file_lines = text.split("\n")
@@ -220,9 +228,10 @@ def add_divs_to_images_tables(text : str) -> str:

    return "\n".join(new_file_lines) + "\n"


def handle_less_than_greater_than_text(file_contents: str):
    """Replace `<` and `>` with `&lt;` and `&gt;` respectively and wrap the whole section in single code ticks to allow the text to render in the HTML"""
    regex = r"\<(?!img\b|span\b|sup|/sup)(.+?)\>"
    regex = r"\<(?!img\b|span\b|sup|/sup|mark|/mark)(.+?)\>"
    replace = r"`&lt;\1&gt;`"
    table_regex = rf"\|([^|\n]*?{regex}[^|\n]*?)\|"

@@ -288,16 +297,26 @@ def add_empty_lines_in_notes_and_examples(file_contents: str):
        line = file_lines[i]

        # opening of a note or example
        if line.startswith(">>> [!note]") or line.startswith(">>> [!tip]") or line.startswith("| >>> [!note]"):
        if (
            line.startswith(">>> [!note]")
            or line.startswith(">>> [!tip]")
            or line.startswith("| >>> [!note]")
        ):
            new_file_lines.append(line)
            # Check if the next line exists and is not empty
            if i + 1 < len(file_lines) and file_lines[i + 1].strip() != "":
                if not line.startswith("| >>> [!note]"):
                    new_file_lines.append("")  # Add an empty line only for notes/examples outside tables
                    new_file_lines.append(
                        ""
                    )  # Add an empty line only for notes/examples outside tables
                else:
                    if not line.startswith("+") and not line.endswith("+"):
                        line_length = len(line) - 2  # Subtract 2 for the "|" at the start and end
                        new_file_lines.append("|" + " " * line_length + "|")  # Add an empty line
                        line_length = (
                            len(line) - 2
                        )  # Subtract 2 for the "|" at the start and end
                        new_file_lines.append(
                            "|" + " " * line_length + "|"
                        )  # Add an empty line

        # closing of a note or example
        elif line.find(">>>") != -1 or line.find("| >>>") != -1:
@@ -305,26 +324,43 @@ def add_empty_lines_in_notes_and_examples(file_contents: str):
            line_before = file_lines[i - 1] if i > 0 else ""
            empty_line_regex = r"^\s*$"
            empty_table_row_regex = r"^\|\s*\|$"
            if not re.match(empty_line_regex, line_before) and not line_before.startswith("| "):
                new_file_lines.append("")  # Add an empty line before any other blockquote
            elif not re.match(empty_table_row_regex, line) and line_before.startswith("| "):
                line_length = len(line) - 2  # Subtract 2 for the "|" at the start and end
                new_file_lines.append("|" + " " * line_length + "|")  # Add an empty line before any other blockquote in a table
            if not re.match(
                empty_line_regex, line_before
            ) and not line_before.startswith("| "):
                new_file_lines.append(
                    ""
                )  # Add an empty line before any other blockquote
            elif not re.match(empty_table_row_regex, line) and line_before.startswith(
                "| "
            ):
                line_length = (
                    len(line) - 2
                )  # Subtract 2 for the "|" at the start and end
                new_file_lines.append(
                    "|" + " " * line_length + "|"
                )  # Add an empty line before any other blockquote in a table

            new_file_lines.append(line)

            # check after the line
            if not line.startswith("| >>>"):  # we are not in a table
                new_file_lines.append("")  # Add an empty line after any other blockquote
                new_file_lines.append(
                    ""
                )  # Add an empty line after any other blockquote
            elif line.startswith("| >>>"):
                if not line.startswith("+-") and not line.startswith("+="):
                    line_length = len(line) - 2  # Subtract 2 for the "|" at the start and end
                    new_file_lines.append("|" + " " * line_length + "|")  # Add an empty line
                    line_length = (
                        len(line) - 2
                    )  # Subtract 2 for the "|" at the start and end
                    new_file_lines.append(
                        "|" + " " * line_length + "|"
                    )  # Add an empty line
        else:
            new_file_lines.append(line)
        i += 1
    return "\n".join(new_file_lines) + "\n"


# Used to keep track of clause numbers across multiple levels when auto-numbering
clauses_counters = [0] * MAX_HEADING_LEVEL
clauses_counters[0] = 3  # first 3 clauses are taken by mandatory files
@@ -461,9 +497,7 @@ def auto_number_content(
                # ensure we keep the table formatting by adding spaces at the end of the line if needed
                if diff_in_length > 0:
                    text_to_replace = text_to_replace + " " * diff_in_length
                new_line = line.replace(
                    text_to_replace, new_text
                )
                new_line = line.replace(text_to_replace, new_text)
        return new_line

    # take line and line number and replace the line number
@@ -481,14 +515,22 @@ def auto_number_content(
            new_line, new_heading = auto_number_heading(new_line)
            previous_heading = new_heading

            if example_counter >= 1 and first_example_line_index != -1 and "EXAMPLE" not in lines[first_example_line_index]:
            if (
                example_counter >= 1
                and first_example_line_index != -1
                and "EXAMPLE" not in lines[first_example_line_index]
            ):
                lines[
                    first_example_line_index
                ] += f" EXAMPLE{' 1' if example_counter > 1 else ''}:"
            example_counter = 0
            first_example_line_index = -1

            if note_counter >= 1 and first_note_line_index != -1 and "NOTE" not in lines[first_note_line_index]:
            if (
                note_counter >= 1
                and first_note_line_index != -1
                and "NOTE" not in lines[first_note_line_index]
            ):
                lines[
                    first_note_line_index
                ] += f" NOTE{' 1' if note_counter > 1 else ''}:"
@@ -512,7 +554,9 @@ def auto_number_content(
            new_line = auto_number_table(new_line)

            if note_in_table_counter >= 1 and first_note_in_table_line_index != -1:
                note_string = f"| >>> [!note] NOTE{' 1' if note_in_table_counter > 1 else ''}:"
                note_string = (
                    f"| >>> [!note] NOTE{' 1' if note_in_table_counter > 1 else ''}:"
                )
                note_string_length = len(note_string)
                text_to_be_replaced = "| >>> [!note]"
                text_to_be_replaced_length = len(text_to_be_replaced)
@@ -521,9 +565,7 @@ def auto_number_content(
                    text_to_be_replaced = text_to_be_replaced + " " * diff_in_length
                    lines[first_note_in_table_line_index] = lines[
                        first_note_in_table_line_index
                    ].replace(
                        text_to_be_replaced, note_string
                    )
                    ].replace(text_to_be_replaced, note_string)
            note_in_table_counter = 0
            first_note_in_table_line_index = -1

@@ -537,12 +579,20 @@ def auto_number_content(

    ### The numbering logic above only fires at headings and tables, so notes/examples in the last heading/table may have been skipped; run it once more here.

    if example_counter >= 1 and first_example_line_index != -1 and "EXAMPLE" not in lines[first_example_line_index]:
    if (
        example_counter >= 1
        and first_example_line_index != -1
        and "EXAMPLE" not in lines[first_example_line_index]
    ):
        lines[
            first_example_line_index
        ] += f" EXAMPLE{' 1' if example_counter > 1 else ''}:"

    if note_counter >= 1 and first_note_line_index != -1 and "NOTE" not in lines[first_note_line_index]:
    if (
        note_counter >= 1
        and first_note_line_index != -1
        and "NOTE" not in lines[first_note_line_index]
    ):
        lines[first_note_line_index] += f" NOTE{' 1' if note_counter > 1 else ''}:"

    if note_in_table_counter >= 1 and first_note_in_table_line_index != -1:
@@ -555,9 +605,7 @@ def auto_number_content(
            text_to_be_replaced = text_to_be_replaced + " " * diff_in_length
            lines[first_note_in_table_line_index] = lines[
                first_note_in_table_line_index
            ].replace(
                text_to_be_replaced, note_string
            )
            ].replace(text_to_be_replaced, note_string)

    file_contents = "\n".join(lines) + "\n"
    return file_contents
@@ -569,18 +617,47 @@ def add_ids_to_references(file_contents: str, filename: str):
    def handle_references(file_contents: str, filename: str):
        """Re-number reference definitions and give each one an anchor span.

        Only files listed in ``files_with_references`` are rewritten; any other
        file is returned untouched. In a references file, every ``[i.<id>]`` /
        ``[n.<id>]`` that opens a line is replaced by
        ``<span id="<new>" />[<new>]``, where ``<new>`` is ``i.1, i.2, ...`` or
        ``n.1, n.2, ...`` in order of appearance. Each old-id -> new-id pair is
        added to the JSON mapping stored at ``REFERENCE_MAPPING_MD_TO_HTML``.

        Args:
            file_contents: Markdown contents of the file.
            filename: Name of the file, with or without the ``.md`` extension.

        Returns:
            The (possibly rewritten) file contents.
        """
        # Pattern for informative references with "i." prefix
        REF_REGEX_I = r"^\[(i\.[A-Za-z0-9]+)\]"

        # Pattern for normative references with "n." prefix
        REF_REGEX_N = r"^\[(n\.[A-Za-z0-9]+)\]"

        if filename.replace(".md", "") not in files_with_references:
            return file_contents

        # references clauses: add span with ids and record the renumbering
        with open(REFERENCE_MAPPING_MD_TO_HTML, "r") as ref_file:
            reference_mapping = json.load(ref_file)

        def renumber(pattern: str, prefix: str, contents: str) -> str:
            """Replace each match of pattern with the next ``<prefix>.<n>`` id."""
            next_index = 1

            def replace_ref(match: re.Match) -> str:
                nonlocal next_index
                new_ref = f"{prefix}.{next_index}"
                reference_mapping[match.group(1)] = new_ref
                next_index += 1
                return f'<span id="{new_ref}" />[{new_ref}]'

            # MULTILINE so "^" anchors at every line start, not only at the
            # start of the file.
            return re.sub(pattern, replace_ref, contents, flags=re.MULTILINE)

        file_contents = renumber(REF_REGEX_I, "i", file_contents)
        file_contents = renumber(REF_REGEX_N, "n", file_contents)

        with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file:
            json.dump(reference_mapping, ref_file)

        return file_contents

    file_contents = handle_references(file_contents, filename)
@@ -621,6 +698,11 @@ def preprocess(
    clauses = DEFAULT_CLAUSES
    annexes = DEFAULT_ANNEXES

    # create REFERENCE_MAPPING_MD_TO_HTML file locally if it doesn't exist
    if not os.path.exists(REFERENCE_MAPPING_MD_TO_HTML):
        with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file:
            json.dump({}, ref_file, indent=4)

    if file_order_json:
        with open(file_order_json, "r") as file:
            json_data = json.load(file)
@@ -674,7 +756,11 @@ def preprocess(
                # print(
                #     f"Warning: Could not preprocess {input_path}. It may not be a valid UTF-8 text file or is missing."
                # )
                if e.args[0] == "DIV_DELINEATOR_ERROR" or e.args[0] == "NOTE_NUMBERING_ERROR" or e.args[0] == "EXAMPLE_NUMBERING_ERROR":
                if (
                    e.args[0] == "DIV_DELINEATOR_ERROR"
                    or e.args[0] == "NOTE_NUMBERING_ERROR"
                    or e.args[0] == "EXAMPLE_NUMBERING_ERROR"
                ):
                    # delete all files that start with --preprocessed--
                    for f in os.listdir(src):
                        if f.startswith("--preprocessed--"):