feat: re-number references when converting to html (803b02dc) · Commits · CIM - Context Information Management / NGSI-LD API

md_to_docx_converter/src/constants.py

+2 −0

Original line number	Diff line number	Diff line
		@@ -22,6 +22,8 @@ HANDLE_UNDERSCORE_CLASSES = [
		"HTML_Definition",
		]

		REFERENCE_MAPPING_MD_TO_HTML = "reference_mapping_md_to_html.json"


		# Consolidated Markdown file and path
		CONSOLIDATED_MD_NAME = "consolidated.md"

md_to_docx_converter/src/to_html/postprocessing.py

+92 −46

Original line number	Diff line number	Diff line
		import os, re, html
		import os, re, html, json
		from bs4 import BeautifulSoup, Tag, NavigableString

		from src.utils import (
		@@ -8,7 +8,7 @@ from src.utils import (
		p_warning,
		)

		from src.constants import ABBREVIATION_CLASS
		from src.constants import ABBREVIATION_CLASS, REFERENCE_MAPPING_MD_TO_HTML

		normative_file = "clause-2"
		informative_file = "clause-2"
		@@ -79,10 +79,15 @@ def unwrap_gt_lt_code_tags(soup: BeautifulSoup):
		for code in codes:
		if code.parent and code.parent.name == "pre":
		span_text_only_children = code.find_all(
		lambda tag: isinstance(tag, Tag) and tag.name == "span" and len(tag.contents) == 1 and isinstance(tag.contents[0], NavigableString)
		lambda tag: isinstance(tag, Tag)
		and tag.name == "span"
		and len(tag.contents) == 1
		and isinstance(tag.contents[0], NavigableString)
		)
		for child in span_text_only_children:
		raw_text = child.get_text().replace("`<", "<").replace(">`", ">")
		raw_text = (
		child.get_text().replace("`<", "<").replace(">`", ">")
		)
		text = NavigableString(html.unescape(raw_text))
		child.contents[0].replace_with(text)
		else:
		@@ -150,7 +155,6 @@ def format_references(soup: BeautifulSoup):
		# Add body contents
		for contents in list(paragraph.contents):
		body_span.append(contents)
		body_span.append(NavigableString("\n"))

		# Append spans to div, and div to references list
		parent_div.append(tag_span)
		@@ -303,11 +307,12 @@ def format_tables(soup: BeautifulSoup) -> BeautifulSoup:
		if len(tds) == len(tdsFirstRow):
		isNewRow = not isNewRow
		if isNewRow:
		existing_classes = tr.get('class', [])
		existing_classes = tr.get("class", [])
		if "bg-striped-row" not in existing_classes:
		tr['class'] = existing_classes + ['bg-striped-row']
		tr["class"] = existing_classes + ["bg-striped-row"]
		return soup


		def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
		ZA = soup.find_all("div", class_="ZA")
		try:
		@@ -316,7 +321,7 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
		try:
		# TITLE IN HEADER
		header = children[0]
		header['data-replace'] = 'WORKITEMNAME'
		header["data-replace"] = "WORKITEMNAME"
		except IndexError:
		print(p_warning("front-page is missing WORKITEMNAME information."))
		# Version in HEADER
		@@ -324,7 +329,7 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
		version = children[1]
		new_span = soup.new_tag("span")
		version.wrap(new_span)
		new_span['data-replace'] = 'VERSION_NO'
		new_span["data-replace"] = "VERSION_NO"
		except IndexError:
		print(p_warning("front-page is missing VERSION_NO information."))
		# DATE IN HEADER
		@@ -334,7 +339,7 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
		text = text.replace("(", "").replace(")", "")
		open_bracket = NavigableString("(")
		date_text = new_span = soup.new_tag("span")
		date_text['data-replace'] = 'DATE'
		date_text["data-replace"] = "DATE"
		date_text.string = text
		close_bracket = NavigableString(")")
		date.append(open_bracket)
		@@ -343,34 +348,37 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
		except IndexError:
		print(p_warning("front-page is missing DATE information."))
		except IndexError:
		print(p_warning("front-page is missing the section with WORKITEMNAME, VERSION_NO, and DATE information."))
		print(
		p_warning(
		"front-page is missing the section with WORKITEMNAME, VERSION_NO, and DATE information."
		)
		)

		ZT = soup.find_all("div", class_="ZT")
		try:
		# first is title
		ZT[0]['data-replace'] = '{{TITLE}}'
		ZT[0]["data-replace"] = "{{TITLE}}"
		except IndexError:
		print(p_warning("front-page is missing TITLE information."))
		try:
		# second is part
		ZT[1]['data-replace'] = '{{PART}}'
		ZT[1]["data-replace"] = "{{PART}}"
		except IndexError:
		print(p_warning("front-page is missing PART information."))
		try:
		# third is subpart
		ZT[2]['data-replace'] = '{{SUBPART}}'
		ZT[2]["data-replace"] = "{{SUBPART}}"
		except IndexError:
		print(p_warning("front-page is missing SUBPART information."))
		try:
		# fourth is release
		ZT[3]['data-replace'] = '{{RELEASE}}'
		ZT[3]["data-replace"] = "{{RELEASE}}"
		except IndexError:
		print(p_warning("front-page is missing RELEASE information."))


		ZB = soup.find_all("div", class_="ZB")
		try:
		ZB[0]['data-replace'] = '{{TYPEDOCUMENT}}'
		ZB[0]["data-replace"] = "{{TYPEDOCUMENT}}"
		except IndexError:
		print(p_warning("front-page is missing TYPEDOCUMENT information."))

		@@ -380,17 +388,21 @@ def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
		try:
		# SECOND is WORKITEM
		workitem = children[3]
		workitem['data-replace'] = '{{WORKITEM}}'
		workitem["data-replace"] = "{{WORKITEM}}"
		except IndexError:
		print(p_warning("front-page is missing WORKITEM information."))
		# FOURTH is KEYWORDS
		try:
		keywords = children[7]
		keywords['data-replace'] = '{{KEYWORDS}}'
		keywords["data-replace"] = "{{KEYWORDS}}"
		except IndexError:
		print(p_warning("front-page is missing KEYWORDS information."))
		except IndexError:
		print(p_warning("front-page is missing the section with WORKITEM and KEYWORDS information."))
		print(
		p_warning(
		"front-page is missing the section with WORKITEM and KEYWORDS information."
		)
		)

		return soup

		@@ -445,9 +457,13 @@ def add_links_to_references_in_text(soup):
		REG_REGEX = r"(?<!\[)\[(i\.\|n\.)?[A-Za-z0-9]+\]"

		def insert_link_with_reference(content, is_informative):
		with open(REFERENCE_MAPPING_MD_TO_HTML, "r") as ref_file:
		reference_mapping = json.load(ref_file)

		opening_bracket_index = content.find("[")
		closing_bracket_index = content.find("]") + 1
		internal_text = content[opening_bracket_index + 1 : closing_bracket_index - 1]
		internal_text = reference_mapping.get(internal_text, internal_text)

		# prepare the new <a> tag
		link = (
		@@ -603,6 +619,7 @@ def move_figure_id_to_FL_elements(soup: BeautifulSoup):

		return soup


		def shorten_toc_text(soup: BeautifulSoup):
		"""
		Remove informative/normative from TOC only
		@@ -625,6 +642,7 @@ def shorten_toc_text(soup: BeautifulSoup):

		return soup


		def fix_custom_tags(soup: BeautifulSoup):
		"""
		Fix custom tags in the HTML.
		@@ -634,20 +652,40 @@ def fix_custom_tags(soup: BeautifulSoup):
		image_extensions = [".png", ".jpg", ".jpeg", ".svg"]
		return not any(href.endswith(ext) for ext in image_extensions)

		def remove_trailing_punctuation(a: Tag, needle: str) -> str:
		def remove_leading_punctuation(a: Tag, needle: str) -> None:
		if not a["href"].startswith("#" + needle):
		index = a["href"].find(needle)
		if index != -1 and index > 1: # to account for the leading #
		new_href = f"#{a['href'][index:]}"
		a["href"] = new_href

		if not a.string.startswith(needle):
		index = a.string.find(needle)
		if index != -1 and index > 1:
		in_between_text = a["href"][1:index] # exclude the leading #
		new_a_text = a.string[index:]
		a.string = new_a_text
		a.insert_before(NavigableString(in_between_text))

		return

		def remove_trailing_punctuation(a: Tag, needle: str) -> None:
		if not a["href"].endswith(needle):
		# find index of last occurrence of needle
		index = a["href"].rfind(needle)
		if index != -1 and index + len(needle) < len(a["href"]):
		remaining_text = a["href"][index + len(needle):]
		new_href = a["href"][: index + len(needle)]
		# a["href"] = href
		string_index = a.string.rfind(needle)
		new_a_text = a.string[:string_index + len(needle)]
		# a.string = a.string[:string_index + len(needle)]
		a["href"] = new_href

		if not a.string.endswith(needle):
		# find index of last occurrence of needle
		index = a.string.rfind(needle)
		if index != -1 and index + len(needle) < len(a.string):
		remaining_text = a.string[index + len(needle) :]
		new_a_text = a.string[: index + len(needle)]
		a.string = new_a_text
		a.insert_after(NavigableString(remaining_text))
		return new_href, new_a_text
		return None, None
		return

		# Example: Change <custom-tag> to <div class="custom-tag">
		h1_tag = soup.find("h1", id=True)
		@@ -657,18 +695,19 @@ def fix_custom_tags(soup: BeautifulSoup):
		for a in a_tags:
		href = a.get("href", "")
		if href.find("+++below") != -1:
		new_href, new_a_text = remove_trailing_punctuation(a, "+++below")
		if new_href:
		href = new_href
		if new_a_text:
		a.string = new_a_text
		is_table = "Table" in href
		remove_leading_punctuation(a, "Table+++" if is_table else "Figure+++")
		remove_trailing_punctuation(a, "+++below")

		href = a.get("href", "")

		count_below = href.count("+++below")
		is_table = "Table" in href
		class_name = "TH" if is_table else "FL"
		next_nth_element = a
		for _ in range(count_below):
		next_nth_element = next_nth_element.find_next("div", class_=class_name, id=True)
		next_nth_element = next_nth_element.find_next(
		"div", class_=class_name, id=True
		)
		if next_nth_element:
		prefix = "Table" if is_table else "Figure"
		postfix = "+++below" * count_below
		@@ -688,17 +727,19 @@ def fix_custom_tags(soup: BeautifulSoup):
		)
		os._exit(1)
		elif href.find("+++above") != -1:
		new_href, new_a_text = remove_trailing_punctuation(a, "+++above")
		if new_href:
		href = new_href
		if new_a_text:
		a.string = new_a_text
		count_above = href.count("+++above")
		is_table = "Table" in href
		remove_leading_punctuation(a, "Table+++" if is_table else "Figure+++")
		remove_trailing_punctuation(a, "+++above")

		href = a.get("href", "")

		count_above = href.count("+++above")
		class_name = "TH" if is_table else "FL"
		next_nth_element = a
		for _ in range(count_above):
		next_nth_element = next_nth_element.find_previous("div", class_=class_name, id=True)
		next_nth_element = next_nth_element.find_previous(
		"div", class_=class_name, id=True
		)
		if next_nth_element:
		prefix = "Table" if is_table else "Figure"
		postfix = "+++above" * count_above
		@@ -725,6 +766,7 @@ def fix_custom_tags(soup: BeautifulSoup):
		a.string = a.string.replace("root", new_id_prefix)
		return soup


		def fix_lists(soup: BeautifulSoup):
		"""
		Fix lists that have been improperly nested due to markdown conversion.
		@@ -738,6 +780,7 @@ def fix_lists(soup: BeautifulSoup):

		return soup


		def extract_images_from_html(soup: BeautifulSoup) -> dict:
		"""
		Extracts image sources from the given HTML content.
		@@ -949,3 +992,6 @@ def postprocess(html_dir: str):

		with open(file_path, "w", encoding="utf-8") as html:
		html.write(contents)

		if os.path.exists(REFERENCE_MAPPING_MD_TO_HTML):
		os.remove(REFERENCE_MAPPING_MD_TO_HTML)

md_to_docx_converter/src/to_html/preprocessing.py

+136 −50

Original line number	Diff line number	Diff line
		@@ -11,6 +11,7 @@ from src.constants import (
		DIV_START_REGEX,
		DIV_END_REGEX,
		BAD_DIV_DELINEATOR_REGEX,
		REFERENCE_MAPPING_MD_TO_HTML,
		)

		from src.utils import (
		@@ -28,6 +29,7 @@ files_with_references = [NORMATIVE_REF_FILE, INFORMATIVE_REF_FILE]

		# region Helpers


		def undo_prettier_formatting(text: str) -> str:
		"""Undo any formatting changes made by Prettier to ensure the Markdown is in a more raw format for processing."""

		@@ -52,6 +54,7 @@ def undo_prettier_formatting(text: str) -> str:

		return new_text


		def run_format_checks(filename: str, file_lines: list[str]):
		"""Runs various checks on the Markdown file contents to ensure they are properly formatted. If any improper formatting is detected, display any fatal errors or warnings as necessary."""

		@@ -182,15 +185,20 @@ def run_format_checks(filename: str, file_lines: list[str]):
		check_divs()
		check_notes_and_examples()


		def remove_ignore_prettier_statements(text: str) -> str:
		"""Remove any existing <!-- prettier-ignore --> statements from the text to avoid duplication"""
		new_lines = []
		for line in text.split("\n"):
		if line.strip() != "<!-- prettier-ignore-start -->" and line.strip() != "<!-- prettier-ignore-end -->":
		if (
		line.strip() != "<!-- prettier-ignore-start -->"
		and line.strip() != "<!-- prettier-ignore-end -->"
		):
		new_lines.append(line)

		return "\n".join(new_lines)


		def add_divs_to_images_tables(text: str) -> str:
		"""Add divs around images and their captions, and tables captions to the ones defined using the ETSI guidelines."""
		file_lines = text.split("\n")
		@@ -220,9 +228,10 @@ def add_divs_to_images_tables(text : str) -> str:

		return "\n".join(new_file_lines) + "\n"


		def handle_less_than_greater_than_text(file_contents: str):
		"""Replace `<` and `>` with `&lt;` and `&gt;` respectively and wrap the whole section in single code ticks to allow the text to render in the HTML"""
		regex = r"\<(?!img\b\|span\b\|sup\|/sup)(.+?)\>"
		regex = r"\<(?!img\b\|span\b\|sup\|/sup\|mark\|/mark)(.+?)\>"
		replace = r"`<\1>`"
		table_regex = rf"\\|([^\|\n]?{regex}[^\|\n]?)\\|"

		@@ -288,16 +297,26 @@ def add_empty_lines_in_notes_and_examples(file_contents: str):
		line = file_lines[i]

		# opening of a note or example
		if line.startswith(">>> [!note]") or line.startswith(">>> [!tip]") or line.startswith("\| >>> [!note]"):
		if (
		line.startswith(">>> [!note]")
		or line.startswith(">>> [!tip]")
		or line.startswith("\| >>> [!note]")
		):
		new_file_lines.append(line)
		# Check if the next line exists and is not empty
		if i + 1 < len(file_lines) and file_lines[i + 1].strip() != "":
		if not line.startswith("\| >>> [!note]"):
		new_file_lines.append("") # Add an empty line only for notes/examples outside tables
		new_file_lines.append(
		""
		) # Add an empty line only for notes/examples outside tables
		else:
		if not line.startswith("+") and not line.endswith("+"):
		line_length = len(line) - 2 # Subtract 2 for the "\|" at the start and end
		new_file_lines.append("\|" + " " * line_length + "\|") # Add an empty line
		line_length = (
		len(line) - 2
		) # Subtract 2 for the "\|" at the start and end
		new_file_lines.append(
		"\|" + " " * line_length + "\|"
		) # Add an empty line

		# closing of a note or example
		elif line.find(">>>") != -1 or line.find("\| >>>") != -1:
		@@ -305,26 +324,43 @@ def add_empty_lines_in_notes_and_examples(file_contents: str):
		line_before = file_lines[i - 1] if i > 0 else ""
		empty_line_regex = r"^\s*$"
		empty_table_row_regex = r"^\\|\s*\\|$"
		if not re.match(empty_line_regex, line_before) and not line_before.startswith("\| "):
		new_file_lines.append("") # Add an empty line before any other blockquote
		elif not re.match(empty_table_row_regex, line) and line_before.startswith("\| "):
		line_length = len(line) - 2 # Subtract 2 for the "\|" at the start and end
		new_file_lines.append("\|" + " " * line_length + "\|") # Add an empty line before any other blockquote in a table
		if not re.match(
		empty_line_regex, line_before
		) and not line_before.startswith("\| "):
		new_file_lines.append(
		""
		) # Add an empty line before any other blockquote
		elif not re.match(empty_table_row_regex, line) and line_before.startswith(
		"\| "
		):
		line_length = (
		len(line) - 2
		) # Subtract 2 for the "\|" at the start and end
		new_file_lines.append(
		"\|" + " " * line_length + "\|"
		) # Add an empty line before any other blockquote in a table

		new_file_lines.append(line)

		# check after the line
		if not line.startswith("\| >>>"): # we are not in a table
		new_file_lines.append("") # Add an empty line after any other blockquote
		new_file_lines.append(
		""
		) # Add an empty line after any other blockquote
		elif line.startswith("\| >>>"):
		if not line.startswith("+-") and not line.startswith("+="):
		line_length = len(line) - 2 # Subtract 2 for the "\|" at the start and end
		new_file_lines.append("\|" + " " * line_length + "\|") # Add an empty line
		line_length = (
		len(line) - 2
		) # Subtract 2 for the "\|" at the start and end
		new_file_lines.append(
		"\|" + " " * line_length + "\|"
		) # Add an empty line
		else:
		new_file_lines.append(line)
		i += 1
		return "\n".join(new_file_lines) + "\n"


		# Used to keep track of clause numbers across multiple levels when auto-numbering
		clauses_counters = [0] * MAX_HEADING_LEVEL
		clauses_counters[0] = 3 # first 3 clauses are taken by mandatory files
		@@ -461,9 +497,7 @@ def auto_number_content(
		# ensure we keep the table formatting by adding spaces at the end of the line if needed
		if diff_in_length > 0:
		text_to_replace = text_to_replace + " " * diff_in_length
		new_line = line.replace(
		text_to_replace, new_text
		)
		new_line = line.replace(text_to_replace, new_text)
		return new_line

		# take line and line number and replace the line number
		@@ -481,14 +515,22 @@ def auto_number_content(
		new_line, new_heading = auto_number_heading(new_line)
		previous_heading = new_heading

		if example_counter >= 1 and first_example_line_index != -1 and "EXAMPLE" not in lines[first_example_line_index]:
		if (
		example_counter >= 1
		and first_example_line_index != -1
		and "EXAMPLE" not in lines[first_example_line_index]
		):
		lines[
		first_example_line_index
		] += f" EXAMPLE{' 1' if example_counter > 1 else ''}:"
		example_counter = 0
		first_example_line_index = -1

		if note_counter >= 1 and first_note_line_index != -1 and "NOTE" not in lines[first_note_line_index]:
		if (
		note_counter >= 1
		and first_note_line_index != -1
		and "NOTE" not in lines[first_note_line_index]
		):
		lines[
		first_note_line_index
		] += f" NOTE{' 1' if note_counter > 1 else ''}:"
		@@ -512,7 +554,9 @@ def auto_number_content(
		new_line = auto_number_table(new_line)

		if note_in_table_counter >= 1 and first_note_in_table_line_index != -1:
		note_string = f"\| >>> [!note] NOTE{' 1' if note_in_table_counter > 1 else ''}:"
		note_string = (
		f"\| >>> [!note] NOTE{' 1' if note_in_table_counter > 1 else ''}:"
		)
		note_string_length = len(note_string)
		text_to_be_replaced = "\| >>> [!note]"
		text_to_be_replaced_length = len(text_to_be_replaced)
		@@ -521,9 +565,7 @@ def auto_number_content(
		text_to_be_replaced = text_to_be_replaced + " " * diff_in_length
		lines[first_note_in_table_line_index] = lines[
		first_note_in_table_line_index
		].replace(
		text_to_be_replaced, note_string
		)
		].replace(text_to_be_replaced, note_string)
		note_in_table_counter = 0
		first_note_in_table_line_index = -1

		@@ -537,12 +579,20 @@ def auto_number_content(

		### Re-run the logic that adds the number to examples and notes, since it may not have run for every case: it only triggers at specific points, so an element located in the last heading/table may otherwise be skipped.

		if example_counter >= 1 and first_example_line_index != -1 and "EXAMPLE" not in lines[first_example_line_index]:
		if (
		example_counter >= 1
		and first_example_line_index != -1
		and "EXAMPLE" not in lines[first_example_line_index]
		):
		lines[
		first_example_line_index
		] += f" EXAMPLE{' 1' if example_counter > 1 else ''}:"

		if note_counter >= 1 and first_note_line_index != -1 and "NOTE" not in lines[first_note_line_index]:
		if (
		note_counter >= 1
		and first_note_line_index != -1
		and "NOTE" not in lines[first_note_line_index]
		):
		lines[first_note_line_index] += f" NOTE{' 1' if note_counter > 1 else ''}:"

		if note_in_table_counter >= 1 and first_note_in_table_line_index != -1:
		@@ -555,9 +605,7 @@ def auto_number_content(
		text_to_be_replaced = text_to_be_replaced + " " * diff_in_length
		lines[first_note_in_table_line_index] = lines[
		first_note_in_table_line_index
		].replace(
		text_to_be_replaced, note_string
		)
		].replace(text_to_be_replaced, note_string)

		file_contents = "\n".join(lines) + "\n"
		return file_contents
		@@ -569,18 +617,47 @@ def add_ids_to_references(file_contents: str, filename: str):
		def handle_references(file_contents: str, filename: str):
		"""Make sure references are correctly escaped."""
		# Pattern for informative references with "i." prefix
		REF_REGEX_I = r"\[(i\.[A-Za-z0-9]+)\]"
		REF_REGEX_I = r"^\[(i\.[A-Za-z0-9]+)\]"

		# Pattern for normative references without "i." prefix
		REF_REGEX_N = r"\[(n\.[A-Za-z0-9]+)\]"
		# Pattern for normative references with "n." prefix
		REF_REGEX_N = r"^\[(n\.[A-Za-z0-9]+)\]"

		if (
		filename.replace(".md", "") in files_with_references
		): # references clauses, add span with ids
		REF_REPLACE_I = r'<span id="\1" />\[\1\]'
		REF_REPLACE_N = r'<span id="\1" />\[\1\]'
		file_contents = re.sub(REF_REGEX_I, REF_REPLACE_I, file_contents)
		file_contents = re.sub(REF_REGEX_N, REF_REPLACE_N, file_contents)

		with open(REFERENCE_MAPPING_MD_TO_HTML, "r") as ref_file:
		reference_mapping = json.load(ref_file)

		normative_index = 1
		informative_index = 1

		def replace_informative_ref(match: re.Match) -> str:
		nonlocal informative_index
		ref_id = match.group(1)
		new_ref = f"i.{informative_index}"
		reference_mapping[ref_id] = new_ref
		informative_index += 1
		return f'<span id="{new_ref}" />[{new_ref}]'

		def replace_normative_ref(match: re.Match) -> str:
		nonlocal normative_index
		ref_id = match.group(1)
		new_ref = f"n.{normative_index}"
		reference_mapping[ref_id] = new_ref
		normative_index += 1
		return f'<span id="{new_ref}" />[{new_ref}]'

		file_contents = re.sub(
		REF_REGEX_I, replace_informative_ref, file_contents, flags=re.MULTILINE
		)
		file_contents = re.sub(
		REF_REGEX_N, replace_normative_ref, file_contents, flags=re.MULTILINE
		)

		with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file:
		json.dump(reference_mapping, ref_file)

		return file_contents

		file_contents = handle_references(file_contents, filename)
		@@ -621,6 +698,11 @@ def preprocess(
		clauses = DEFAULT_CLAUSES
		annexes = DEFAULT_ANNEXES

		# create REFERENCE_MAPPING_MD_TO_HTML file locally if it doesn't exist
		if not os.path.exists(REFERENCE_MAPPING_MD_TO_HTML):
		with open(REFERENCE_MAPPING_MD_TO_HTML, "w") as ref_file:
		json.dump({}, ref_file, indent=4)

		if file_order_json:
		with open(file_order_json, "r") as file:
		json_data = json.load(file)
		@@ -674,7 +756,11 @@ def preprocess(
		# print(
		# f"Warning: Could not preprocess {input_path}. It may not be a valid UTF-8 text file or is missing."
		# )
		if e.args[0] == "DIV_DELINEATOR_ERROR" or e.args[0] == "NOTE_NUMBERING_ERROR" or e.args[0] == "EXAMPLE_NUMBERING_ERROR":
		if (
		e.args[0] == "DIV_DELINEATOR_ERROR"
		or e.args[0] == "NOTE_NUMBERING_ERROR"
		or e.args[0] == "EXAMPLE_NUMBERING_ERROR"
		):
		# delete all files that start with --preprocessed--
		for f in os.listdir(src):
		if f.startswith("--preprocessed--"):