fix: tags all references when there are more occurrences in the same paragraph (8dd82672) · Commits · CIM - Context Information Management / NGSI-LD API

md_to_docx_converter/src/to_html/postprocessing.py

+33 −26

Original line number	Diff line number	Diff line
		@@ -319,23 +319,34 @@ def add_links_to_references_in_text(soup):
		REF_REGEX_N = r"(?<!\[)\[(n\.[A-Za-z0-9]+)\]"

		def insert_link_with_reference(
		content, tag_contents, internal_match, is_informative
		content, is_informative
		):
		if content.parent is None:
		return

		before_text = str(content).split(tag_contents)[0]
		after_text = str(content).split(tag_contents)[1]
		tag_contents = tag_contents.replace("[n.", "[") # normative can't have a prefix, this won't affect informative
		opening_bracket_index = content.find("[")
		closing_bracket_index = content.find("]") + 1

		if opening_bracket_index > 0:
		before_text = content[:opening_bracket_index]
		else:
		before_text = ""

		if closing_bracket_index < len(content):
		after_text = content[closing_bracket_index:]
		else:
		after_text = ""

		internal_text = content[opening_bracket_index+1:closing_bracket_index-1]

		# prepare the new <a> tag
		link = (
		f"{informative_file}.html#{internal_match}"
		f"{informative_file}.html#{internal_text}"
		if is_informative
		else f"{normative_file}.html#{internal_match}"
		else f"{normative_file}.html#{internal_text}"
		)
		a = soup.new_tag("a", attrs={"href": link})
		a.append(f"{tag_contents}")
		a.append(f"[{internal_text.replace('n.', '')}]")
		content.replace_with(a)

		# Add any remaining text after the <a> tag
		@@ -345,25 +356,21 @@ def add_links_to_references_in_text(soup):
		def process_text_nodes(element):
		for content in list(element.contents):
		if isinstance(content, NavigableString):
		informative_match = re.search(REF_REGEX_I, str(content))
		normative_match = re.search(REF_REGEX_N, str(content))
		if informative_match:
		# get the content
		tag_contents = informative_match.group(0)
		internal_match = informative_match.group(1)
		split_content = content.split(" ")
		for part in split_content:
		element = NavigableString(part + " ")
		content.insert_before(element)
		if re.match(REF_REGEX_I, part):
		insert_link_with_reference(
		content, tag_contents, internal_match, is_informative=True
		element, is_informative=True
		)

		if normative_match:
		# get the content
		tag_contents = normative_match.group(0)
		internal_match = normative_match.group(1)
		elif re.match(REF_REGEX_N, part):
		insert_link_with_reference(
		content, tag_contents, internal_match, is_informative=False
		element, is_informative=False
		)
		content.extract()

		elif isinstance(content, Tag):
		elif isinstance(content, Tag) and not content.name in ["a", "code"]:
		process_text_nodes(content)

		for element in soup.find_all(["p", "div"]):