feat: support ETSI guidelines for images and captions (52291eb5) · Commits · CIM - Context Information Management / NGSI-LD API

md_to_docx_converter/md_to_html.lua

+115 −1

Original line number	Diff line number	Diff line
		@@ -80,6 +80,118 @@ local function wrap_table_cell_contents(el, default_class) --- Apply TAH or tagg
		return el
		end

		--- Wrap image paragraphs found in table cells in a Div with the FL class.
		-- A paragraph counts as an image paragraph when its first inline element is
		-- an Image. Divs inside a cell are searched recursively; every other block
		-- is kept as it is. Cells of the table head and of every table body are
		-- processed.
		-- @param el pandoc Table element; its cells are updated in place
		-- @return the same Table element
		local function wrap_images_in_table(el)
		  -- Build a new block list in which each image paragraph is replaced by
		  -- an FL Div containing it.
		  local function process_content(content)
		    local result = {}
		    for _, item in ipairs(content) do
		      local out = item
		      if item.t == "Para" then
		        -- Only the first inline decides; a paragraph's inlines can never
		        -- hold a Para or Div, so there is nothing to recurse into here.
		        local first = item.content[1]
		        if first and first.t == "Image" then
		          out = pandoc.Div({ item }, pandoc.Attr("", { "FL" }))
		        end
		      elseif item.t == "Div" then
		        item.content = process_content(item.content)
		      end
		      result[#result + 1] = out
		    end
		    return result
		  end

		  -- Apply process_content to every cell of the given rows (nil-safe).
		  local function wrap_rows(rows)
		    for _, row in ipairs(rows or {}) do
		      for i, cell in ipairs(row.cells) do
		        cell.content = process_content(cell.content)
		        row.cells[i] = cell
		      end
		    end
		  end

		  if el.head then
		    wrap_rows(el.head.rows)
		  end

		  for _, body in ipairs(el.bodies or {}) do
		    wrap_rows(body.body)
		  end

		  return el
		end

		--- Wrap figure captions found in table cells in a Div with the TF class.
		-- A caption is a paragraph whose first inline element is a Strong whose
		-- text starts with "Figure". The caption is flattened to plain text (so the
		-- bold markup is dropped) and placed in a new paragraph inside the TF Div.
		-- Divs inside a cell are searched recursively; every other block is kept as
		-- it is. Cells of the table head and of every table body are processed.
		-- @param el pandoc Table element; its cells are updated in place
		-- @return the same Table element
		local function wrap_images_captions_in_table(el)
		  -- True when `block` is a Para opening with bold text that starts with "Figure".
		  local function is_figure_caption(block)
		    if block.t ~= "Para" then
		      return false
		    end
		    local first = block.content[1]
		    if not first or first.t ~= "Strong" then
		      return false
		    end
		    return pandoc.utils.stringify(first):match("^Figure") ~= nil
		  end

		  -- Build a new block list in which each figure caption is replaced by
		  -- a TF Div holding its plain-text paragraph.
		  local function process_content(content)
		    local result = {}
		    for _, item in ipairs(content) do
		      local out = item
		      if is_figure_caption(item) then
		        local caption = pandoc.Para(pandoc.utils.stringify(item))
		        out = pandoc.Div({ caption }, pandoc.Attr("", { "TF" }))
		      elseif item.t == "Div" then
		        item.content = process_content(item.content)
		      end
		      result[#result + 1] = out
		    end
		    return result
		  end

		  -- Apply process_content to every cell of the given rows (nil-safe).
		  local function wrap_rows(rows)
		    for _, row in ipairs(rows or {}) do
		      for i, cell in ipairs(row.cells) do
		        cell.content = process_content(cell.content)
		        row.cells[i] = cell
		      end
		    end
		  end

		  if el.head then
		    wrap_rows(el.head.rows)
		  end

		  for _, body in ipairs(el.bodies or {}) do
		    wrap_rows(body.body)
		  end

		  return el
		end

		local function handle_ex_no_tan(el) --- Preserves the main structure of Examples and Notes - that is, two paragraphs within the div with either the EX, the NO, or the TAN class. To do so, consolidate all the body text within a single paragraph.
		local tag = nil
		local text_fragments = {}
		@@ -112,6 +224,8 @@ end

		--- Filter entry point for Table elements.
		-- Runs three passes over the table, in this order:
		--   1. give cell contents the default TAL class (tables not wrapped in a div),
		--   2. wrap images in table cells in a div with the FL class,
		--   3. wrap figure captions in table cells in a div with the TF class.
		function Table(el)
		  local with_cell_classes = wrap_table_cell_contents(el, "TAL")
		  local with_images = wrap_images_in_table(with_cell_classes)
		  local with_captions = wrap_images_captions_in_table(with_images)

		  return with_captions
		end

md_to_docx_converter/md_to_html_3.lua

+5 −3

Original line number	Diff line number	Diff line
		@@ -45,8 +45,8 @@ function CustomTagsToLinks(el, prefix)

		local remaining_text = child.text
		local start_index = remaining_text:find(prefix .. "+++")
		-- logfile:write("DEBUG: Paragrafo con contenuto -> " .. pandoc.utils.stringify(child) .. "\n")
		-- logfile:write("DEBUG: Paragrafo con contenuto -> " .. prefix .. "+++" .. "\n")
		-- logfile:write("DEBUG: Par with content -> " .. pandoc.utils.stringify(child) .. "\n")
		-- logfile:write("DEBUG: Par with content -> " .. prefix .. "+++" .. "\n")
		-- logfile:flush()

		if start_index > 1 then
		@@ -78,7 +78,9 @@ function CustomTagsToLinks(el, prefix)

		local id_prefix = ""
		local html_filename = ""
		if prefix ~= "Clause" then
		if prefix == "Figure" and #parts == 2 and (id:find("^[%w_-]+%.png$") or id:find("^[%w_-]+%.jpg$") or id:find("^[%w_-]+%.jpeg$") or id:find("^[%w_-]+%.svg$")) then
		id_prefix = "Figure+++"
		elseif prefix ~= "Clause" then
		id_prefix = prefix .. "_"
		end
		if filename then

md_to_docx_converter/src/to_html/postprocessing.py

+113 −24

Original line number	Diff line number	Diff line
		@@ -196,7 +196,8 @@ def format_examples_and_notes(soup: BeautifulSoup):

		def get_label_text_and_class(para: Tag):
		"""Get the label text from the paragraph and determine the class to assign to the div"""
		text = para.get_text()
		text = para.contents[0].split(":")[0] + ":"
		remaining_text = para.contents[0].split(": ")[1] if ": " in para.contents[0] else ""
		cls = ""

		if "[!tip]" in text:
		@@ -204,9 +205,12 @@ def format_examples_and_notes(soup: BeautifulSoup):
		else:
		cls = "TAN" if para.find_parent("td") else "NO"

		text = text.replace("[!note]", "").replace("[!tip]", "").strip()
		text = text.replace("[!note] ", "").replace("[!tip] ", "")

		return text, cls
		remaining_contents = para.contents[1:]
		if remaining_text:
		remaining_contents.insert(0, NavigableString(remaining_text))
		return text, cls, remaining_contents

		# Take only the top-level blockquotes to simplify logic
		blockquotes = [
		@@ -219,7 +223,7 @@ def format_examples_and_notes(soup: BeautifulSoup):
		if not label_para:
		continue

		label_text, label_class = get_label_text_and_class(label_para)
		label_text, label_class, remaining_contents = get_label_text_and_class(label_para)

		new_parent_div = soup.new_tag("div", attrs={"class": label_class})

		@@ -234,7 +238,12 @@ def format_examples_and_notes(soup: BeautifulSoup):

		# Process body
		body_div = soup.new_tag("div")

		if remaining_contents: # this happens when there is not an empty line after the [!note] or [!tip], better to do here because in md we have gridtables to take care of
		para_container = soup.new_tag("p")
		for content in remaining_contents:
		para_container.append(content)
		body_div.append(para_container)
		else:
		current = blockquote.next_sibling

		while current:
		@@ -413,11 +422,23 @@ def remove_links_from_labels(soup: BeautifulSoup):
		for label in labels:
		a_tag = label.find("a")
		if a_tag:
		id = a_tag.get("href").split("#")[-1]
		label.attrs["id"] = id
		a_tag.unwrap()
		return soup

		def add_ids_to_labels(soup: BeautifulSoup):
		    """
		    Add ids to label elements if they don't have one.

		    Every ``div`` with class ``TF`` or ``TH`` that has no ``id`` gets one
		    derived from its text: the second space-separated word before the first
		    colon is used as the number, giving ``Figure_<number>`` when the text
		    starts with "Figure" and ``Table_<number>`` when it starts with "Table".
		    Labels with any other text, or without a second word before the colon,
		    are left untouched.

		    Args:
		        soup: Parsed document; its label divs are modified in place.

		    Returns:
		        The same ``soup`` object.
		    """
		    for label in soup.find_all("div", class_=["TF", "TH"]):
		        if label.get("id"):
		            continue

		        label_text = label.get_text().strip()
		        words = label_text.split(":")[0].split(" ")
		        if len(words) < 2:
		            # Nothing that looks like "<kind> <number>" (e.g. an empty caption):
		            # skip instead of failing with an IndexError.
		            continue

		        number = words[1]
		        if label_text.startswith("Figure"):
		            label.attrs["id"] = f"Figure_{number}"
		        elif label_text.startswith("Table"):
		            label.attrs["id"] = f"Table_{number}"
		    return soup

		def move_figure_id_to_FL_elements(soup: BeautifulSoup):
		"""
		@@ -487,7 +508,55 @@ def fix_custom_tags(soup: BeautifulSoup):
		os._exit(1)
		return soup

		# endregion
		def extract_images_from_html(soup: BeautifulSoup) -> tuple[dict, BeautifulSoup]:
		    """
		    Collect the image source of every identified figure div.

		    Each ``div`` with class ``FL`` that has a non-empty ``id`` and contains
		    an ``img`` contributes one entry: the div id mapped to the image ``src``
		    with every ``"media/"`` occurrence removed.

		    Args:
		        soup: Parsed document to scan; it is not modified.

		    Returns:
		        A ``(images_mapping, soup)`` tuple, where ``images_mapping`` maps
		        figure div ids to image sources and ``soup`` is the object passed in.
		    """
		    images_mapping = {}

		    for fig in soup.find_all("div", class_="FL"):
		        fig_id = fig.get("id", "")
		        if not fig_id:
		            continue

		        img = fig.find("img")
		        if img:
		            images_mapping[fig_id] = img.get("src", "").replace("media/", "")

		    return images_mapping, soup

		def add_custom_link_to_images(soup: BeautifulSoup, images_mapping: dict) -> BeautifulSoup:
		    """
		    Resolve ``Figure+++<filename>`` links to the figure they refer to.

		    For every ``a`` tag whose ``href`` contains ``"Figure+++"``, the text
		    between the first and second ``"+++"`` is taken as the image filename and
		    looked up in ``images_mapping``. The link's ``href`` becomes
		    ``<file>#<id>`` and its text becomes ``figure <number>``, where the
		    number is the part of the id after the first underscore (or the whole id
		    when it has no underscore).

		    Args:
		        soup: Parsed document; matching links are rewritten in place.
		        images_mapping: Maps an image filename to a dict with the keys
		            ``"file"`` (target HTML file) and ``"id"`` (figure element id).

		    Returns:
		        The same ``soup`` object.

		    Raises:
		        ValueError: If a referenced filename is not in ``images_mapping``.
		    """
		    # Look for links that match the pattern Figure+++<filename>
		    for a in soup.find_all("a"):
		        href = a.get("href", "")
		        if "Figure+++" not in href:
		            continue

		        # Extract the filename from the href
		        filename = href.split("+++")[1]
		        if filename not in images_mapping:
		            raise ValueError(f"ERROR: Image '{filename}' not found in images mapping. Are you sure it exists in the media folder and is used in the document?")

		        image_info = images_mapping[filename]
		        id_parts = image_info["id"].split("_")
		        # An id without an underscore has no separate number part; use the id
		        # itself rather than failing with an IndexError.
		        number = id_parts[1] if len(id_parts) > 1 else id_parts[0]

		        a["href"] = f"{image_info['file']}#{image_info['id']}"
		        a.string = f"figure {number}"

		    return soup


		def postprocess(html_dir: str):
		@@ -502,13 +571,8 @@ def postprocess(html_dir: str):
		### Arguments
		- `html_dir`:
		"""

		try:
		os.remove("filename_numbers_mapping.json")
		except FileNotFoundError:
		pass

		filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(html_dir)
		images_mapping = {}

		for filename in os.listdir(html_dir):
		if filename.endswith(".html"):
		@@ -544,8 +608,33 @@ def postprocess(html_dir: str):
		soup = format_examples_and_notes(soup)

		soup = remove_links_from_labels(soup)
		soup = add_ids_to_labels(soup)
		soup = move_figure_id_to_FL_elements(soup)
		soup = fix_custom_tags(soup)
		images, soup = extract_images_from_html(soup)
		for image_id, image_src in images.items():
		images_mapping[image_src] = {
		"id": image_id,
		"file": new_filename
		}

		contents = soup.decode_contents(formatter=None)

		with open(file_path, "w", encoding="utf-8") as html:
		html.write(contents)

		for filename in os.listdir(html_dir):
		if filename.endswith(".html"):
		file_path = os.path.join(html_dir, filename)
		with open(file_path, "r", encoding="utf-8") as html:
		soup = BeautifulSoup(html, "html.parser")

		try:
		soup = add_custom_link_to_images(soup, images_mapping)
		except ValueError as e:
		print(p_error(f"Error in file {filename}:"))
		print(p_error(str(e)))
		os._exit(1)

		contents = soup.decode_contents(formatter=None)

md_to_docx_converter/src/to_html/preprocessing.py

+31 −0

Original line number	Diff line number	Diff line
		@@ -115,6 +115,35 @@ def run_format_checks(filename: str, file_lines: list[str]):
		check_divs()


		def add_divs_to_images_tables(text: str) -> str:
		    """Add divs around images and their captions, and tables captions to the ones defined using the ETSI guidelines.

		    The text is processed line by line:

		    - a line that is exactly a markdown image (``![...](...)``) is wrapped
		      in a ``::: FL`` fenced div;
		    - a line starting with ``**Figure`` is wrapped in a ``::: TF`` fenced
		      div, with every ``**`` removed from it;
		    - a line starting with ``**Table`` is wrapped in a ``::: TH`` fenced
		      div, with every ``**`` removed from it;
		    - any other line is kept unchanged.

		    Args:
		        text: Markdown source.

		    Returns:
		        The processed lines joined with ``"\\n"`` plus one trailing newline.
		    """
		    table_caption = re.compile(r"^\*\*Table")
		    image_caption = re.compile(r"^\*\*Figure")
		    # The whole line must be a single image definition: ![alt](target)
		    image_def = re.compile(r"^!\[.*\]\(.*\)$")

		    new_file_lines = []
		    for line in text.split("\n"):
		        if image_def.match(line):
		            div_class, body = "FL", line
		        elif image_caption.match(line):
		            div_class, body = "TF", line.replace("**", "")
		        elif table_caption.match(line):
		            div_class, body = "TH", line.replace("**", "")
		        else:
		            new_file_lines.append(line)
		            continue

		        new_file_lines.extend((f"::: {div_class}", body, ":::"))

		    return "\n".join(new_file_lines) + "\n"

		def handle_less_than_greater_than_text(file_contents: str):
		"""Replace `<` and `>` with `<` and `>` respectively and wrap the whole section in single code ticks to allow the text to render in the HTML"""
		regex = r"\<(?!img\b\|span\b\|sup\|/sup)(.+?)\>"
		@@ -465,6 +494,8 @@ def preprocess(

		run_format_checks(filename, text.splitlines())

		text = add_divs_to_images_tables(text)

		if filename in clauses_filenames:
		text = auto_number_content(text, "clauses")
		filename_numbers_mapping[filename_without_extension] = (

md_to_docx_converter/src/to_md/cleaning.py

+46 −0

Original line number	Diff line number	Diff line
		@@ -43,6 +43,7 @@ def ensure_correct_css_class_use(soup: BeautifulSoup):
		"ondemand_CHAR_size_9_color_000000",
		"ondemand_CHAR_size_12_color_000000",
		"example-title",
		"block"
		]

		divs_to_unwrap = soup.find_all(
		@@ -81,6 +82,10 @@ def ensure_correct_css_class_use(soup: BeautifulSoup):
		"ondemand_CHAR_name_Times_New_Roman_size_10_color_595959",
		"HTML_Keyboard",
		],
		[
		"ETSI-code_Char",
		"HTML_Sample"
		],
		]

		for i in range(len(class_pairs)):
		@@ -1128,6 +1133,43 @@ def fix_references(soup: BeautifulSoup):
		text_element.replace_with(new_text)
		return soup

		def remove_style_from_images(soup: BeautifulSoup):
		    """Removes the ``style`` and ``alt`` attributes from images and drops styled wrapper spans.

		    For every ``img``: the ``style`` and ``alt`` attributes are deleted when
		    present. If the image's parent is a ``span`` with a ``style`` attribute
		    and the image is its only child apart from whitespace-only strings, the
		    span is replaced by the image. A styled span that holds anything else
		    next to the image is kept, so that content is not lost.

		    Args:
		        soup: Parsed document; modified in place.

		    Returns:
		        The same ``soup`` object.
		    """
		    for img in soup.find_all("img"):
		        for attr in ("style", "alt"):
		            if img.has_attr(attr):
		                del img[attr]

		        parent_tag = img.parent
		        if not (parent_tag and parent_tag.name == "span" and parent_tag.has_attr("style")):
		            continue

		        # Ignore whitespace-only text nodes when deciding whether the span
		        # contains nothing but the image.
		        meaningful = [
		            child
		            for child in parent_tag.contents
		            if not (isinstance(child, str) and not child.strip())
		        ]
		        if len(meaningful) == 1 and meaningful[0] is img:
		            parent_tag.replace_with(img)

		    return soup

		def handle_figures_tables_structure(soup: BeautifulSoup):
		    """Ensures that figures and tables captions are properly structured to follow ETSI guidelines

		    Handles every ``div`` whose class list is exactly ``FL``, ``TF`` or ``TH``:

		    - ``FL``: when the div holds a single ``img`` (ignoring whitespace
		      strings), the div is replaced by that image;
		    - ``TF`` / ``TH``: when the div has text, it is replaced by a ``p``
		      whose text is the caption surrounded by ``**``.

		    Args:
		        soup: Parsed document; modified in place.

		    Returns:
		        The same ``soup`` object.
		    """
		    fl_divs = soup.find_all("div", class_="FL")
		    tf_divs = soup.find_all("div", class_="TF")
		    th_divs = soup.find_all("div", class_="TH")

		    for div in fl_divs + tf_divs + th_divs:
		        classes = div.get("class")

		        if classes == ["FL"]:
		            div_contents = [content for content in div.contents if not is_whitespace_navstr(content)]
		            if len(div_contents) == 1 and isinstance(div_contents[0], Tag) and div_contents[0].name == "img":
		                div.replace_with(div_contents[0])
		        elif classes == ["TF"] or classes == ["TH"]:
		            text = div.get_text(strip=True)
		            if text:
		                # add leading and trailing ** to the text in the div
		                new_tag = soup.new_tag("p")
		                new_tag.string = f"**{text}**"
		                div.replace_with(new_tag)

		    return soup
		# endregion


		@@ -1168,4 +1210,8 @@ def cleaning(soup: BeautifulSoup, css_src: bool):

		soup = fix_references(soup)

		soup = remove_style_from_images(soup)

		soup = handle_figures_tables_structure(soup)

		return soup