fix: remove badges when converting to docx (a14f8197) · Commits · CIM - Context Information Management / NGSI-LD API

md_to_docx_converter/customized_reference.docx

+178 B (27.5 KiB)

File changed.

No diff preview for this file type.

md_to_docx_converter/src/constants.py

+1 −1

Original line number	Diff line number	Diff line
		@@ -61,7 +61,7 @@ REFERENCE_DOC = "customized_reference.docx"
		OUTPUT_DOC_NAME = "document.docx"

		# Classes for examples and notes
		EXAMPLE_NOTE_CLASSES = ["EX", "NO", "TAN"]
		EXAMPLE_NOTE_CLASSES = ["EX", "NO", "TAN", "Source Code"]

		# HTML tags to look for nested in examples and notes - Pandoc doesn't handle these well, so they need to be handled
		BOLD_TAGS = ["strong", "b"]

md_to_docx_converter/src/to_docx/postprocessing.py

+8 −2

Original line number	Diff line number	Diff line
		@@ -255,13 +255,19 @@ def format_examples_and_notes(doc: Doc):
		if in_example_or_note:
		# Continue example or note
		if paragraph.style.name in EXAMPLE_NOTE_CLASSES:
		# Still in example or note
		if not paragraph.text.startswith("\t"):
		if not paragraph.text.startswith("\t") and paragraph.style.name != "Source Code":
		paragraph.text = f"\t{paragraph.text}"
		else:
		# No longer in example or note
		in_example_or_note = False

		if paragraph.style.name == "Source Code":
		if in_example_or_note == True:
		paragraph.style = "EX Source Code"
		# apply to all its runs the HTML-Sample style with no space suffix
		for run in paragraph.runs:
		run.style = "HTML-Sample"

		document_paragraphs = list(iter_paragraphs(doc))

		handle_paragraphs(document_paragraphs)

md_to_docx_converter/src/to_docx/preprocessing.py

+63 −45

Original line number	Diff line number	Diff line
		import code
		import copy
		import os
		import re
		@@ -16,11 +17,7 @@ from src.constants import (
		WORD_A4_MAX_HEIGHT_PIXEL,
		WORD_A4_MAX_WIDTH_PIXEL,
		)
		from src.utils import (
		combine_biu_classes,
		handle_html_consolidation,
		p_warning
		)
		from src.utils import combine_biu_classes, handle_html_consolidation, p_warning


		# region Helpers
		@@ -106,6 +103,22 @@ def remove_pandoc_toc(soup: BeautifulSoup):
		return soup


		def remove_badges(soup: BeautifulSoup):
		    """
		    Replace badge elements with plain text, since badges are not needed in the Docx.

		    Every element matched by the `.inform, .norm` CSS selector is swapped for a
		    text node holding its text content. The text is wrapped in parentheses
		    unless it already starts with "(" or already ends with ")".

		    Returns the same soup object, modified in place.
		    """
		    for badge in soup.select(".inform, .norm"):
		        label = badge.get_text()

		        # Only wrap when neither edge already carries a parenthesis
		        has_paren_edge = label.startswith("(") or label.endswith(")")
		        if not has_paren_edge:
		            label = f"({label})"

		        # Leave the plain text where the badge was, then drop the badge element itself
		        badge.insert_before(NavigableString(label))
		        badge.decompose()

		    return soup


		def compute_height_and_width_from_file(file_path: str):
		"""
		Computes the height and width of images in the document based on the provided file.
		@@ -216,6 +229,22 @@ def modify_links(soup: BeautifulSoup):
		return soup


		def handle_italic_monospace(soup: BeautifulSoup):
		    """
		    Collapse italic inline code into a single `HTML-Sample` span.

		    Every `<em>` element that has a `<code>` element as a direct child is replaced
		    by a `<span class="HTML-Sample">` containing only that code element's text.
		    Because the whole `<em>` is replaced, any other content it held is dropped.

		    Returns the same soup object, modified in place.
		    """
		    for emphasis in soup.find_all("em"):
		        # Only direct children count; deeper nested code elements are left alone
		        inline_code = emphasis.find("code", recursive=False)
		        if not inline_code:
		            continue

		        sample_span = soup.new_tag("span", attrs={"class": "HTML-Sample"})
		        sample_span.string = inline_code.get_text()
		        emphasis.replace_with(sample_span)

		    return soup


		def get_plaintext_from_codeblock(pre: Tag):
		"""
		Return a list of the lines of text contained in a code block.
		@@ -223,7 +252,9 @@ def get_plaintext_from_codeblock(pre: Tag):
		The text is retrieved from the provided `<pre>` tag's `<code>` child tag, which it always has. Preserves indentation by replacing any tabs with tab placeholders, which is necessary because Pandoc trims preceding whitespace.
		"""
		# There will only be one code tag inside the pre tag
		code = pre.find("code")
		code = pre.find("code", recursive=False)
		if not code:
		return [] # Nothing to do here

		# Get the direct children of the code tag. Each span contains an <a> tag and a series of spans representing a single line's worth of text.
		code_children = code.find_all("span", recursive=False)
		@@ -292,35 +323,22 @@ def handle_examples_and_notes(soup: BeautifulSoup):
		"""Apply the HTML Sample style to the individual lines and merge the first line with the tag"""
		# Get code blocks' lines
		pre = body.find_all("pre")[0]
		lines = get_plaintext_from_codeblock(pre)

		# Make a new div for the tag and the body
		consolidated_div = soup.new_tag("div")
		code = pre.find("code", recursive=False)
		if not code:
		return soup # Nothing to do here

		# Make the new paragraph for the first line, containing the label text and the first line of the code block
		label_text = NavigableString(
		f"{tag.get_text()}\t"
		) # Add tab for indentation

		first_body_span = soup.new_tag("span", attrs={"class": "HTML_Sample"})
		first_body_span.append(lines.pop(0))

		label_and_first_line_para = soup.new_tag("p")
		label_and_first_line_para.append(label_text)
		label_and_first_line_para.append(first_body_span)

		consolidated_div.append(label_and_first_line_para)

		# For the rest of the lines, add tabs to their beginnings and add them as subsequent paragraphs
		for line in lines:
		line_paragraph = soup.new_tag("p", attrs={"class": "HTML_Sample"})
		line_paragraph.append(line)
		consolidated_div.append(line_paragraph)

		tag.insert_before(consolidated_div)
		new_pre = soup.new_tag("pre")
		new_code = soup.new_tag("code")
		new_code.append(code.get_text())
		new_pre.append(new_code)
		tag.insert_before(new_pre)
		tag.decompose()
		body.decompose()

		return soup

		# Existing tag and (first or only) body element
		@@ -343,25 +361,19 @@ def handle_examples_and_notes(soup: BeautifulSoup):
		Ensure the code block has the correct indentation by prepending a tab placeholder
		"""
		pre = element.find_all("pre")[0]
		new_pre = soup.new_tag("pre")
		new_code = soup.new_tag("code")

		lines: list[str] = get_plaintext_from_codeblock(pre)

		codeblock_div = soup.new_tag("div", attrs={"class": "EX"})
		for line in lines:
		# Create suffix to tell whether paragraph should have space after it
		suffix = NO_SPACE
		if line == lines[-1]:
		suffix = WITH_SPACE

		line_paragraph = soup.new_tag(
		"p", attrs={"class": f"HTML_Sample/{suffix}"}
		) # This class/style name doesn't exist, but will be normalized later on in postprocessing.
		line_paragraph.append(line)
		code = pre.find("code", recursive=False)
		if not code:
		return soup # Nothing to do here

		codeblock_div.append(line_paragraph)
		raw = code.get_text()
		new_code.append(f"{raw}")
		new_pre.append(new_code)

		pre.parent.parent.append(codeblock_div)
		pre.decompose()
		element.insert_before(new_pre)
		element.decompose()

		return soup

		@@ -509,6 +521,8 @@ def convert_codeblock_styles_to_etsi(soup: BeautifulSoup):

		for pre in pres:
		lines: list[str] = get_plaintext_from_codeblock(pre)
		if len(lines) == 0:
		return soup # Nothing to do here

		new_codeblock = soup.new_tag("div")

		@@ -610,7 +624,9 @@ def prepare_table_cell_classes(soup: BeautifulSoup):
		para = soup.new_tag("p").append(child.get_text())
		child.replace_with(para)
		except ValueError:
		p_warning(f'Could not add child: {repr(child)} to paragraph in table cell')
		p_warning(
		f"Could not add child: {repr(child)} to paragraph in table cell"
		)

		div.unwrap()

		@@ -680,11 +696,13 @@ def preprocess(
		soup = BeautifulSoup(html, "html.parser")

		soup = remove_pandoc_toc(soup)
		soup = remove_badges(soup)
		soup = change_images_to_use_high_quality(soup, src)
		soup = modify_links(soup)
		soup = handle_italic_monospace(soup)
		soup = handle_examples_and_notes(soup)
		soup = handle_abbreviations(soup)
		soup = convert_codeblock_styles_to_etsi(soup)
		# soup = convert_codeblock_styles_to_etsi(soup)
		soup = cleanup_code_tags(soup)
		soup = create_custom_tags_for_bold_italic_underline_styles(soup)
		soup = prepare_table_cell_classes(soup)

md_to_docx_converter/src/utils.py

+1 −0

Original line number	Diff line number	Diff line
		@@ -310,6 +310,7 @@ def get_html_to_docx_command(dest: str, consolidated_html_path, output_doc_path)
		output_doc_path,
		"--lua-filter=html_to_docx.lua",
		f"--reference-doc={REFERENCE_DOC}",
		"--preserve-tabs",
		]

		return command