chore: update descriptions (d2f9e597) · Commits · CIM - Context Information Management / NGSI-LD API

md_to_docx_converter/src/to_html/postprocessing.py

+23 −7

Original line number	Diff line number	Diff line
		@@ -551,6 +551,9 @@ def extract_images_from_html(soup: BeautifulSoup) -> dict:
		if img:
		src = img.get("src", "").replace("media/", "")
		images_mapping[id] = src
		figure_caption = fig.find("figcaption")
		if figure_caption: # TODO: check if we might want to keep the caption instead of removing it
		figure_caption.decompose()

		return images_mapping, soup

		@@ -624,14 +627,27 @@ def fix_capitalization_in_links(soup: BeautifulSoup) -> BeautifulSoup:
		def postprocess(html_dir: str):
		"""
		### Description
		Iterates through the generated HTML files, making the following changes:
		- rename files with the same logic used for md files
		- Removes excess spacing between some lines in examples by removing <br /> tags added by Pandoc at the end of some lines
		- Ensures correct formatting of references
		- Removes code blocks that contain only img tags
		Iterates through the generated HTML files, applying various transformations to improve
		formatting, fix links, and ensure consistent styling. This includes:

		- Renaming files using the same logic applied to MD files
		- Formatting references properly in reference sections
		- Adding links to references mentioned in the text
		- Fixing table of contents links
		- Adjusting bracket placement around links
		- Removing excess spacing in code examples
		- Unwrapping code tags that should render as plaintext
		- Formatting abbreviation sections properly
		- Processing examples, notes and tips
		- Fixing image handling in code blocks
		- Adding and fixing IDs for figures and tables
		- Fixing custom tags for relative figure/table references
		- Normalizing dash characters in links and IDs
		- Ensuring proper capitalization in links
		- Creating cross-references for images

		### Arguments
		- `html_dir`:
		- `html_dir`: Directory containing the HTML files to be processed
		"""
		filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(html_dir)
		images_mapping = {}

md_to_docx_converter/src/to_html/preprocessing.py

+16 −4

Original line number	Diff line number	Diff line
		@@ -573,14 +573,26 @@ def preprocess(
		):
		"""
		### Description
		Preprocesses Markdown files
		1. Converts custom code blocks back to Pandoc's expected format
		2. Consolidates Markdown input into a single temporary file
		Preprocesses Markdown files to prepare them for conversion to HTML by applying various transformations:

		1. Performs format validation checks on divs, notes, and examples
		2. Removes any existing prettier-ignore statements
		3. Adds appropriate divs around images, tables, and their captions
		4. Auto-numbers clauses, annexes, examples, notes, figures, and tables
		5. Adds IDs to references for proper linking
		6. Handles special characters like '<' and '>' to ensure proper rendering
		7. Adds IDs to headings for navigation and cross-referencing
		8. Ensures proper formatting of notes and examples with empty lines
		9. Consolidates all preprocessed files into a single Markdown file

		### Arguments
		- `src`: The absolute or relative path of the directory containing the source Markdown files
		- `src_type`: The source file type, `md`
		- `consolidated_md_path`: The path at which the consolidated Markdown file will be created
		- `file_order_json`: Path to JSON file specifying custom order of clauses and annexes

		### Returns
		- A dictionary mapping filenames to their numeric/alphabetic positions in the document
		"""
		filename_numbers_mapping = {}
		clauses = DEFAULT_CLAUSES

md_to_docx_converter/src/to_md/postprocessing.py

+15 −9

Original line number	Diff line number	Diff line
		@@ -224,16 +224,22 @@ def remove_leading_tab(filename: str) -> str:
		def postprocess(markdown_dir: str, is_cleanup: bool):
		"""
		### Description
		Iterates through the generated Markdown files, making the following changes:
		1. Changes links from HTML to Markdown
		2. Converts Example and Note blocks into a more human-manageable format
		3. Converts sequences of monospaced text to a human-readable format
		4. Executes some final cleaning
		5. "Unescapes" some of the characters that Pandoc escaped (that is, prepended with a backslash)
		Iterates through the generated Markdown files, applying various transformations to improve
		formatting and readability:

		1. Removes table of contents from front page
		2. Removes HTML comments from all files
		3. Converts links from HTML to Markdown format
		4. Unescapes special characters that Pandoc unnecessarily escaped:
		- Angle brackets, curly braces, parentheses, quotes
		- Square brackets with special handling for tables and spans
		5. Adds prettier-ignore comments before and after tables to preserve formatting
		6. Renames HTML classes to use hyphens instead of underscores
		7. Applies special handling for reference pages

		### Args
		- `markdown_dir`: string representation of the destination Markdown directory
		- `is_cleanup`: `True` if additional formatting should be performed on the generated Markdown, `False` otherwise. This will be `True` when converting from "dirty" HTML.
		- `markdown_dir`: String path to directory containing the generated Markdown files
		- `is_cleanup`: `True` if additional formatting should be performed (when converting from "dirty" HTML), `False` otherwise
		"""

		for filename in os.listdir(markdown_dir):

md_to_docx_converter/src/to_md/preprocessing.py

+22 −10

Original line number	Diff line number	Diff line
		@@ -418,17 +418,29 @@ def preprocess(
		"""
		### Description
		Preprocessing mandatory for conversion from HTML to Markdown. Operating on a single source HTML file, performs the following tasks:
		1. Removes the table of contents Pandoc adds to the HTML.
		2. Performs cleanup tasks if the `--cleanup` flag is passed.
		3. Ensures abbreviations' meanings have the correct indentation and alignment.
		4. Formats notes and examples to be more manageable for humans to modify.
		5. Removes Pandoc-generated metadata from headers

		1. Removes the table of contents, CSS links, headers/footers, buttons, and other Pandoc-generated elements
		2. Removes flex and flex-item elements that aren't needed in Markdown
		3. Formats abbreviations to have a more readable structure in Markdown
		4. Restructures notes and examples:
		- Converts inner divs to paragraphs
		- Moves related code blocks into the body of notes/examples (when cleanup flag is enabled)
		5. Simplifies headings by:
		- Removing data-number attributes
		- Generating cleaner IDs based on heading text
		- Creating a mapping between old and new IDs
		6. Updates links in the document to use expected filenames (when cleanup flag is enabled)
		7. Applies additional cleaning operations for "dirty" HTML sources

		### Arguments
		- `src_path`: the absolute or relative path to the source HTML file
		- `dest_path`: the absolute or relative path where the processed HTML file, which Pandoc will then convert to Markdown, will be saved.
		- `is_cleanup`: `True` if additional formatting should be performed on the generated Markdown, `False` otherwise. This will be `True` when converting from "dirty" HTML.
		- `css_src`: A list of the absolute or relative paths to all source CSS files. Necessary to pass down to `preprocess_cleaning`.
		- `src_path`: The absolute or relative path to the source HTML file
		- `dest_path`: The absolute or relative path where the processed HTML file will be saved
		- `is_cleanup`: `True` if additional formatting should be performed (when converting from "dirty" HTML), `False` otherwise
		- `css_src`: A list of absolute or relative paths to all source CSS files (needed for cleanup processing)
		- `filenames_mapping`: Dictionary mapping old filenames to expected filenames

		### Returns
		- A dictionary mapping old heading IDs to new heading IDs
		"""

		with open(src_path, "r", encoding="utf-8") as html: