feat: enhance README and codebase with improved handling of conversion... (b79ec65f) · Commits · CIM - Context Information Management / NGSI-LD API

md_to_docx_converter/README.md

+0 −4

Original line number	Diff line number	Diff line
		@@ -260,10 +260,6 @@ Starting with HTML files contained in the default source location (_GENERATED_FI

		`convert.py --frm html --to docx --folder {folder_name}`

		Specify a different directory containing the HTML files.

		`convert.py --frm html --to docx --folder {folder_name} --src relative/or/absolute/source/path`

		[^1]: These steps may not be necessary with WSL 2, but it is recommended to follow them nevertheless.

		[^2]: This method is subject to change.

md_to_docx_converter/advancedTOCLogic.js

+27 −9

Original line number	Diff line number	Diff line
		@@ -79,24 +79,42 @@ function scrollToClosestAnchorInTOC() {
		}

		function openCorrectPanelDependingOnStoredElement() {
		const originalLiElement = document.getElementById(
		localStorage.getItem("scrollTo")
		).parentElement;
		const scrollToId = localStorage.getItem("scrollTo");
		if (!scrollToId) {
		return;
		}

		const targetElement = document.getElementById(scrollToId);
		if (!targetElement) {
		return;
		}

		const originalLiElement = targetElement.parentElement;
		if (!originalLiElement) {
		return;
		}

		originalLiElement.classList.add("open");
		originalLiElement.classList.add("active");

		// we are working with li elements, so we skip the ul parent
		let parentElement = originalLiElement.parentElement.parentElement; // above li element or the nav
		let parentElement = originalLiElement.parentElement?.parentElement; // above li element or the nav
		if (!parentElement) {
		return;
		}

		// this handles all the other cases
		while (parentElement.id !== "TOC") {
		const parentElementTOCId = parentElement
		.querySelector("a")
		.getAttribute("id");
		while (parentElement && parentElement.id !== "TOC") {
		const aElement = parentElement.querySelector("a");
		if (!aElement) {
		break;
		}

		const parentElementTOCId = aElement.getAttribute("id");
		if (parentElementTOCId) {
		parentElement.classList.add("open");
		// we are working with li elements, so we skip the ul parent
		parentElement = parentElement.parentElement.parentElement;
		parentElement = parentElement.parentElement?.parentElement;
		} else {
		break;
		}

md_to_docx_converter/convert.py

+94 −1

Original line number	Diff line number	Diff line
		@@ -141,9 +141,13 @@ if not args.src and IS_DIFF and IS_GIT_DIFF:
		sys.exit(1)

		SRC_TYPE: str = "html" if str(args.frm).startswith("html") else args.frm
		if args.src:
		if args.src and not (args.frm == "html" and args.to == "docx"):
		SRC = args.src
		else:
		if args.src and args.frm == "html" and args.to == "docx":
		print(
		f"Warning: The {p_label('--src')} argument is ignored when converting from {p_label('html')} to {p_label('docx')}."
		)
		SRC = f"{FILEGEN_DIR}/{FOLDER}/{args.frm}" # Use args.frm to get "html_dirty"

		IS_CLEANUP: bool = str(args.frm) == "html_dirty"
		@@ -371,6 +375,7 @@ def convert(conversion_args=None):
		"""
		Handles conversion from HTML to a single Docx file.
		"""
		html_folder = os.path.join(DEST, "html")
		preprocess_docx(
		SRC, SRC_TYPE, TO_DOCX_EXCLUDED_HTML_FILES, CONSOLIDATED_HTML_PATH
		)
		@@ -699,6 +704,94 @@ if SRC_TYPE == "md" and DEST_TYPE == "html":
		with open(file_path, "w", encoding="utf-8") as f:
		f.write(str(soup))

		if SRC_TYPE == "md" and DEST_TYPE == "docx":
		    # Markdown -> DOCX runs as two passes that re-invoke this script:
		    # Markdown -> HTML first, then HTML -> DOCX on the generated HTML files.
		    print("Converting Markdown files to DOCX...")
		    work_dir = os.path.join(FILEGEN_DIR, f"{FOLDER}")
		    # NOTE(review): when --src is not given, SRC defaults to a path under
		    # FILEGEN_DIR/FOLDER, i.e. inside work_dir, which is wiped just below.
		    # Confirm that md -> docx is always run with an external --src.
		    if os.path.exists(work_dir):
		        try:
		            shutil.rmtree(work_dir)
		        except OSError as e:
		            print(f"Warning: Could not fully remove {work_dir}: {e}")
		            # Best-effort fallback: remove whatever can be removed.
		            shutil.rmtree(work_dir, ignore_errors=True)
		    # exist_ok=True: the best-effort removal above may leave the directory
		    # in place, and a leftover directory must not abort the conversion.
		    os.makedirs(work_dir, exist_ok=True)

		    # Forward the diff mode to the Markdown -> HTML pass.
		    # --diff-git implies --diff, so "diff_git set while diff is False" is not
		    # expected here. If the logic that sets these flags changes, update this
		    # command construction accordingly.
		    if args.diff_git is not None:
		        if args.diff_git is True:
		            diff_flags = ["--diff-git"]
		        else:
		            # subprocess only accepts string-like arguments.
		            diff_flags = ["--diff-git", str(args.diff_git)]
		    elif args.diff:
		        diff_flags = ["--diff"]
		    else:
		        diff_flags = []

		    file_order_flags = ["--file_order", FILE_ORDER_JSON] if FILE_ORDER_JSON else []

		    md_to_html_cmd = [
		        sys.executable,
		        "convert.py",
		        "--frm",
		        "md",
		        "--to",
		        "html",
		        "--folder",
		        f"{FOLDER}",
		        "--src",
		        SRC,
		        *file_order_flags,
		        *diff_flags,
		    ]
		    # No --src here: it is ignored for html -> docx conversions, and the
		    # default source location (FILEGEN_DIR/FOLDER/html) is exactly where the
		    # first pass writes its HTML output.
		    html_to_docx_cmd = [
		        sys.executable,
		        "convert.py",
		        "--frm",
		        "html",
		        "--to",
		        "docx",
		        "--folder",
		        f"{FOLDER}",
		    ]

		    try:
		        print("Generating HTML files from Markdown...")
		        with t.section("Generate HTML files from Markdown for DOCX conversion"):
		            subprocess.run(
		                md_to_html_cmd,
		                check=True,
		                capture_output=True,
		                text=True,
		            )
		        print("Converting HTML files to DOCX...")
		        with t.section("Convert generated HTML files to DOCX"):
		            subprocess.run(
		                html_to_docx_cmd,
		                check=True,
		                capture_output=True,
		                text=True,
		            )
		    except subprocess.CalledProcessError as e:
		        # The child is this same script, which reports its own errors with
		        # print() on stdout, so show both captured streams.
		        details = "\n".join(part for part in (e.stdout, e.stderr) if part)
		        print(p_error(f"Error during conversion:\n{details}"))
		        sys.exit(1)

		print("Post-processing completed successfully.")
		if TIMER_ENABLED:
		print("Timing report:")

md_to_docx_converter/customCSS.css

+15 −4

Original line number	Diff line number	Diff line
		@@ -57,15 +57,26 @@ body {
		position: relative;
		overflow: visible;
		display: block;
		background-color: red;
		border-radius: 10px;
		padding-left: 10px;
		padding-right: 10px;
		margin-right: 10px;
		width: 100%;
		}

		nav>ul>li>a.diff-changes-enable::after {
		content: " ●";
		color: green;
		#TOC a.diff-changes-enable::after {
		/* content: " ●"; */
		content: attr(data-diff-count);
		background-color: green;
		border-radius: 10px;
		padding-left: 5px;
		padding-right: 5px;
		margin-left: 5px;
		color: white;
		}

		nav>ul>li>a.diff-changes-enable:hover::before {
		#TOC a.diff-changes-enable:hover::before {
		content: "Total Changes: " attr(data-diff-count);
		position: absolute;
		left: 2%;

md_to_docx_converter/src/to_docx/include_frontpage.py

+3 −2

Original line number	Diff line number	Diff line
		@@ -6,18 +6,19 @@ from docx.oxml.ns import nsdecls
		import os
		from src.constants import TEXT_TO_REPLACE_IN_FRONTPAGE
		from bs4 import BeautifulSoup, Tag, NavigableString
		from src.to_docx.preprocessing import remove_diff_marks

		def scrap_replacements_from_html(front_page_html_file: str) -> dict:
		replacements = {}
		with open(front_page_html_file, 'r', encoding='utf-8') as f:
		soup = BeautifulSoup(f, 'html.parser')
		soup = remove_diff_marks(soup)
		for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
		element = soup.find(attrs={"data-replace": key})
		if element:
		if key == "DATE":
		# Special handling for DATE to format it as needed
		date_text = element.get_text(strip=True)
		print(f"Found date: {date_text}")
		split_date = date_text.split('-')
		YEAR = split_date[0] if len(split_date) > 0 else ''
		MONTH = split_date[1] if len(split_date) > 1 else ''