import os
from typing import Optional

from bs4 import BeautifulSoup, NavigableString, Tag
from docx import Document
from docx.enum.text import WD_BREAK
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
from docxcompose.composer import Composer

from src.constants import TEXT_TO_REPLACE_IN_FRONTPAGE


def scrap_replacements_from_html(front_page_html_file: str) -> dict:
    """Collect placeholder replacement values from the rendered front page.

    Scans the HTML file for elements carrying a ``data-replace`` attribute
    matching one of ``TEXT_TO_REPLACE_IN_FRONTPAGE`` and maps each
    placeholder to the element's stripped text.

    The DATE element receives special handling: its text is split on "-"
    into the ``yyyy`` and ``-mm`` tokens used by the Word template
    (assumes an ISO-like ``YYYY-MM...`` date — TODO confirm against the
    generated HTML).

    ### Arguments
    - `front_page_html_file`: path to the post-processed front-page HTML

    ### Returns
    A dict mapping placeholder token -> replacement text.
    """
    replacements: dict = {}
    with open(front_page_html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
        element = soup.find(attrs={"data-replace": key})
        if element is None:
            continue
        if key == "DATE":
            # The template stores the date as two tokens, "yyyy" and "-mm".
            date_text = element.get_text(strip=True)
            print(f"Found date: {date_text}")
            parts = date_text.split("-")
            replacements["yyyy"] = parts[0] if parts else ""
            month = parts[1] if len(parts) > 1 else ""
            replacements["-mm"] = f"-{month}"
        else:
            replacements[key] = element.get_text(strip=True)
    return replacements


def include_frontpage(output_path, template_path: str = "front_page_template.docx"):
    """Prepend the filled-in front page to the generated document.

    Reads replacement values from ``<output_path>/../html/front-page.html``,
    fills the front-page template with them, and merges the result in front
    of ``<output_path>/document.docx`` (overwriting it in place).

    ### Arguments
    - `output_path`: folder containing ``document.docx``
    - `template_path`: front-page template; the default is resolved
      relative to the current working directory — NOTE(review): confirm
      the converter is always launched from the repo root.
    """

    def merge_docx(merged_path: str, *input_paths: str):
        """Concatenate ``input_paths`` into ``merged_path``, inserting a
        page break between consecutive parts."""
        master = Document(input_paths[0])
        composer = Composer(master)
        for path in input_paths[1:]:
            # Page break so the next part starts on a fresh page.
            brk = master.add_paragraph().add_run()
            brk.add_break(WD_BREAK.PAGE)
            composer.append(Document(path))
        composer.save(merged_path)

    def replace_placeholders(
        file_path: str, replacements: dict, output_path: Optional[str] = None
    ):
        """Replace placeholder tokens in ``file_path`` and save the result
        to ``output_path`` (or back to ``file_path``)."""
        doc = Document(file_path)
        for paragraph in doc.paragraphs:
            for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
                if key not in paragraph.text:
                    continue
                value = replacements.get(key, "")
                replaced = False
                for run in paragraph.runs:
                    if key in run.text:
                        run.text = run.text.replace(key, value)
                        replaced = True
                if not replaced and paragraph.runs:
                    # Word frequently splits a placeholder across several
                    # runs; collapse the paragraph into its first run
                    # (keeping that run's formatting) so the replacement
                    # still happens instead of being silently skipped.
                    merged = "".join(run.text for run in paragraph.runs)
                    paragraph.runs[0].text = merged.replace(key, value)
                    for run in paragraph.runs[1:]:
                        run.text = ""
        doc.save(output_path or file_path)

    def set_update_fields_on_open(doc_path: str):
        """Force Word to update fields (TOC, references) on document open."""
        doc = Document(doc_path)
        settings = doc.settings.element
        update_fields = parse_xml(f'<w:updateFields {nsdecls("w")} w:val="true"/>')
        settings.append(update_fields)
        doc.save(doc_path)

    front_page_html = os.path.join(
        os.path.dirname(output_path), "html", "front-page.html"
    )
    replacements = scrap_replacements_from_html(front_page_html)
    document = os.path.join(output_path, "document.docx")
    front_page_filled = os.path.join(output_path, "front_page_filled.docx")
    try:
        replace_placeholders(template_path, replacements, front_page_filled)
        merge_docx(document, front_page_filled, document)
        # set_update_fields_on_open(document)
    finally:
        # Always clean up the intermediate filled template, even if the
        # merge fails part-way through.
        if os.path.exists(front_page_filled):
            os.remove(front_page_filled)
def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
    """Tag front-page elements with ``data-replace`` attributes.

    The to_docx step later scrapes these attributes (see
    ``TEXT_TO_REPLACE_IN_FRONTPAGE``) to fill the Word front-page
    template. Missing sections are reported as warnings rather than
    aborting, so a partially filled front page still converts.

    NOTE(review): the positional ``children[...]`` lookups assume a fixed
    generated-HTML layout; whitespace-only text nodes between tags would
    shift the indices — confirm against the actual front-page output.
    """
    ZA = soup.find_all("div", class_="ZA")
    try:
        p = ZA[0].find("p")
        children = list(p.children)
        try:
            # Work-item name is the first child of the header paragraph.
            children[0]["data-replace"] = "WORKITEMNAME"
        except IndexError:
            print(p_warning("front-page is missing WORKITEMNAME information."))
        try:
            # Version: wrap in a new span carrying the marker attribute.
            version_span = soup.new_tag("span")
            children[1].wrap(version_span)
            version_span["data-replace"] = "VERSION_NO"
        except IndexError:
            print(p_warning("front-page is missing VERSION_NO information."))
        try:
            # Date: rebuild as "(" <span data-replace="DATE">text</span> ")".
            date = children[2]
            text = date.get_text(strip=True).replace("(", "").replace(")", "")
            # Drop the original content first so the date text is not
            # duplicated next to the newly appended span.
            date.clear()
            date_span = soup.new_tag("span")
            date_span["data-replace"] = "DATE"
            date_span.string = text
            date.append(NavigableString("("))
            date.append(date_span)
            date.append(NavigableString(")"))
        except IndexError:
            print(p_warning("front-page is missing DATE information."))
    except IndexError:
        print(
            p_warning(
                "front-page is missing the section with WORKITEMNAME, VERSION_NO, and DATE information."
            )
        )

    # The ZT divs appear in a fixed order: title, part, subpart, release.
    ZT = soup.find_all("div", class_="ZT")
    for index, (placeholder, label) in enumerate(
        [
            ("{{TITLE}}", "TITLE"),
            ("{{PART}}", "PART"),
            ("{{SUBPART}}", "SUBPART"),
            ("{{RELEASE}}", "RELEASE"),
        ]
    ):
        try:
            ZT[index]["data-replace"] = placeholder
        except IndexError:
            print(p_warning(f"front-page is missing {label} information."))

    ZB = soup.find_all("div", class_="ZB")
    try:
        ZB[0]["data-replace"] = "{{TYPEDOCUMENT}}"
    except IndexError:
        print(p_warning("front-page is missing TYPEDOCUMENT information."))

    TAC = soup.find_all("div", class_="TAC")
    try:
        children = list(TAC[0].children)
        try:
            # children[3]: the work-item element — presumably odd indices
            # skip separator nodes; verify against the generated HTML.
            children[3]["data-replace"] = "{{WORKITEM}}"
        except IndexError:
            print(p_warning("front-page is missing WORKITEM information."))
        try:
            children[7]["data-replace"] = "{{KEYWORDS}}"
        except IndexError:
            print(p_warning("front-page is missing KEYWORDS information."))
    except IndexError:
        print(
            p_warning(
                "front-page is missing the section with WORKITEM and KEYWORDS information."
            )
        )
    return soup
md_to_docx_converter/front_page_template.docx 0 → 100644 +120 KiB File added. No diff preview is available for this file type. View file
# Match lines that start with : or :: and are not followed by letters
BAD_DIV_DELINEATOR_REGEX = r"^\s*(?::{1,2})(?![a-zA-Z])"
# endregion

# Placeholder tokens searched for in the front-page Word template.
# Plain tokens are matched verbatim; the "{{...}}" tokens mirror the
# data-replace attributes injected by the HTML post-processing step.
# "yyyy" / "-mm" are the year and month fragments derived from DATE.
# Order is preserved: replacement code iterates this list as-is.
TEXT_TO_REPLACE_IN_FRONTPAGE = [
    "WORKITEMNAME",
    "VERSION_NO",
    "DATE",
    "{{TYPEDOCUMENT}}",
    "{{TITLE}}",
    "{{PART}}",
    "{{SUBPART}}",
    "{{RELEASE}}",
    "{{WORKITEM}}",
    "{{KEYWORDS}}",
    "yyyy",
    "-mm",
]
import os
from typing import Optional

from bs4 import BeautifulSoup, NavigableString, Tag
from docx import Document
from docx.enum.text import WD_BREAK
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
from docxcompose.composer import Composer

from src.constants import TEXT_TO_REPLACE_IN_FRONTPAGE


def scrap_replacements_from_html(front_page_html_file: str) -> dict:
    """Collect placeholder replacement values from the rendered front page.

    Scans the HTML file for elements carrying a ``data-replace`` attribute
    matching one of ``TEXT_TO_REPLACE_IN_FRONTPAGE`` and maps each
    placeholder to the element's stripped text.

    The DATE element receives special handling: its text is split on "-"
    into the ``yyyy`` and ``-mm`` tokens used by the Word template
    (assumes an ISO-like ``YYYY-MM...`` date — TODO confirm against the
    generated HTML).

    ### Arguments
    - `front_page_html_file`: path to the post-processed front-page HTML

    ### Returns
    A dict mapping placeholder token -> replacement text.
    """
    replacements: dict = {}
    with open(front_page_html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
        element = soup.find(attrs={"data-replace": key})
        if element is None:
            continue
        if key == "DATE":
            # The template stores the date as two tokens, "yyyy" and "-mm".
            date_text = element.get_text(strip=True)
            print(f"Found date: {date_text}")
            parts = date_text.split("-")
            replacements["yyyy"] = parts[0] if parts else ""
            month = parts[1] if len(parts) > 1 else ""
            replacements["-mm"] = f"-{month}"
        else:
            replacements[key] = element.get_text(strip=True)
    return replacements


def include_frontpage(output_path, template_path: str = "front_page_template.docx"):
    """Prepend the filled-in front page to the generated document.

    Reads replacement values from ``<output_path>/../html/front-page.html``,
    fills the front-page template with them, and merges the result in front
    of ``<output_path>/document.docx`` (overwriting it in place).

    ### Arguments
    - `output_path`: folder containing ``document.docx``
    - `template_path`: front-page template; the default is resolved
      relative to the current working directory — NOTE(review): confirm
      the converter is always launched from the repo root.
    """

    def merge_docx(merged_path: str, *input_paths: str):
        """Concatenate ``input_paths`` into ``merged_path``, inserting a
        page break between consecutive parts."""
        master = Document(input_paths[0])
        composer = Composer(master)
        for path in input_paths[1:]:
            # Page break so the next part starts on a fresh page.
            brk = master.add_paragraph().add_run()
            brk.add_break(WD_BREAK.PAGE)
            composer.append(Document(path))
        composer.save(merged_path)

    def replace_placeholders(
        file_path: str, replacements: dict, output_path: Optional[str] = None
    ):
        """Replace placeholder tokens in ``file_path`` and save the result
        to ``output_path`` (or back to ``file_path``)."""
        doc = Document(file_path)
        for paragraph in doc.paragraphs:
            for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
                if key not in paragraph.text:
                    continue
                value = replacements.get(key, "")
                replaced = False
                for run in paragraph.runs:
                    if key in run.text:
                        run.text = run.text.replace(key, value)
                        replaced = True
                if not replaced and paragraph.runs:
                    # Word frequently splits a placeholder across several
                    # runs; collapse the paragraph into its first run
                    # (keeping that run's formatting) so the replacement
                    # still happens instead of being silently skipped.
                    merged = "".join(run.text for run in paragraph.runs)
                    paragraph.runs[0].text = merged.replace(key, value)
                    for run in paragraph.runs[1:]:
                        run.text = ""
        doc.save(output_path or file_path)

    def set_update_fields_on_open(doc_path: str):
        """Force Word to update fields (TOC, references) on document open."""
        doc = Document(doc_path)
        settings = doc.settings.element
        update_fields = parse_xml(f'<w:updateFields {nsdecls("w")} w:val="true"/>')
        settings.append(update_fields)
        doc.save(doc_path)

    front_page_html = os.path.join(
        os.path.dirname(output_path), "html", "front-page.html"
    )
    replacements = scrap_replacements_from_html(front_page_html)
    document = os.path.join(output_path, "document.docx")
    front_page_filled = os.path.join(output_path, "front_page_filled.docx")
    try:
        replace_placeholders(template_path, replacements, front_page_filled)
        merge_docx(document, front_page_filled, document)
        # set_update_fields_on_open(document)
    finally:
        # Always clean up the intermediate filled template, even if the
        # merge fails part-way through.
        if os.path.exists(front_page_filled):
            os.remove(front_page_filled)
md_to_docx_converter/src/to_docx/postprocessing.py +5 −2 Original line number Diff line number Diff line import re import re, os from docx import Document from docx.shared import Pt from docx.document import Document as Doc Loading Loading @@ -27,6 +27,7 @@ from src.constants import ( UNDERLINE_TAGS, WITH_SPACE, ) from src.to_docx.include_frontpage import include_frontpage from src.utils import get_bold_italic_underline_css_classes Loading Loading @@ -577,7 +578,6 @@ def postprocess(docx_dir: str): ### Arguments - `docx_dir`: The absolute or relative path at which the generated Docx was saved """ doc: Doc = Document(docx_dir) doc = format_references(doc) Loading @@ -589,3 +589,6 @@ def postprocess(docx_dir: str): doc = set_keep_with_next_false(doc) doc.save(docx_dir) containing_folder = os.path.dirname(docx_dir) include_frontpage(containing_folder)
def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
    """Tag front-page elements with ``data-replace`` attributes.

    The to_docx step later scrapes these attributes (see
    ``TEXT_TO_REPLACE_IN_FRONTPAGE``) to fill the Word front-page
    template. Missing sections are reported as warnings rather than
    aborting, so a partially filled front page still converts.

    NOTE(review): the positional ``children[...]`` lookups assume a fixed
    generated-HTML layout; whitespace-only text nodes between tags would
    shift the indices — confirm against the actual front-page output.
    """
    ZA = soup.find_all("div", class_="ZA")
    try:
        p = ZA[0].find("p")
        children = list(p.children)
        try:
            # Work-item name is the first child of the header paragraph.
            children[0]["data-replace"] = "WORKITEMNAME"
        except IndexError:
            print(p_warning("front-page is missing WORKITEMNAME information."))
        try:
            # Version: wrap in a new span carrying the marker attribute.
            version_span = soup.new_tag("span")
            children[1].wrap(version_span)
            version_span["data-replace"] = "VERSION_NO"
        except IndexError:
            print(p_warning("front-page is missing VERSION_NO information."))
        try:
            # Date: rebuild as "(" <span data-replace="DATE">text</span> ")".
            date = children[2]
            text = date.get_text(strip=True).replace("(", "").replace(")", "")
            # Drop the original content first so the date text is not
            # duplicated next to the newly appended span.
            date.clear()
            date_span = soup.new_tag("span")
            date_span["data-replace"] = "DATE"
            date_span.string = text
            date.append(NavigableString("("))
            date.append(date_span)
            date.append(NavigableString(")"))
        except IndexError:
            print(p_warning("front-page is missing DATE information."))
    except IndexError:
        print(
            p_warning(
                "front-page is missing the section with WORKITEMNAME, VERSION_NO, and DATE information."
            )
        )

    # The ZT divs appear in a fixed order: title, part, subpart, release.
    ZT = soup.find_all("div", class_="ZT")
    for index, (placeholder, label) in enumerate(
        [
            ("{{TITLE}}", "TITLE"),
            ("{{PART}}", "PART"),
            ("{{SUBPART}}", "SUBPART"),
            ("{{RELEASE}}", "RELEASE"),
        ]
    ):
        try:
            ZT[index]["data-replace"] = placeholder
        except IndexError:
            print(p_warning(f"front-page is missing {label} information."))

    ZB = soup.find_all("div", class_="ZB")
    try:
        ZB[0]["data-replace"] = "{{TYPEDOCUMENT}}"
    except IndexError:
        print(p_warning("front-page is missing TYPEDOCUMENT information."))

    TAC = soup.find_all("div", class_="TAC")
    try:
        children = list(TAC[0].children)
        try:
            # children[3]: the work-item element — presumably odd indices
            # skip separator nodes; verify against the generated HTML.
            children[3]["data-replace"] = "{{WORKITEM}}"
        except IndexError:
            print(p_warning("front-page is missing WORKITEM information."))
        try:
            children[7]["data-replace"] = "{{KEYWORDS}}"
        except IndexError:
            print(p_warning("front-page is missing KEYWORDS information."))
    except IndexError:
        print(
            p_warning(
                "front-page is missing the section with WORKITEM and KEYWORDS information."
            )
        )
    return soup