import os
from typing import Optional

from bs4 import BeautifulSoup, NavigableString, Tag
from docx import Document
from docx.enum.text import WD_BREAK
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
from docxcompose.composer import Composer

from src.constants import TEXT_TO_REPLACE_IN_FRONTPAGE


def scrap_replacements_from_html(front_page_html_file: str) -> dict:
    """Collect placeholder replacement values from the rendered front page.

    Scans the HTML file for elements carrying a ``data-replace`` attribute
    matching one of ``TEXT_TO_REPLACE_IN_FRONTPAGE`` and maps each
    placeholder to the element's stripped text.

    The DATE element receives special handling: its text is split on "-"
    into the ``yyyy`` and ``-mm`` tokens used by the Word template
    (assumes an ISO-like ``YYYY-MM...`` date — TODO confirm against the
    generated HTML).

    ### Arguments
    - `front_page_html_file`: path to the post-processed front-page HTML

    ### Returns
    A dict mapping placeholder token -> replacement text.
    """
    replacements: dict = {}
    with open(front_page_html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
        element = soup.find(attrs={"data-replace": key})
        if element is None:
            continue
        if key == "DATE":
            # The template stores the date as two tokens, "yyyy" and "-mm".
            date_text = element.get_text(strip=True)
            print(f"Found date: {date_text}")
            parts = date_text.split("-")
            replacements["yyyy"] = parts[0] if parts else ""
            month = parts[1] if len(parts) > 1 else ""
            replacements["-mm"] = f"-{month}"
        else:
            replacements[key] = element.get_text(strip=True)
    return replacements


def include_frontpage(output_path, template_path: str = "front_page_template.docx"):
    """Prepend the filled-in front page to the generated document.

    Reads replacement values from ``<output_path>/../html/front-page.html``,
    fills the front-page template with them, and merges the result in front
    of ``<output_path>/document.docx`` (overwriting it in place).

    ### Arguments
    - `output_path`: folder containing ``document.docx``
    - `template_path`: front-page template; the default is resolved
      relative to the current working directory — NOTE(review): confirm
      the converter is always launched from the repo root.
    """

    def merge_docx(merged_path: str, *input_paths: str):
        """Concatenate ``input_paths`` into ``merged_path``, inserting a
        page break between consecutive parts."""
        master = Document(input_paths[0])
        composer = Composer(master)
        for path in input_paths[1:]:
            # Page break so the next part starts on a fresh page.
            brk = master.add_paragraph().add_run()
            brk.add_break(WD_BREAK.PAGE)
            composer.append(Document(path))
        composer.save(merged_path)

    def replace_placeholders(
        file_path: str, replacements: dict, output_path: Optional[str] = None
    ):
        """Replace placeholder tokens in ``file_path`` and save the result
        to ``output_path`` (or back to ``file_path``)."""
        doc = Document(file_path)
        for paragraph in doc.paragraphs:
            for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
                if key not in paragraph.text:
                    continue
                value = replacements.get(key, "")
                replaced = False
                for run in paragraph.runs:
                    if key in run.text:
                        run.text = run.text.replace(key, value)
                        replaced = True
                if not replaced and paragraph.runs:
                    # Word frequently splits a placeholder across several
                    # runs; collapse the paragraph into its first run
                    # (keeping that run's formatting) so the replacement
                    # still happens instead of being silently skipped.
                    merged = "".join(run.text for run in paragraph.runs)
                    paragraph.runs[0].text = merged.replace(key, value)
                    for run in paragraph.runs[1:]:
                        run.text = ""
        doc.save(output_path or file_path)

    def set_update_fields_on_open(doc_path: str):
        """Force Word to update fields (TOC, references) on document open."""
        doc = Document(doc_path)
        settings = doc.settings.element
        update_fields = parse_xml(f'<w:updateFields {nsdecls("w")} w:val="true"/>')
        settings.append(update_fields)
        doc.save(doc_path)

    front_page_html = os.path.join(
        os.path.dirname(output_path), "html", "front-page.html"
    )
    replacements = scrap_replacements_from_html(front_page_html)
    document = os.path.join(output_path, "document.docx")
    front_page_filled = os.path.join(output_path, "front_page_filled.docx")
    try:
        replace_placeholders(template_path, replacements, front_page_filled)
        merge_docx(document, front_page_filled, document)
        # set_update_fields_on_open(document)
    finally:
        # Always clean up the intermediate filled template, even if the
        # merge fails part-way through.
        if os.path.exists(front_page_filled):
            os.remove(front_page_filled)
def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
    """Tag front-page elements with ``data-replace`` attributes.

    The to_docx step later scrapes these attributes (see
    ``TEXT_TO_REPLACE_IN_FRONTPAGE``) to fill the Word front-page
    template. Missing sections are reported as warnings rather than
    aborting, so a partially filled front page still converts.

    NOTE(review): the positional ``children[...]`` lookups assume a fixed
    generated-HTML layout; whitespace-only text nodes between tags would
    shift the indices — confirm against the actual front-page output.
    """
    ZA = soup.find_all("div", class_="ZA")
    try:
        p = ZA[0].find("p")
        children = list(p.children)
        try:
            # Work-item name is the first child of the header paragraph.
            children[0]["data-replace"] = "WORKITEMNAME"
        except IndexError:
            print(p_warning("front-page is missing WORKITEMNAME information."))
        try:
            # Version: wrap in a new span carrying the marker attribute.
            version_span = soup.new_tag("span")
            children[1].wrap(version_span)
            version_span["data-replace"] = "VERSION_NO"
        except IndexError:
            print(p_warning("front-page is missing VERSION_NO information."))
        try:
            # Date: rebuild as "(" <span data-replace="DATE">text</span> ")".
            date = children[2]
            text = date.get_text(strip=True).replace("(", "").replace(")", "")
            # Drop the original content first so the date text is not
            # duplicated next to the newly appended span.
            date.clear()
            date_span = soup.new_tag("span")
            date_span["data-replace"] = "DATE"
            date_span.string = text
            date.append(NavigableString("("))
            date.append(date_span)
            date.append(NavigableString(")"))
        except IndexError:
            print(p_warning("front-page is missing DATE information."))
    except IndexError:
        print(
            p_warning(
                "front-page is missing the section with WORKITEMNAME, VERSION_NO, and DATE information."
            )
        )

    # The ZT divs appear in a fixed order: title, part, subpart, release.
    ZT = soup.find_all("div", class_="ZT")
    for index, (placeholder, label) in enumerate(
        [
            ("{{TITLE}}", "TITLE"),
            ("{{PART}}", "PART"),
            ("{{SUBPART}}", "SUBPART"),
            ("{{RELEASE}}", "RELEASE"),
        ]
    ):
        try:
            ZT[index]["data-replace"] = placeholder
        except IndexError:
            print(p_warning(f"front-page is missing {label} information."))

    ZB = soup.find_all("div", class_="ZB")
    try:
        ZB[0]["data-replace"] = "{{TYPEDOCUMENT}}"
    except IndexError:
        print(p_warning("front-page is missing TYPEDOCUMENT information."))

    TAC = soup.find_all("div", class_="TAC")
    try:
        children = list(TAC[0].children)
        try:
            # children[3]: the work-item element — presumably odd indices
            # skip separator nodes; verify against the generated HTML.
            children[3]["data-replace"] = "{{WORKITEM}}"
        except IndexError:
            print(p_warning("front-page is missing WORKITEM information."))
        try:
            children[7]["data-replace"] = "{{KEYWORDS}}"
        except IndexError:
            print(p_warning("front-page is missing KEYWORDS information."))
    except IndexError:
        print(
            p_warning(
                "front-page is missing the section with WORKITEM and KEYWORDS information."
            )
        )
    return soup
md_to_docx_converter/front_page_template.docx 0 → 100644 +120 KiB File added. No diff preview is available for this file type. View file
# Match lines that start with : or :: and are not followed by letters
BAD_DIV_DELINEATOR_REGEX = r"^\s*(?::{1,2})(?![a-zA-Z])"
# endregion

# Placeholder tokens searched for in the front-page Word template.
# Plain tokens are matched verbatim; the "{{...}}" tokens mirror the
# data-replace attributes injected by the HTML post-processing step.
# "yyyy" / "-mm" are the year and month fragments derived from DATE.
# Order is preserved: replacement code iterates this list as-is.
TEXT_TO_REPLACE_IN_FRONTPAGE = [
    "WORKITEMNAME",
    "VERSION_NO",
    "DATE",
    "{{TYPEDOCUMENT}}",
    "{{TITLE}}",
    "{{PART}}",
    "{{SUBPART}}",
    "{{RELEASE}}",
    "{{WORKITEM}}",
    "{{KEYWORDS}}",
    "yyyy",
    "-mm",
]
import os
from typing import Optional

from bs4 import BeautifulSoup, NavigableString, Tag
from docx import Document
from docx.enum.text import WD_BREAK
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
from docxcompose.composer import Composer

from src.constants import TEXT_TO_REPLACE_IN_FRONTPAGE


def scrap_replacements_from_html(front_page_html_file: str) -> dict:
    """Collect placeholder replacement values from the rendered front page.

    Scans the HTML file for elements carrying a ``data-replace`` attribute
    matching one of ``TEXT_TO_REPLACE_IN_FRONTPAGE`` and maps each
    placeholder to the element's stripped text.

    The DATE element receives special handling: its text is split on "-"
    into the ``yyyy`` and ``-mm`` tokens used by the Word template
    (assumes an ISO-like ``YYYY-MM...`` date — TODO confirm against the
    generated HTML).

    ### Arguments
    - `front_page_html_file`: path to the post-processed front-page HTML

    ### Returns
    A dict mapping placeholder token -> replacement text.
    """
    replacements: dict = {}
    with open(front_page_html_file, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")
    for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
        element = soup.find(attrs={"data-replace": key})
        if element is None:
            continue
        if key == "DATE":
            # The template stores the date as two tokens, "yyyy" and "-mm".
            date_text = element.get_text(strip=True)
            print(f"Found date: {date_text}")
            parts = date_text.split("-")
            replacements["yyyy"] = parts[0] if parts else ""
            month = parts[1] if len(parts) > 1 else ""
            replacements["-mm"] = f"-{month}"
        else:
            replacements[key] = element.get_text(strip=True)
    return replacements


def include_frontpage(output_path, template_path: str = "front_page_template.docx"):
    """Prepend the filled-in front page to the generated document.

    Reads replacement values from ``<output_path>/../html/front-page.html``,
    fills the front-page template with them, and merges the result in front
    of ``<output_path>/document.docx`` (overwriting it in place).

    ### Arguments
    - `output_path`: folder containing ``document.docx``
    - `template_path`: front-page template; the default is resolved
      relative to the current working directory — NOTE(review): confirm
      the converter is always launched from the repo root.
    """

    def merge_docx(merged_path: str, *input_paths: str):
        """Concatenate ``input_paths`` into ``merged_path``, inserting a
        page break between consecutive parts."""
        master = Document(input_paths[0])
        composer = Composer(master)
        for path in input_paths[1:]:
            # Page break so the next part starts on a fresh page.
            brk = master.add_paragraph().add_run()
            brk.add_break(WD_BREAK.PAGE)
            composer.append(Document(path))
        composer.save(merged_path)

    def replace_placeholders(
        file_path: str, replacements: dict, output_path: Optional[str] = None
    ):
        """Replace placeholder tokens in ``file_path`` and save the result
        to ``output_path`` (or back to ``file_path``)."""
        doc = Document(file_path)
        for paragraph in doc.paragraphs:
            for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
                if key not in paragraph.text:
                    continue
                value = replacements.get(key, "")
                replaced = False
                for run in paragraph.runs:
                    if key in run.text:
                        run.text = run.text.replace(key, value)
                        replaced = True
                if not replaced and paragraph.runs:
                    # Word frequently splits a placeholder across several
                    # runs; collapse the paragraph into its first run
                    # (keeping that run's formatting) so the replacement
                    # still happens instead of being silently skipped.
                    merged = "".join(run.text for run in paragraph.runs)
                    paragraph.runs[0].text = merged.replace(key, value)
                    for run in paragraph.runs[1:]:
                        run.text = ""
        doc.save(output_path or file_path)

    def set_update_fields_on_open(doc_path: str):
        """Force Word to update fields (TOC, references) on document open."""
        doc = Document(doc_path)
        settings = doc.settings.element
        update_fields = parse_xml(f'<w:updateFields {nsdecls("w")} w:val="true"/>')
        settings.append(update_fields)
        doc.save(doc_path)

    front_page_html = os.path.join(
        os.path.dirname(output_path), "html", "front-page.html"
    )
    replacements = scrap_replacements_from_html(front_page_html)
    document = os.path.join(output_path, "document.docx")
    front_page_filled = os.path.join(output_path, "front_page_filled.docx")
    try:
        replace_placeholders(template_path, replacements, front_page_filled)
        merge_docx(document, front_page_filled, document)
        # set_update_fields_on_open(document)
    finally:
        # Always clean up the intermediate filled template, even if the
        # merge fails part-way through.
        if os.path.exists(front_page_filled):
            os.remove(front_page_filled)
md_to_docx_converter/src/to_docx/postprocessing.py +5 −2 Original line number Diff line number Diff line import re import re, os from docx import Document from docx.shared import Pt from docx.document import Document as Doc Loading Loading @@ -27,6 +27,7 @@ from src.constants import ( UNDERLINE_TAGS, WITH_SPACE, ) from src.to_docx.include_frontpage import include_frontpage from src.utils import get_bold_italic_underline_css_classes Loading Loading @@ -577,7 +578,6 @@ def postprocess(docx_dir: str): ### Arguments - `docx_dir`: The absolute or relative path at which the generated Docx was saved """ doc: Doc = Document(docx_dir) doc = format_references(doc) Loading @@ -589,3 +589,6 @@ def postprocess(docx_dir: str): doc = set_keep_with_next_false(doc) doc.save(docx_dir) containing_folder = os.path.dirname(docx_dir) include_frontpage(containing_folder)
def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
    """Tag front-page elements with ``data-replace`` attributes.

    The to_docx step later scrapes these attributes (see
    ``TEXT_TO_REPLACE_IN_FRONTPAGE``) to fill the Word front-page
    template. Missing sections are reported as warnings rather than
    aborting, so a partially filled front page still converts.

    NOTE(review): the positional ``children[...]`` lookups assume a fixed
    generated-HTML layout; whitespace-only text nodes between tags would
    shift the indices — confirm against the actual front-page output.
    """
    ZA = soup.find_all("div", class_="ZA")
    try:
        p = ZA[0].find("p")
        children = list(p.children)
        try:
            # Work-item name is the first child of the header paragraph.
            children[0]["data-replace"] = "WORKITEMNAME"
        except IndexError:
            print(p_warning("front-page is missing WORKITEMNAME information."))
        try:
            # Version: wrap in a new span carrying the marker attribute.
            version_span = soup.new_tag("span")
            children[1].wrap(version_span)
            version_span["data-replace"] = "VERSION_NO"
        except IndexError:
            print(p_warning("front-page is missing VERSION_NO information."))
        try:
            # Date: rebuild as "(" <span data-replace="DATE">text</span> ")".
            date = children[2]
            text = date.get_text(strip=True).replace("(", "").replace(")", "")
            # Drop the original content first so the date text is not
            # duplicated next to the newly appended span.
            date.clear()
            date_span = soup.new_tag("span")
            date_span["data-replace"] = "DATE"
            date_span.string = text
            date.append(NavigableString("("))
            date.append(date_span)
            date.append(NavigableString(")"))
        except IndexError:
            print(p_warning("front-page is missing DATE information."))
    except IndexError:
        print(
            p_warning(
                "front-page is missing the section with WORKITEMNAME, VERSION_NO, and DATE information."
            )
        )

    # The ZT divs appear in a fixed order: title, part, subpart, release.
    ZT = soup.find_all("div", class_="ZT")
    for index, (placeholder, label) in enumerate(
        [
            ("{{TITLE}}", "TITLE"),
            ("{{PART}}", "PART"),
            ("{{SUBPART}}", "SUBPART"),
            ("{{RELEASE}}", "RELEASE"),
        ]
    ):
        try:
            ZT[index]["data-replace"] = placeholder
        except IndexError:
            print(p_warning(f"front-page is missing {label} information."))

    ZB = soup.find_all("div", class_="ZB")
    try:
        ZB[0]["data-replace"] = "{{TYPEDOCUMENT}}"
    except IndexError:
        print(p_warning("front-page is missing TYPEDOCUMENT information."))

    TAC = soup.find_all("div", class_="TAC")
    try:
        children = list(TAC[0].children)
        try:
            # children[3]: the work-item element — presumably odd indices
            # skip separator nodes; verify against the generated HTML.
            children[3]["data-replace"] = "{{WORKITEM}}"
        except IndexError:
            print(p_warning("front-page is missing WORKITEM information."))
        try:
            children[7]["data-replace"] = "{{KEYWORDS}}"
        except IndexError:
            print(p_warning("front-page is missing KEYWORDS information."))
    except IndexError:
        print(
            p_warning(
                "front-page is missing the section with WORKITEM and KEYWORDS information."
            )
        )
    return soup