Commit 29d024df authored by Marco Cavalli's avatar Marco Cavalli
Browse files

feat: front-page is added when generating the word

parent ed0b9245
Loading
Loading
Loading
Loading
+120 KiB

File added.

No diff preview for this file type.

+15 −0
Original line number Diff line number Diff line
@@ -161,3 +161,18 @@ BAD_COLON_GROUP_REGEX = (
# Match lines that start with : or :: and are not followed by letters
BAD_DIV_DELINEATOR_REGEX = r"^\s*(?::{1,2})(?![a-zA-Z])"
# endregion

# Placeholder keys handled when filling the front page.
# Each entry is used in two ways by the front-page code:
#   * as a `data-replace` attribute value searched for in front-page.html, and
#   * as literal text searched for (and substituted) in the .docx template.
# The list is iterated in this order by both consumers.
TEXT_TO_REPLACE_IN_FRONTPAGE = [
    "WORKITEMNAME",
    "VERSION_NO",
    "DATE",
    "{{TYPEDOCUMENT}}",
    "{{TITLE}}",
    "{{PART}}",
    "{{SUBPART}}",
    "{{RELEASE}}",
    "{{WORKITEM}}",
    "{{KEYWORDS}}",
    # "yyyy" and "-mm" are filled from the DATE element by the HTML scraper.
    "yyyy",
    "-mm",
]
 No newline at end of file
+74 −0
Original line number Diff line number Diff line
from docx import Document
from docx.enum.text import WD_BREAK
from docxcompose.composer import Composer
from docx.oxml import parse_xml
from docx.oxml.ns import nsdecls
import os
from src.constants import TEXT_TO_REPLACE_IN_FRONTPAGE
from bs4 import BeautifulSoup, Tag, NavigableString

def scrap_replacements_from_html(front_page_html_file: str) -> dict:
    """Collect front-page placeholder values from the generated HTML.

    For every key in ``TEXT_TO_REPLACE_IN_FRONTPAGE`` the first element whose
    ``data-replace`` attribute equals that key is looked up and its stripped
    text is stored under the key. Keys with no matching element are omitted.

    The ``DATE`` element is handled specially: its text is split on ``-`` and
    stored as ``"yyyy"`` (first part) and ``"-mm"`` (hyphen plus second part).
    When no month part is present, ``"-mm"`` maps to an empty string so the
    front page does not end up with a dangling hyphen.

    ### Arguments
    - `front_page_html_file`: Path to the front-page HTML file (read as UTF-8)

    ### Returns
    A dict mapping placeholder text to its replacement value.
    """
    # Parse inside the context manager; the soup no longer needs the file
    # handle once it has been built.
    with open(front_page_html_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')

    replacements = {}
    for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
        element = soup.find(attrs={"data-replace": key})
        if element is None:
            continue

        text = element.get_text(strip=True)
        if key != "DATE":
            replacements[key] = text
            continue

        # DATE is not stored under its own key; it feeds "yyyy" and "-mm".
        print(f"Found date: {text}")
        date_parts = text.split('-')
        year = date_parts[0]  # str.split always yields at least one item
        month = date_parts[1] if len(date_parts) > 1 else ''
        replacements["yyyy"] = year
        # Only emit the separator when there is a month to put after it.
        replacements["-mm"] = f"-{month}" if month else ""
    return replacements

def include_frontpage(output_path):
    """Prepend a filled-in front page to ``document.docx`` in *output_path*.

    Steps:
    1. Read placeholder values from ``html/front-page.html`` located next to
       *output_path* (i.e. in the parent directory of *output_path*).
    2. Fill ``front_page_template.docx`` (resolved relative to the current
       working directory) and save it as a temporary
       ``front_page_filled.docx`` inside *output_path*.
    3. Merge the filled front page and ``document.docx`` (front page first,
       separated by a page break) and overwrite ``document.docx``.
    4. Remove the temporary file, even if an earlier step failed.

    ### Arguments
    - `output_path`: Directory that contains the generated `document.docx`
    """
    def merge_docx(output_path: str, *input_paths: str):
        """Concatenate *input_paths* in order and save to *output_path*.

        A page break is inserted before every appended document.
        """
        master = Document(input_paths[0])
        composer = Composer(master)
        for path in input_paths[1:]:
            # Empty paragraph carrying a page break so the next document
            # starts on a fresh page.
            page_break_run = master.add_paragraph().add_run()
            page_break_run.add_break(WD_BREAK.PAGE)

            composer.append(Document(path))
        composer.save(output_path)

    def replace_placeholders(file_path: str, replacements: dict, output_path: str = None):
        """Substitute placeholder text in the body paragraphs of *file_path*.

        Placeholders without an entry in *replacements* are replaced by an
        empty string. The result is saved to *output_path*, or back to
        *file_path* when no output path is given.
        """
        doc = Document(file_path)

        for paragraph in doc.paragraphs:
            for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
                if key not in paragraph.text:
                    continue
                value = replacements.get(key, "")
                for run in paragraph.runs:
                    # Assigning run.text replaces the whole run content, so
                    # only touch runs that actually hold the placeholder;
                    # other runs (e.g. ones with inline pictures) stay intact.
                    if key in run.text:
                        run.text = run.text.replace(key, value)

        doc.save(output_path or file_path)

    def set_update_fields_on_open(doc_path: str):
        """Force to update fields when opening the document in Word."""
        doc = Document(doc_path)

        settings = doc.settings.element
        update_fields = parse_xml(
            f'<w:updateFields {nsdecls("w")} w:val="true"/>'
        )
        settings.append(update_fields)

        doc.save(doc_path)

    front_page_html = os.path.join(os.path.dirname(output_path), "html", "front-page.html")
    replacements = scrap_replacements_from_html(front_page_html)
    document = os.path.join(output_path, "document.docx")
    front_page_filled = os.path.join(output_path, "front_page_filled.docx")
    try:
        replace_placeholders("front_page_template.docx", replacements, front_page_filled)
        merge_docx(document, front_page_filled, document)
        # set_update_fields_on_open(document)
    finally:
        # Never leave the intermediate file behind, even when filling or
        # merging fails part-way through.
        if os.path.exists(front_page_filled):
            os.remove(front_page_filled)
 No newline at end of file
+5 −2
Original line number Diff line number Diff line
import re
import re, os
from docx import Document
from docx.shared import Pt
from docx.document import Document as Doc
@@ -27,6 +27,7 @@ from src.constants import (
    UNDERLINE_TAGS,
    WITH_SPACE,
)
from src.to_docx.include_frontpage import include_frontpage
from src.utils import get_bold_italic_underline_css_classes


@@ -577,7 +578,6 @@ def postprocess(docx_dir: str):
    ### Arguments
    - `docx_dir`: The absolute or relative path at which the generated Docx was saved
    """

    doc: Doc = Document(docx_dir)

    doc = format_references(doc)
@@ -589,3 +589,6 @@ def postprocess(docx_dir: str):
    doc = set_keep_with_next_false(doc)

    doc.save(docx_dir)

    containing_folder = os.path.dirname(docx_dir)
    include_frontpage(containing_folder)
+90 −0
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ from src.utils import (
    apply_renaming_logic,
    get_dirty_filenames_mapping_with_expected_filenames,
    p_error,
    p_warning,
)

from src.constants import ABBREVIATION_CLASS
@@ -307,6 +308,92 @@ def format_tables(soup: BeautifulSoup) -> BeautifulSoup:
                    tr['class'] = existing_classes + ['bg-striped-row']
    return soup

def format_front_page(soup: BeautifulSoup) -> BeautifulSoup:
    """Tag front-page elements with `data-replace` attributes.

    The attribute values are the placeholder keys that are later looked up in
    the HTML when the Word front page is filled in. Elements are located
    purely by position:

    - first `<p>` of the first `div.ZA`: child 0 -> `WORKITEMNAME`,
      child 1 (wrapped in a new `<span>`) -> `VERSION_NO`,
      child 2 -> rewritten as `(<span data-replace="DATE">date</span>)`
    - `div.ZT` #1..#4 -> `{{TITLE}}`, `{{PART}}`, `{{SUBPART}}`, `{{RELEASE}}`
    - first `div.ZB` -> `{{TYPEDOCUMENT}}`
    - first `div.TAC`: child 3 -> `{{WORKITEM}}`, child 7 -> `{{KEYWORDS}}`

    A warning is printed for every piece that cannot be found; the soup is
    still returned in that case.

    ### Arguments
    - `soup`: Parsed front-page HTML

    ### Returns
    The same soup, modified in place.
    """
    def _warn_missing(what: str) -> None:
        # Single place for the warning wording used throughout this function.
        print(p_warning(f"front-page is missing {what} information."))

    # --- Header line: work item name, version and date --------------------
    za_divs = soup.find_all("div", class_="ZA")
    header_paragraph = za_divs[0].find("p") if za_divs else None
    if header_paragraph is None:
        # Covers both a missing div.ZA and a div.ZA without a <p>.
        _warn_missing("the section with WORKITEMNAME, VERSION_NO, and DATE")
    else:
        children = list(header_paragraph.children)

        # TITLE IN HEADER
        # NOTE(review): assumes child 0 is a tag (not bare text) - confirm.
        if len(children) > 0:
            children[0]['data-replace'] = 'WORKITEMNAME'
        else:
            _warn_missing("WORKITEMNAME")

        # VERSION IN HEADER: wrapped in a span so it can carry the attribute.
        if len(children) > 1:
            version_span = soup.new_tag("span")
            children[1].wrap(version_span)
            version_span['data-replace'] = 'VERSION_NO'
        else:
            _warn_missing("VERSION_NO")

        # DATE IN HEADER
        if len(children) > 2:
            date = children[2]
            text = date.get_text(strip=True)
            text = text.replace("(", "").replace(")", "")
            date_span = soup.new_tag("span")
            date_span['data-replace'] = 'DATE'
            date_span.string = text
            # Drop the original content first; otherwise the date would show
            # up twice (old text followed by the rebuilt "(span)").
            date.clear()
            date.append(NavigableString("("))
            date.append(date_span)
            date.append(NavigableString(")"))
        else:
            _warn_missing("DATE")

    # --- Title block: title, part, subpart, release (in document order) ---
    zt_divs = soup.find_all("div", class_="ZT")
    for index, name in enumerate(("TITLE", "PART", "SUBPART", "RELEASE")):
        if index < len(zt_divs):
            zt_divs[index]['data-replace'] = "{{" + name + "}}"
        else:
            _warn_missing(name)

    # --- Document type ----------------------------------------------------
    zb_divs = soup.find_all("div", class_="ZB")
    if zb_divs:
        zb_divs[0]['data-replace'] = '{{TYPEDOCUMENT}}'
    else:
        _warn_missing("TYPEDOCUMENT")

    # --- Work item and keywords -------------------------------------------
    tac_divs = soup.find_all("div", class_="TAC")
    if not tac_divs:
        _warn_missing("the section with WORKITEM and KEYWORDS")
    else:
        children = list(tac_divs[0].children)
        # NOTE(review): indices 3 and 7 presumably skip interleaved text
        # nodes so they land on the 2nd and 4th elements - verify against
        # the generated HTML.
        if len(children) > 3:
            children[3]['data-replace'] = '{{WORKITEM}}'
        else:
            _warn_missing("WORKITEM")
        if len(children) > 7:
            children[7]['data-replace'] = '{{KEYWORDS}}'
        else:
            _warn_missing("KEYWORDS")

    return soup


def add_links_to_references_in_text(soup):
    def reform_broken_links_in_text(soup: BeautifulSoup):
@@ -811,6 +898,9 @@ def postprocess(html_dir: str):
        soup = format_examples_and_notes(soup)
        soup = format_tables(soup)

        if new_filename == "front-page.html":
            soup = format_front_page(soup)

        if (
            new_filename.replace(".html", "") in files_with_references
        ):  # Reference-specific formatting