Commit a14f8197 authored by Marco Cavalli's avatar Marco Cavalli
Browse files

fix: remove badges when converting to docx

fix: order of content in examples
fix: preserve indentation in code blocks
feat: convert monospace italic text to HTML-Sample style when converting to docx
parent 344da62c
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -61,7 +61,7 @@ REFERENCE_DOC = "customized_reference.docx"
OUTPUT_DOC_NAME = "document.docx"

# Classes for examples and notes
EXAMPLE_NOTE_CLASSES = ["EX", "NO", "TAN"]
EXAMPLE_NOTE_CLASSES = ["EX", "NO", "TAN", "Source Code"]

# HTML tags to look for nested in examples and notes - Pandoc doesn't handle these well, so they need to be handled
BOLD_TAGS = ["strong", "b"]
+8 −2
Original line number Diff line number Diff line
@@ -255,13 +255,19 @@ def format_examples_and_notes(doc: Doc):
            if in_example_or_note:
                # Continue example or note
                if paragraph.style.name in EXAMPLE_NOTE_CLASSES:
                    # Still in example or note
                    if not paragraph.text.startswith("\t"):
                    if not paragraph.text.startswith("\t") and paragraph.style.name != "Source Code":
                        paragraph.text = f"\t{paragraph.text}"
                else:
                    # No longer in example or note
                    in_example_or_note = False
            
            if paragraph.style.name == "Source Code":
                if in_example_or_note == True:
                    paragraph.style = "EX Source Code"
                # apply to all its runs the HTML-Sample style with no space suffix
                for run in paragraph.runs:
                    run.style = "HTML-Sample"

    document_paragraphs = list(iter_paragraphs(doc))

    handle_paragraphs(document_paragraphs)
+63 −45
Original line number Diff line number Diff line
import code
import copy
import os
import re
@@ -16,11 +17,7 @@ from src.constants import (
    WORD_A4_MAX_HEIGHT_PIXEL,
    WORD_A4_MAX_WIDTH_PIXEL,
)
from src.utils import (
    combine_biu_classes,
    handle_html_consolidation,
    p_warning
)
from src.utils import combine_biu_classes, handle_html_consolidation, p_warning


# region Helpers
@@ -106,6 +103,22 @@ def remove_pandoc_toc(soup: BeautifulSoup):
    return soup


def remove_badges(soup: BeautifulSoup):
    """
    Replaces badge elements with their plain text, as the badges themselves are not necessary for the Docx.

    Every element matching the `.inform` or `.norm` CSS class is removed from the tree and its text is left in its place as a plain string, wrapped in parentheses unless it is already fully parenthesised.

    Returns the same `soup` object, which is modified in place.
    """
    for badge in soup.select(".inform, .norm"):
        # A badge nested inside a badge handled earlier in this loop has already been destroyed
        # along with its ancestor; it no longer has a position in the tree to insert text before.
        if badge.decomposed:
            continue

        text = badge.get_text()
        # Only skip wrapping when BOTH parentheses are present. Checking each side separately
        # would leave text such as "(foo" or "foo)" with a single, unbalanced parenthesis.
        if not (text.startswith("(") and text.endswith(")")):
            text = f"({text})"

        badge.insert_before(NavigableString(text))
        badge.decompose()

    return soup


def compute_height_and_width_from_file(file_path: str):
    """
    Computes the height and width of images in the document based on the provided file.
@@ -216,6 +229,22 @@ def modify_links(soup: BeautifulSoup):
    return soup


def handle_italic_monospace(soup: BeautifulSoup):
    """
    Converts italic monospace text into a single `HTML-Sample` span that can be handled during postprocessing.

    Looks for `<em>` tags whose only content is a direct `<code>` child (whitespace-only strings around it are ignored) and replaces the whole `<em>` with a `<span class="HTML-Sample">` holding the code's text. An `<em>` that carries anything else next to the `<code>` is left untouched, because replacing it would discard that extra content.

    Returns the same `soup` object, which is modified in place.
    """
    for em in soup.find_all("em"):
        code_tag = em.find("code", recursive=False)
        if code_tag is None:
            continue

        # Replacing the <em> throws away all of its children, so only do it when the <code>
        # is the sole meaningful child. Text nodes are `str` subclasses; tags are not.
        has_other_content = any(
            child is not code_tag
            and not (isinstance(child, str) and not child.strip())
            for child in em.contents
        )
        if has_other_content:
            continue

        sample_span = soup.new_tag("span", attrs={"class": "HTML-Sample"})
        sample_span.string = code_tag.get_text()
        em.replace_with(sample_span)

    return soup


def get_plaintext_from_codeblock(pre: Tag):
    """
    Return a list of the lines of text contained in a code block.
@@ -223,7 +252,9 @@ def get_plaintext_from_codeblock(pre: Tag):
    The text is retrieved from the provided `<pre>` tag's `<code>` child tag, which it always has. Preserves indentation by replacing any tabs with tab placeholders, which is necessary because Pandoc trims preceding whitespace.
    """
    # There will only be one code tag inside the pre tag
    code = pre.find("code")
    code = pre.find("code", recursive=False)
    if not code:
        return []  # Nothing to do here

    # Get the direct children of the code tag. Each span contains an <a> tag and a series of spans representing a single line's worth of text.
    code_children = code.find_all("span", recursive=False)
@@ -292,35 +323,22 @@ def handle_examples_and_notes(soup: BeautifulSoup):
            """Apply the HTML Sample style to the individual lines and merge the first line with the tag"""
            # Get code blocks' lines
            pre = body.find_all("pre")[0]
            lines = get_plaintext_from_codeblock(pre)

            # Make a new div for the tag and the body
            consolidated_div = soup.new_tag("div")
            code = pre.find("code", recursive=False)
            if not code:
                return soup  # Nothing to do here

            # Make the new paragraph for the first line, containing the label text and the first line of the code block
            label_text = NavigableString(
                f"{tag.get_text()}\t"
            )  # Add tab for indentation

            first_body_span = soup.new_tag("span", attrs={"class": "HTML_Sample"})
            first_body_span.append(lines.pop(0))

            label_and_first_line_para = soup.new_tag("p")
            label_and_first_line_para.append(label_text)
            label_and_first_line_para.append(first_body_span)

            consolidated_div.append(label_and_first_line_para)

            # For the rest of the lines, add tabs to their beginnings and add them as subsequent paragraphs
            for line in lines:
                line_paragraph = soup.new_tag("p", attrs={"class": "HTML_Sample"})
                line_paragraph.append(line)
                consolidated_div.append(line_paragraph)

            tag.insert_before(consolidated_div)
            new_pre = soup.new_tag("pre")
            new_code = soup.new_tag("code")
            new_code.append(code.get_text())
            new_pre.append(new_code)
            tag.insert_before(new_pre)
            tag.decompose()
            body.decompose()

            return soup

        # Existing tag and (first or only) body element
@@ -343,25 +361,19 @@ def handle_examples_and_notes(soup: BeautifulSoup):
            Ensure the code block has the correct indentation by prepending a tab placeholder
            """
            pre = element.find_all("pre")[0]
            new_pre = soup.new_tag("pre")
            new_code = soup.new_tag("code")

            lines: list[str] = get_plaintext_from_codeblock(pre)

            codeblock_div = soup.new_tag("div", attrs={"class": "EX"})
            for line in lines:
                # Create suffix to tell whether paragraph should have space after it
                suffix = NO_SPACE
                if line == lines[-1]:
                    suffix = WITH_SPACE

                line_paragraph = soup.new_tag(
                    "p", attrs={"class": f"HTML_Sample/{suffix}"}
                )  # This class/style name doesn't exist, but will be normalized later on in postprocessing.
                line_paragraph.append(line)
            code = pre.find("code", recursive=False)
            if not code:
                return soup  # Nothing to do here

                codeblock_div.append(line_paragraph)
            raw = code.get_text()
            new_code.append(f"{raw}")
            new_pre.append(new_code)

            pre.parent.parent.append(codeblock_div)
            pre.decompose()
            element.insert_before(new_pre)
            element.decompose()

            return soup

@@ -509,6 +521,8 @@ def convert_codeblock_styles_to_etsi(soup: BeautifulSoup):

    for pre in pres:
        lines: list[str] = get_plaintext_from_codeblock(pre)
        if len(lines) == 0:
            return soup  # Nothing to do here

        new_codeblock = soup.new_tag("div")

@@ -610,7 +624,9 @@ def prepare_table_cell_classes(soup: BeautifulSoup):
                para = soup.new_tag("p").append(child.get_text())
                child.replace_with(para)
            except ValueError:
                p_warning(f'Could not add child: {repr(child)} to paragraph in table cell')
                p_warning(
                    f"Could not add child: {repr(child)} to paragraph in table cell"
                )

        div.unwrap()

@@ -680,11 +696,13 @@ def preprocess(
                soup = BeautifulSoup(html, "html.parser")

            soup = remove_pandoc_toc(soup)
            soup = remove_badges(soup)
            soup = change_images_to_use_high_quality(soup, src)
            soup = modify_links(soup)
            soup = handle_italic_monospace(soup)
            soup = handle_examples_and_notes(soup)
            soup = handle_abbreviations(soup)
            soup = convert_codeblock_styles_to_etsi(soup)
            # soup = convert_codeblock_styles_to_etsi(soup)
            soup = cleanup_code_tags(soup)
            soup = create_custom_tags_for_bold_italic_underline_styles(soup)
            soup = prepare_table_cell_classes(soup)
+1 −0
Original line number Diff line number Diff line
@@ -310,6 +310,7 @@ def get_html_to_docx_command(dest: str, consolidated_html_path, output_doc_path)
        output_doc_path,
        "--lua-filter=html_to_docx.lua",
        f"--reference-doc={REFERENCE_DOC}",
        "--preserve-tabs",
    ]

    return command