Commit b79ec65f authored by Marco Cavalli's avatar Marco Cavalli
Browse files

feat: enhance README and codebase with improved handling of conversion...

feat: enhance README and codebase with improved handling of conversion parameters and diff marks removal
parent 15d8ac94
Loading
Loading
Loading
Loading
+0 −4
Original line number Diff line number Diff line
@@ -260,10 +260,6 @@ Starting with HTML files contained in the default source location (_GENERATED_FI

`convert.py --frm html --to docx --folder {folder_name}`

Specify a different directory containing the HTML files.

`convert.py --frm html --to docx --folder {folder_name} --src relative/or/absolute/source/path`

[^1]: These steps may not be necessary with WSL 2, but it is recommended to follow them nevertheless.

[^2]: Method subject to change
+27 −9
Original line number Diff line number Diff line
@@ -79,24 +79,42 @@ function scrollToClosestAnchorInTOC() {
}

function openCorrectPanelDependingOnStoredElement() {
  const originalLiElement = document.getElementById(
    localStorage.getItem("scrollTo")
  ).parentElement;
  const scrollToId = localStorage.getItem("scrollTo");
  if (!scrollToId) {
    return;
  }

  const targetElement = document.getElementById(scrollToId);
  if (!targetElement) {
    return;
  }

  const originalLiElement = targetElement.parentElement;
  if (!originalLiElement) {
    return;
  }

  originalLiElement.classList.add("open");
  originalLiElement.classList.add("active");

  // we are working with li elements, so we skip the ul parent
  let parentElement = originalLiElement.parentElement.parentElement; // above li element or the nav
  let parentElement = originalLiElement.parentElement?.parentElement; // above li element or the nav
  if (!parentElement) {
    return;
  }

  // this handles all the other cases
  while (parentElement.id !== "TOC") {
    const parentElementTOCId = parentElement
      .querySelector("a")
      .getAttribute("id");
  while (parentElement && parentElement.id !== "TOC") {
    const aElement = parentElement.querySelector("a");
    if (!aElement) {
      break;
    }

    const parentElementTOCId = aElement.getAttribute("id");
    if (parentElementTOCId) {
      parentElement.classList.add("open");
      // we are working with li elements, so we skip the ul parent
      parentElement = parentElement.parentElement.parentElement;
      parentElement = parentElement.parentElement?.parentElement;
    } else {
      break;
    }
+94 −1
Original line number Diff line number Diff line
@@ -141,9 +141,13 @@ if not args.src and IS_DIFF and IS_GIT_DIFF:
    sys.exit(1)

SRC_TYPE: str = "html" if str(args.frm).startswith("html") else args.frm
if args.src:
if args.src and not (args.frm == "html" and args.to == "docx"):
    SRC = args.src
else:
    if args.src and args.frm == "html" and args.to == "docx":
        print(
            f"Warning: The {p_label('--src')} argument is ignored when converting from {p_label('html')} to {p_label('docx')}."
        )
    SRC = f"{FILEGEN_DIR}/{FOLDER}/{args.frm}"  # Use args.frm to get "html_dirty"

IS_CLEANUP: bool = str(args.frm) == "html_dirty"
@@ -371,6 +375,7 @@ def convert(conversion_args=None):
            """
            Handles conversion from HTML to a single Docx file.
            """
            html_folder = os.path.join(DEST, "html")
            preprocess_docx(
                SRC, SRC_TYPE, TO_DOCX_EXCLUDED_HTML_FILES, CONSOLIDATED_HTML_PATH
            )
@@ -699,6 +704,94 @@ if SRC_TYPE == "md" and DEST_TYPE == "html":
                    with open(file_path, "w", encoding="utf-8") as f:
                        f.write(str(soup))

if SRC_TYPE == "md" and DEST_TYPE == "docx":
    # Markdown -> DOCX runs as two child invocations of convert.py:
    #   1. md -> html for FOLDER
    #   2. html -> docx, pointed at <work_dir>/html
    # Either child failing aborts this script with exit status 1.
    print("Converting Markdown files to DOCX...")
    work_dir = os.path.join(FILEGEN_DIR, f"{FOLDER}")

    # Start from a clean working directory so output from a previous run
    # cannot leak into this one.
    if os.path.exists(work_dir):
        try:
            shutil.rmtree(work_dir)
        except OSError as e:
            print(f"Warning: Could not fully remove {work_dir}: {e}")
            # Best-effort fallback: remove whatever can be removed.
            shutil.rmtree(work_dir, ignore_errors=True)
    # The best-effort removal above may leave the directory in place, so an
    # already existing directory must not be treated as an error here.
    os.makedirs(work_dir, exist_ok=True)

    # Forward the diff mode to the md -> html stage. --diff-git is either a
    # bare flag (value True) or carries a value; plain --diff is only used
    # when --diff-git was not given.
    # Note:
    # the case diff_git is not None and diff is False should not happen since --diff-git implies --diff.
    # If changes are done to the logic of how these flags are set, the command construction logic should be updated accordingly.
    if args.diff_git is not None:
        if args.diff_git is True:
            diff_flags = ["--diff-git"]
        else:
            diff_flags = ["--diff-git", args.diff_git]
    elif args.diff:
        diff_flags = ["--diff"]
    else:
        diff_flags = []

    file_order_flags = ["--file_order", FILE_ORDER_JSON] if FILE_ORDER_JSON else []

    md_to_html_cmd = [
        sys.executable,
        "convert.py",
        "--frm",
        "md",
        "--to",
        "html",
        "--folder",
        f"{FOLDER}",
        "--src",
        SRC,
        *file_order_flags,
        *diff_flags,
    ]

    html_work_dir = os.path.join(work_dir, "html")
    # NOTE(review): the argument handling near the top of this script ignores
    # --src for html -> docx and prints a warning instead; confirm whether
    # passing it here is still wanted.
    html_to_docx_cmd = [
        sys.executable,
        "convert.py",
        "--frm",
        "html",
        "--to",
        "docx",
        "--folder",
        f"{FOLDER}",
        "--src",
        html_work_dir,
    ]

    try:
        print("Generating HTML files from Markdown...")
        with t.section("Generate HTML files from Markdown for DOCX conversion"):
            subprocess.run(
                md_to_html_cmd,
                check=True,
                capture_output=True,
                text=True,
            )
        print("Converting HTML files to DOCX...")
        with t.section("Convert generated HTML files to DOCX"):
            subprocess.run(
                html_to_docx_cmd,
                check=True,
                capture_output=True,
                text=True,
            )
    except subprocess.CalledProcessError as e:
        # Both streams of the child are captured, so report both: a child
        # that describes its failure on stdout would otherwise fail silently.
        child_output = "\n".join(part for part in (e.stdout, e.stderr) if part)
        print(p_error(f"Error during conversion:\n{child_output}"))
        sys.exit(1)

print("Post-processing completed successfully.")
if TIMER_ENABLED:
    print("Timing report:")
+15 −4
Original line number Diff line number Diff line
@@ -57,15 +57,26 @@ body {
  position: relative;
  overflow: visible;
  display: block;
  background-color: red;
    border-radius: 10px;
    padding-left: 10px;
    padding-right: 10px;
    margin-right: 10px;
  width: 100%;
}

nav>ul>li>a.diff-changes-enable::after {
  content: " ●";
  color: green;
#TOC a.diff-changes-enable::after {
  /* content: " ●"; */
    content: attr(data-diff-count);
    background-color: green;
    border-radius: 10px;
    padding-left: 5px;
    padding-right: 5px;
    margin-left: 5px;
    color: white;
}

nav>ul>li>a.diff-changes-enable:hover::before {
#TOC a.diff-changes-enable:hover::before {
  content: "Total Changes: " attr(data-diff-count);
  position: absolute;
  left: 2%;
+3 −2
Original line number Diff line number Diff line
@@ -6,18 +6,19 @@ from docx.oxml.ns import nsdecls
import os
from src.constants import TEXT_TO_REPLACE_IN_FRONTPAGE
from bs4 import BeautifulSoup, Tag, NavigableString
from src.to_docx.preprocessing import remove_diff_marks

def scrap_replacements_from_html(front_page_html_file: str) -> dict:
    replacements = {}
    with open(front_page_html_file, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html.parser')
        soup = remove_diff_marks(soup)
        for key in TEXT_TO_REPLACE_IN_FRONTPAGE:
            element = soup.find(attrs={"data-replace": key})
            if element:
                if key == "DATE":
                    # Special handling for DATE to format it as needed
                    date_text = element.get_text(strip=True)
                    print(f"Found date: {date_text}")
                    split_date = date_text.split('-')
                    YEAR = split_date[0] if len(split_date) > 0 else ''
                    MONTH = split_date[1] if len(split_date) > 1 else ''
Loading