Commit 2fbaa0a9 authored by Marco Cavalli
Browse files

feat: improve HTML diff handling with enhanced element cloning and output formatting

parent b79ec65f
Loading
Loading
Loading
Loading
+4 −20
Original line number Diff line number Diff line
@@ -54,7 +54,7 @@ body {
}

#TOC a.diff-changes-enable {
  position: relative;
  /* position: relative; */
  overflow: visible;
  display: block;
  background-color: red;
@@ -76,7 +76,7 @@ body {
    color: white;
}

#TOC a.diff-changes-enable:hover::before {
/* #TOC a.diff-changes-enable:hover::before {
  content: "Total Changes: " attr(data-diff-count);
  position: absolute;
  left: 2%;
@@ -91,7 +91,7 @@ body {
  white-space: nowrap;
  z-index: 1000;
  box-shadow: 0 2px 6px rgba(0, 0, 0, 0.2);
}
} */

#TOC h1 {
  background-color: #02488d;
@@ -2403,19 +2403,3 @@ ins:has(>li),
del:has(>li) {
  display: block;
}
 No newline at end of file

/* 
Rule to show deleted lines as strikethrough with a red dashed underline in simple diff mode
div[data-original-content]>del.diff-view-simple {
  display: block !important;
  color: transparent !important;
  border-left: 3px solid #b00020 !important;
  padding-left: 2px !important;
  min-height: 1.2em !important;
  position: relative !important;
  background: repeating-linear-gradient(to right,
      #b00020 0,
      #b00020 8px,
      transparent 8px,
      transparent 12px) center/100% 2px no-repeat !important;
} */
 No newline at end of file
+46 −30
Original line number Diff line number Diff line
@@ -10,6 +10,7 @@ from bs4 import (
    MarkupResemblesLocatorWarning,
)
import warnings
import html

warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

@@ -17,24 +18,34 @@ warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
def clone_element(elem, soup):
    """
    Clones an element preserving whitespace and structure.
    Recursively copies without serializing/parsing to avoid entity issues.

    Args:
        elem: The node to copy: a NavigableString (including Comment) or a Tag.
        soup: The BeautifulSoup object used to create the new strings and tags.

    Returns:
        A new node with the same content as ``elem``. Tags are rebuilt with
        the same name, attributes and (recursively cloned) children.
    """
    if isinstance(elem, NavigableString):
        if isinstance(elem, Comment):
            # Keep comments as comments instead of turning them into text
            return Comment(str(elem))
        # Create a new string in the target soup
        return soup.new_string(str(elem))

    # For Tag elements, create a new tag and recursively copy children
    if isinstance(elem, Tag):
        new_tag = soup.new_tag(elem.name)
        # Copy attributes. List values (e.g. "class") are copied so that a
        # later in-place change on the clone does not alter the original.
        for key, value in elem.attrs.items():
            new_tag[key] = list(value) if isinstance(value, list) else value
        # Recursively copy children
        for child in elem.children:
            new_tag.append(clone_element(child, soup))
        return new_tag

    # Fallback to string representation
    return soup.new_string(str(elem))


def diff_text_content(old_text: str, new_text: str) -> str:
    """Diffs two text strings and returns HTML with track changes."""
def diff_text_content(old_text: str, new_text: str, soup):
    """Diffs two text strings and returns list of BeautifulSoup elements with track changes."""
    if not old_text and not new_text:
        return ""
        return []

    dmp = diff_match_patch()
    dmp.Diff_Timeout = 0
@@ -45,13 +56,19 @@ def diff_text_content(old_text: str, new_text: str) -> str:
    result = []
    for op, text in diffs:
        if op == -1:  # Deletion
            result.append(f"<del class='diff-view-all'>{text}</del>")
            del_tag = soup.new_tag("del")
            del_tag["class"] = ["diff-view-all"]
            del_tag.string = text
            result.append(del_tag)
        elif op == 1:  # Insertion
            result.append(f"<ins class='diff-view-all'>{text}</ins>")
            ins_tag = soup.new_tag("ins")
            ins_tag["class"] = ["diff-view-all"]
            ins_tag.string = text
            result.append(ins_tag)
        else:  # No change
            result.append(text)
            result.append(soup.new_string(text))

    return "".join(result)
    return result


def is_whitespace_only(elem):
@@ -551,28 +568,22 @@ def diff_elements(old_elem, new_elem, parent_result, soup):
            parent_result.append(soup.new_string(preserved_text))
            return

        diff_html = diff_text_content(old_text, new_text)
        if diff_html:
            # Parse the diff result
            diff_soup = BeautifulSoup(diff_html, "html.parser")

            for child in diff_soup.children:
                if isinstance(child, NavigableString):
                    text = str(child)
        diff_elements_list = diff_text_content(old_text, new_text, soup)
        if diff_elements_list:
            for elem in diff_elements_list:
                if isinstance(elem, NavigableString):
                    text = str(elem)
                    # If inside sourceCode, preserve spaces with non-breaking spaces
                    if in_source_code:
                        text = text.replace(" ", "\u00a0")
                    parent_result.append(soup.new_string(text))
                else:
                    # Create a new tag in the result soup instead of reusing the parsed one
                    new_tag = soup.new_tag(child.name)
                    if child.string:
                        text = child.string
                    # elem is already a proper BeautifulSoup tag
                    if elem.string and in_source_code:
                        # If inside sourceCode, preserve spaces with non-breaking spaces
                        if in_source_code:
                            text = text.replace(" ", "\u00a0")
                        new_tag.string = text
                    parent_result.append(new_tag)
                        text = elem.string.replace(" ", "\u00a0")
                        elem.string = text
                    parent_result.append(elem)
        return

    # One is a tag, the other is a string: they are different
@@ -774,6 +785,10 @@ def make_trackchanges_diff(
    except Exception as e:
        new_html = "<html><body><div id='editor'></div></body></html>"

    if "annex-c" in out_path.lower():
        print(f"Processing Annex C: {out_path}")
        print(f"Content: {new_html}")  # Print the beginning of the content for debugging

    # Parse with BeautifulSoup
    old_soup = BeautifulSoup(old_html, "html.parser")
    new_soup = BeautifulSoup(new_html, "html.parser")
@@ -866,7 +881,8 @@ def make_trackchanges_diff(
        if "diff-view-all" not in row.get("class", []):
            row["class"].append("diff-view-all")

    Path(out_path).write_text(str(result_soup), encoding="utf-8")
    # Use formatter="html" to properly handle HTML entities
    Path(out_path).write_text(result_soup.decode(formatter="html"), encoding="utf-8")
    # print(f"Created diff: {out_path}")
    return total_changes