Commit 52291eb5 authored by Marco Cavalli's avatar Marco Cavalli
Browse files

feat: support ETSI guidelines for images and captions

fix: support up to 7 level headings
feat: support dynamic links with the format Figure+++filename for images
parent aa4298a1
Loading
Loading
Loading
Loading
+115 −1
Original line number Diff line number Diff line
@@ -80,6 +80,118 @@ local function wrap_table_cell_contents(el, default_class) --- Apply TAH or tagg
  return el
end

--- Wrap image paragraphs found in table cells in a Div with the FL class.
-- A paragraph qualifies when its first inline element is an Image. Divs inside
-- a cell are searched recursively; every other block is left untouched.
-- Only the rows of the table head and of each body's `body` list are visited.
-- @param el the pandoc Table element to process (modified in place)
-- @return the same Table element
local function wrap_images_in_table(el)
  -- True when the paragraph's first inline element is an Image.
  local function starts_with_image(para)
    local first = para.content[1]
    return first ~= nil and first.t == "Image"
  end

  -- Return a new block list in which image paragraphs are wrapped in an FL Div.
  local function process_blocks(blocks)
    local result = {}
    for i, block in ipairs(blocks) do
      if block.t == "Para" and starts_with_image(block) then
        result[i] = pandoc.Div({ block }, pandoc.Attr("", { "FL" }))
      else
        -- Only Divs can contain further blocks; a paragraph holds inlines,
        -- which can never be a Para or a Div, so there is nothing to recurse into.
        if block.t == "Div" then
          block.content = process_blocks(block.content)
        end
        result[i] = block
      end
    end
    return result
  end

  -- Process every cell of every row in the given list (nil is treated as empty).
  local function process_rows(rows)
    for _, row in ipairs(rows or {}) do
      for i, cell in ipairs(row.cells) do
        cell.content = process_blocks(cell.content)
        row.cells[i] = cell
      end
    end
  end

  if el.head then
    process_rows(el.head.rows)
  end

  for _, body in ipairs(el.bodies or {}) do
    process_rows(body.body)
  end

  return el
end

--- Wrap figure captions found in table cells in a Div with the TF class.
-- A caption is a paragraph whose first inline is a Strong element whose text
-- starts with "Figure". The caption is rebuilt from its stringified text, so
-- the bold markup is dropped. Divs inside a cell are searched recursively.
-- @param el the pandoc Table element to process (modified in place)
-- @return the same Table element
local function wrap_images_captions_in_table(el)
  -- True when the paragraph opens with a Strong element reading "Figure...".
  local function is_figure_caption(para)
    local lead = para.content[1]
    if lead == nil or lead.t ~= "Strong" then
      return false
    end
    return pandoc.utils.stringify(lead):match("^Figure") ~= nil
  end

  -- Return a new block list in which figure captions are wrapped in a TF Div.
  local function rewrite_blocks(blocks)
    local rewritten = {}
    for idx, block in ipairs(blocks) do
      local kind = block.t
      if kind == "Para" and is_figure_caption(block) then
        local plain_caption = pandoc.Para(pandoc.utils.stringify(block))
        rewritten[idx] = pandoc.Div({ plain_caption }, pandoc.Attr("", { "TF" }))
      else
        if kind == "Div" then
          block.content = rewrite_blocks(block.content)
        end
        rewritten[idx] = block
      end
    end
    return rewritten
  end

  -- Rewrite every cell of every row in the given list (nil is treated as empty).
  local function rewrite_rows(rows)
    for _, row in ipairs(rows or {}) do
      for idx, cell in ipairs(row.cells) do
        cell.content = rewrite_blocks(cell.content)
        row.cells[idx] = cell
      end
    end
  end

  if el.head then
    rewrite_rows(el.head.rows)
  end

  if el.bodies then
    for _, body in ipairs(el.bodies) do
      rewrite_rows(body.body)
    end
  end

  return el
end

local function handle_ex_no_tan(el) --- Preserves the main structure of Examples and Notes - that is, two paragraphs within the div with either the EX, the NO, or the TAN class. To do so, consolidate all the body text within a single paragraph.
  local tag = nil
  local text_fragments = {}
@@ -112,6 +224,8 @@ end

--- Pandoc filter entry point for Table elements.
-- Runs three passes over the table, in order:
--   1. wrap_table_cell_contents with "TAL" as the default class,
--   2. wrap image paragraphs in cells in a Div with the FL class,
--   3. wrap figure captions in cells in a Div with the TF class.
-- @param el the Table element
-- @return the processed Table element
function Table(el)
  local with_cell_classes = wrap_table_cell_contents(el, "TAL")
  local with_figures = wrap_images_in_table(with_cell_classes)
  return wrap_images_captions_in_table(with_figures)
end
+5 −3
Original line number Diff line number Diff line
@@ -45,8 +45,8 @@ function CustomTagsToLinks(el, prefix)

      local remaining_text = child.text
      local start_index = remaining_text:find(prefix .. "+++")
      -- logfile:write("DEBUG: Paragrafo con contenuto -> " .. pandoc.utils.stringify(child) .. "\n")
      -- logfile:write("DEBUG: Paragrafo con contenuto -> " .. prefix .. "+++" .. "\n")
      -- logfile:write("DEBUG: Par with content -> " .. pandoc.utils.stringify(child) .. "\n")
      -- logfile:write("DEBUG: Par with content -> " .. prefix .. "+++" .. "\n")
      -- logfile:flush()

      if start_index > 1 then
@@ -78,7 +78,9 @@ function CustomTagsToLinks(el, prefix)

      local id_prefix = ""
      local html_filename = ""
      if prefix ~= "Clause" then
      if prefix == "Figure" and #parts == 2  and (id:find("^[%w_-]+%.png$") or id:find("^[%w_-]+%.jpg$") or id:find("^[%w_-]+%.jpeg$") or id:find("^[%w_-]+%.svg$")) then
        id_prefix = "Figure+++"
      elseif prefix ~= "Clause" then
        id_prefix = prefix .. "_"
      end
      if filename then
+113 −24
Original line number Diff line number Diff line
@@ -196,7 +196,8 @@ def format_examples_and_notes(soup: BeautifulSoup):

    def get_label_text_and_class(para: Tag):
        """Get the label text from the paragraph and determine the class to assign to the div"""
        text = para.get_text()
        text = para.contents[0].split(":")[0] + ":"
        remaining_text = para.contents[0].split(": ")[1] if ": " in para.contents[0] else ""
        cls = ""

        if "[!tip]" in text:
@@ -204,9 +205,12 @@ def format_examples_and_notes(soup: BeautifulSoup):
        else:
            cls = "TAN" if para.find_parent("td") else "NO"

        text = text.replace("[!note]", "").replace("[!tip]", "").strip()
        text = text.replace("[!note] ", "").replace("[!tip] ", "")

        return text, cls
        remaining_contents = para.contents[1:]
        if remaining_text:
            remaining_contents.insert(0, NavigableString(remaining_text))
        return text, cls, remaining_contents

    # Take only the top-level blockquotes to simplify logic
    blockquotes = [
@@ -219,7 +223,7 @@ def format_examples_and_notes(soup: BeautifulSoup):
        if not label_para:
            continue

        label_text, label_class = get_label_text_and_class(label_para)
        label_text, label_class, remaining_contents = get_label_text_and_class(label_para)

        new_parent_div = soup.new_tag("div", attrs={"class": label_class})

@@ -234,7 +238,12 @@ def format_examples_and_notes(soup: BeautifulSoup):

        # Process body
        body_div = soup.new_tag("div")

        if remaining_contents: # this happens when there is not an empty line after the [!note] or [!tip], better to do here because in md we have gridtables to take care of
            para_container = soup.new_tag("p")
            for content in remaining_contents:
                para_container.append(content)
            body_div.append(para_container)
        else:
            current = blockquote.next_sibling

            while current:
@@ -413,11 +422,23 @@ def remove_links_from_labels(soup: BeautifulSoup):
    for label in labels:
        a_tag = label.find("a")
        if a_tag:
            id = a_tag.get("href").split("#")[-1]
            label.attrs["id"] = id
            a_tag.unwrap()
    return soup

def add_ids_to_labels(soup: BeautifulSoup):
    """
    Add ids to figure and table label elements if they don't have one.

    Every ``div`` with class ``TF`` or ``TH`` that has no ``id`` and whose text
    starts with ``Figure`` or ``Table`` receives the id ``Figure_<n>`` or
    ``Table_<n>``, where ``<n>`` is the second whitespace-separated word of the
    text before the first colon (e.g. ``"Figure 3: Overview"`` -> ``Figure_3``).

    Labels that carry no number after the keyword are left untouched instead of
    raising ``IndexError``.

    Returns the same soup object, modified in place.
    """
    labels = soup.find_all("div", class_=["TF", "TH"])
    for label in labels:
        if label.get("id"):
            continue

        label_text = label.get_text().strip()
        if label_text.startswith("Figure"):
            kind = "Figure"
        elif label_text.startswith("Table"):
            kind = "Table"
        else:
            continue

        # split() without arguments tolerates repeated or non-breaking spaces
        # between the keyword and the number.
        words = label_text.split(":")[0].split()
        if len(words) < 2:
            continue

        label.attrs["id"] = f"{kind}_{words[1]}"
    return soup

def move_figure_id_to_FL_elements(soup: BeautifulSoup):
    """
@@ -487,7 +508,55 @@ def fix_custom_tags(soup: BeautifulSoup):
                os._exit(1)
    return soup

# endregion
def extract_images_from_html(soup: BeautifulSoup) -> tuple:
    """
    Collect the images contained in the identified figure divs of a document.

    Every ``div`` with class ``FL`` that has a non-empty ``id`` and contains an
    ``img`` with a non-empty ``src`` contributes one entry.

    Args:
        soup (BeautifulSoup): The parsed HTML document. It is not modified.

    Returns:
        tuple: ``(images_mapping, soup)`` where ``images_mapping`` maps each
        figure id to the image ``src`` with every ``"media/"`` occurrence
        removed, and ``soup`` is the object that was passed in.
    """
    images_mapping = {}

    for fig in soup.find_all("div", class_="FL"):
        fig_id = fig.get("id", "")
        if not fig_id:
            continue

        img = fig.find("img")
        if not img:
            continue

        src = img.get("src", "").replace("media/", "")
        # An image without a source can never be referenced by filename, so it
        # is skipped rather than registered under an empty key.
        if not src:
            continue

        images_mapping[fig_id] = src

    return images_mapping, soup

def add_custom_link_to_images(soup: BeautifulSoup, images_mapping: dict) -> BeautifulSoup:
    """
    Resolve ``Figure+++<filename>`` links into links to the actual figure.

    Every ``a`` element whose ``href`` contains ``Figure+++`` is rewritten: the
    ``href`` becomes ``<file>#<id>`` and the link text becomes ``figure <n>``,
    where ``<n>`` is the part of the figure id after its first underscore.

    Args:
        soup (BeautifulSoup): The parsed HTML document, modified in place.
        images_mapping (dict): Maps an image filename to a dict with the keys
            ``"id"`` (id of the figure element) and ``"file"`` (HTML file that
            contains the figure).

    Returns:
        BeautifulSoup: The same soup object.

    Raises:
        ValueError: If a link references a filename missing from
            ``images_mapping``.
    """
    # Look for links that match the pattern Figure+++<filename>
    for a in soup.find_all("a"):
        href = a.get("href", "")
        if "Figure+++" not in href:
            continue

        # Extract the filename from the href
        filename = href.split("+++")[1]
        if filename not in images_mapping:
            raise ValueError(f"ERROR: Image '{filename}' not found in images mapping. Are you sure it exists in the media folder and is used in the document?")

        image_info = images_mapping[filename]
        figure_id = image_info["id"]
        id_parts = figure_id.split("_")
        # Ids are expected to look like "Figure_<n>"; fall back to the whole id
        # instead of raising IndexError when there is no underscore.
        figure_number = id_parts[1] if len(id_parts) > 1 else figure_id

        a["href"] = f"{image_info['file']}#{figure_id}"
        a.string = f"figure {figure_number}"

    return soup


def postprocess(html_dir: str):
@@ -502,13 +571,8 @@ def postprocess(html_dir: str):
    ### Arguments
    - `html_dir`:
    """

    try: 
        os.remove("filename_numbers_mapping.json")
    except FileNotFoundError:
        pass

    filenames_mapping = get_dirty_filenames_mapping_with_expected_filenames(html_dir)
    images_mapping = {}

    for filename in os.listdir(html_dir):
        if filename.endswith(".html"):
@@ -544,8 +608,33 @@ def postprocess(html_dir: str):
        soup = format_examples_and_notes(soup)

        soup = remove_links_from_labels(soup)
        soup = add_ids_to_labels(soup)
        soup = move_figure_id_to_FL_elements(soup)
        soup = fix_custom_tags(soup)
        images, soup = extract_images_from_html(soup)
        for image_id, image_src in images.items():
            images_mapping[image_src] = {
                "id": image_id,
                "file": new_filename
            }

        contents = soup.decode_contents(formatter=None)

        with open(file_path, "w", encoding="utf-8") as html:
            html.write(contents)

    for filename in os.listdir(html_dir):
        if filename.endswith(".html"):
            file_path = os.path.join(html_dir, filename)
            with open(file_path, "r", encoding="utf-8") as html:
                soup = BeautifulSoup(html, "html.parser")
            
            try:
                soup = add_custom_link_to_images(soup, images_mapping)
            except ValueError as e:
                print(p_error(f"Error in file {filename}:"))
                print(p_error(str(e)))
                os._exit(1)

            contents = soup.decode_contents(formatter=None)

+31 −0
Original line number Diff line number Diff line
@@ -115,6 +115,35 @@ def run_format_checks(filename: str, file_lines: list[str]):
    check_divs()


def add_divs_to_images_tables(text: str) -> str:
    """Add divs around images and their captions, and tables captions to the ones defined using the ETSI guidelines.

    Line by line:
    - a line starting with an image definition ``![...](...)`` is wrapped in a
      ``::: FL`` fenced div;
    - a line starting with ``**Figure`` is wrapped in ``::: TF`` and every
      ``**`` is removed from it;
    - a line starting with ``**Table`` is wrapped in ``::: TH`` and every
      ``**`` is removed from it;
    - any other line is copied unchanged.

    Lines inside fenced code blocks (opened by three or more backticks or
    tildes) are copied verbatim so literal code is never altered.

    Returns the processed text, always terminated by one extra newline.
    """
    TABLE_CAPTION_REGEX = r"^\*\*Table"
    IMAGE_CAPTION_REGEX = r"^\*\*Figure"
    IMAGE_DEF_REGEX = r"^!\[.*\]\(.*\)"
    # A backtick fence may not contain further backticks on the same line,
    # which keeps inline code such as ```x``` from being mistaken for a fence.
    CODE_FENCE_REGEX = r"^ {0,3}(`{3,}(?!.*`)|~{3,})"

    new_file_lines = []
    open_fence = None  # fence marker of the code block being read, if any

    for line in text.split("\n"):
        fence = re.match(CODE_FENCE_REGEX, line)

        if open_fence is not None:
            new_file_lines.append(line)
            # A block closes on a bare fence of the same character that is at
            # least as long as the opening one.
            if (
                fence
                and fence.group(1)[0] == open_fence[0]
                and len(fence.group(1)) >= len(open_fence)
                and line.strip() == fence.group(1)
            ):
                open_fence = None
            continue

        if fence:
            open_fence = fence.group(1)
            new_file_lines.append(line)
            continue

        if re.match(IMAGE_DEF_REGEX, line):
            # If the line is an image definition, add divs around it
            new_file_lines.extend(["::: FL", line, ":::"])
        elif re.match(IMAGE_CAPTION_REGEX, line):
            # If the line is an image caption, add divs around it
            new_file_lines.extend(["::: TF", line.replace("**", ""), ":::"])
        elif re.match(TABLE_CAPTION_REGEX, line):
            # If the line is a table caption, add divs around it
            new_file_lines.extend(["::: TH", line.replace("**", ""), ":::"])
        else:
            new_file_lines.append(line)

    return "\n".join(new_file_lines) + "\n"

def handle_less_than_greater_than_text(file_contents: str):
    """Replace `<` and `>` with `&lt;` and `&gt;` respectively and wrap the whole section in single code ticks to allow the text to render in the HTML"""
    regex = r"\<(?!img\b|span\b|sup|/sup)(.+?)\>"
@@ -465,6 +494,8 @@ def preprocess(

                run_format_checks(filename, text.splitlines())

                text = add_divs_to_images_tables(text)

                if filename in clauses_filenames:
                    text = auto_number_content(text, "clauses")
                    filename_numbers_mapping[filename_without_extension] = (
+46 −0
Original line number Diff line number Diff line
@@ -43,6 +43,7 @@ def ensure_correct_css_class_use(soup: BeautifulSoup):
            "ondemand_CHAR_size_9_color_000000",
            "ondemand_CHAR_size_12_color_000000",
            "example-title",
            "block"
        ]

        divs_to_unwrap = soup.find_all(
@@ -81,6 +82,10 @@ def ensure_correct_css_class_use(soup: BeautifulSoup):
                "ondemand_CHAR_name_Times_New_Roman_size_10_color_595959",
                "HTML_Keyboard",
            ],
            [
                "ETSI-code_Char",
                "HTML_Sample"
            ],
        ]

        for i in range(len(class_pairs)):
@@ -1128,6 +1133,43 @@ def fix_references(soup: BeautifulSoup):
            text_element.replace_with(new_text)
    return soup

def remove_style_from_images(soup: BeautifulSoup):
    """Removes the ``style`` and ``alt`` attributes from images and drops styled wrapper spans.

    For every ``img`` element:
    - its ``style`` and ``alt`` attributes are deleted when present;
    - if its parent is a ``span`` with a ``style`` attribute and the image is
      the span's only child (whitespace-only strings are ignored), the span is
      replaced by the image.

    A styled span that holds anything besides the image is left in place so
    that its other content is not discarded.

    Returns the same soup object, modified in place.
    """
    for img in soup.find_all("img"):
        for attribute in ("style", "alt"):
            if img.has_attr(attribute):
                del img[attribute]

        parent_tag = img.parent
        if not (parent_tag and parent_tag.name == "span" and parent_tag.has_attr("style")):
            continue

        # Text nodes are str instances; ignore the ones that are pure whitespace.
        meaningful_children = [
            child
            for child in parent_tag.contents
            if not (isinstance(child, str) and not child.strip())
        ]
        if len(meaningful_children) == 1 and meaningful_children[0] is img:
            # remove the parent span and leave the img
            parent_tag.replace_with(img)

    return soup

def handle_figures_tables_structure(soup: BeautifulSoup):
    """Ensures that figures and tables captions are properly structured to follow ETSI guidelines

    - A ``div`` whose class list is exactly ``FL`` and whose only
      non-whitespace child is an ``img`` is replaced by that image.
    - A ``div`` whose class list is exactly ``TF`` or ``TH`` and that has text
      is replaced by a ``p`` holding ``**<text>**``. Whitespace runs in the
      text are collapsed to single spaces, so words split across inline
      elements keep their separating space.

    Returns the same soup object, modified in place.
    """
    fl_divs = soup.find_all("div", class_="FL")
    tf_divs = soup.find_all("div", class_="TF")
    th_divs = soup.find_all("div", class_="TH")

    for div in fl_divs + tf_divs + th_divs:
        classes = div.get("class")
        if classes == ["FL"]:
            div_contents = [content for content in div.contents if not is_whitespace_navstr(content)]
            if len(div_contents) == 1 and isinstance(div_contents[0], Tag) and div_contents[0].name == "img":
                img = div_contents[0]
                div.replace_with(img)
        elif classes == ["TF"] or classes == ["TH"]:
            # add leading and trailing ** to the text in the div
            # get_text(strip=True) would strip each text node before joining
            # them with no separator, fusing adjacent words; normalise the
            # whitespace of the full text instead.
            text = " ".join(div.get_text().split())
            if text:
                new_tag = soup.new_tag("p")
                new_tag.string = f"**{text}**"
                div.replace_with(new_tag)

    return soup
# endregion


@@ -1168,4 +1210,8 @@ def cleaning(soup: BeautifulSoup, css_src: bool):

    soup = fix_references(soup)

    soup = remove_style_from_images(soup)

    soup = handle_figures_tables_structure(soup)

    return soup
Loading