Loading md_to_docx_converter/src/to_html/postprocessing.py +19 −0 Original line number Diff line number Diff line Loading @@ -440,6 +440,24 @@ def add_ids_to_labels(soup: BeautifulSoup): label.attrs["id"] = f"Table_{id}" return soup def replace_dash_characters(soup: BeautifulSoup): """ Replace dash characters in the a_tags and ids with the correct ones. """ a_tags = soup.find_all("a") for a in a_tags: if a.string: a.string = a.string.replace("‑", "-").replace("—", "-") href = a.get("href", "") if href: a["href"] = href.replace("‑", "-").replace("—", "-") ids = soup.find_all(id=True) for element in ids: id = element.get("id", "") if id: element["id"] = id.replace("‑", "-").replace("—", "-") return soup def move_figure_id_to_FL_elements(soup: BeautifulSoup): """ Move the id attributes from figure elements to their parent FL elements. Loading Loading @@ -653,6 +671,7 @@ def postprocess(html_dir: str): soup = remove_links_from_labels(soup) soup = add_ids_to_labels(soup) soup = replace_dash_characters(soup) soup = move_figure_id_to_FL_elements(soup) soup = fix_custom_tags(soup) images, soup = extract_images_from_html(soup) Loading md_to_docx_converter/src/to_html/preprocessing.py +42 −28 Original line number Diff line number Diff line Loading @@ -59,7 +59,7 @@ def run_format_checks(filename: str, file_lines: list[str]): line_num = i + 1 bad_div_delin_match = re.match(BAD_DIV_DELINEATOR_REGEX, line) if bad_div_delin_match and line.startswith(":::") is False: if bad_div_delin_match and line.find(":::") == -1: # This div delineator doesn't have exactly three colons `:::` print( p_error( Loading Loading @@ -282,8 +282,8 @@ def auto_number_content( def auto_number_example(line: str) -> str: global example_counter new_line = line if "EXAMPLE" not in line: example_counter += 1 if "EXAMPLE" not in line: if ( example_counter != 1 ): # if it is one the number can be omitted, need to check later Loading @@ -295,8 +295,8 @@ def auto_number_content( def auto_number_note(line: str) -> str: global note_counter new_line = line if "NOTE" not in line: note_counter += 1 if "NOTE" not in line: if ( note_counter != 1 ): # if it is one the number can be omitted, need to check later Loading Loading @@ -329,11 +329,18 @@ def auto_number_content( global note_in_table_counter new_line = line note_in_table_counter += 1 if "NOTE" not in line: if ( note_in_table_counter != 1 ): # if it is one the number can be omitted, need to check later new_text = f"| >>> [!note] NOTE {note_in_table_counter}:" text_to_replace = "| >>> [!note]" diff_in_length = len(new_text) - len(text_to_replace) # ensure we keep the table formatting by adding spaces at the end of the line if needed if diff_in_length > 0: text_to_replace = text_to_replace + " " * diff_in_length new_line = line.replace( ">>> [!note]", f">>> [!note] NOTE {note_in_table_counter}:" text_to_replace, new_text ) return new_line Loading Loading @@ -383,14 +390,17 @@ def auto_number_content( new_line = auto_number_table(new_line) if note_in_table_counter >= 1 and first_note_in_table_line_index != -1: note_string = f" NOTE{' 1' if note_in_table_counter > 1 else ''}:" first_index_after_bracket = lines[first_note_in_table_line_index].find( "[!note]" ) lines[first_note_in_table_line_index] = ( lines[first_note_in_table_line_index][:first_index_after_bracket] + note_string + lines[first_note_in_table_line_index][first_index_after_bracket:] note_string = f"| >>> [!note] NOTE{' 1' if note_in_table_counter > 1 else ''}:" note_string_length = len(note_string) text_to_be_replaced = "| >>> [!note]" text_to_be_replaced_length = len(text_to_be_replaced) diff_in_length = note_string_length - text_to_be_replaced_length if diff_in_length > 0: text_to_be_replaced = text_to_be_replaced + " " * diff_in_length lines[first_note_in_table_line_index] = lines[ first_note_in_table_line_index ].replace( text_to_be_replaced, note_string ) note_in_table_counter = 0 first_note_in_table_line_index = -1 Loading @@ -414,14 +424,18 @@ def auto_number_content( lines[first_note_line_index] += f" NOTE{' 1' if note_counter > 1 else ''}:" if note_in_table_counter >= 1 and first_note_in_table_line_index != -1: note_string = f" NOTE{' 1' if note_in_table_counter > 1 else ''}:" first_index_after_bracket = lines[first_note_in_table_line_index].find( "[!note]" ) lines[first_note_in_table_line_index] = ( lines[first_note_in_table_line_index][:first_index_after_bracket] + note_string + lines[first_note_in_table_line_index][first_index_after_bracket:] print(f"note_in_table_counter: {note_in_table_counter}") note_string = f"| >>> [!note] NOTE{' 1' if note_in_table_counter > 1 else ''}:" note_string_length = len(note_string) text_to_be_replaced = "| >>> [!note]" text_to_be_replaced_length = len(text_to_be_replaced) diff_in_length = note_string_length - text_to_be_replaced_length if diff_in_length > 0: text_to_be_replaced = text_to_be_replaced + " " * diff_in_length lines[first_note_in_table_line_index] = lines[ first_note_in_table_line_index ].replace( text_to_be_replaced, note_string ) file_contents = "\n".join(lines) + "\n" Loading Loading
md_to_docx_converter/src/to_html/postprocessing.py +19 −0 Original line number Diff line number Diff line Loading @@ -440,6 +440,24 @@ def add_ids_to_labels(soup: BeautifulSoup): label.attrs["id"] = f"Table_{id}" return soup def replace_dash_characters(soup: BeautifulSoup): """ Replace dash characters in the a_tags and ids with the correct ones. """ a_tags = soup.find_all("a") for a in a_tags: if a.string: a.string = a.string.replace("‑", "-").replace("—", "-") href = a.get("href", "") if href: a["href"] = href.replace("‑", "-").replace("—", "-") ids = soup.find_all(id=True) for element in ids: id = element.get("id", "") if id: element["id"] = id.replace("‑", "-").replace("—", "-") return soup def move_figure_id_to_FL_elements(soup: BeautifulSoup): """ Move the id attributes from figure elements to their parent FL elements. Loading Loading @@ -653,6 +671,7 @@ def postprocess(html_dir: str): soup = remove_links_from_labels(soup) soup = add_ids_to_labels(soup) soup = replace_dash_characters(soup) soup = move_figure_id_to_FL_elements(soup) soup = fix_custom_tags(soup) images, soup = extract_images_from_html(soup) Loading
md_to_docx_converter/src/to_html/preprocessing.py +42 −28 Original line number Diff line number Diff line Loading @@ -59,7 +59,7 @@ def run_format_checks(filename: str, file_lines: list[str]): line_num = i + 1 bad_div_delin_match = re.match(BAD_DIV_DELINEATOR_REGEX, line) if bad_div_delin_match and line.startswith(":::") is False: if bad_div_delin_match and line.find(":::") == -1: # This div delineator doesn't have exactly three colons `:::` print( p_error( Loading Loading @@ -282,8 +282,8 @@ def auto_number_content( def auto_number_example(line: str) -> str: global example_counter new_line = line if "EXAMPLE" not in line: example_counter += 1 if "EXAMPLE" not in line: if ( example_counter != 1 ): # if it is one the number can be omitted, need to check later Loading @@ -295,8 +295,8 @@ def auto_number_content( def auto_number_note(line: str) -> str: global note_counter new_line = line if "NOTE" not in line: note_counter += 1 if "NOTE" not in line: if ( note_counter != 1 ): # if it is one the number can be omitted, need to check later Loading Loading @@ -329,11 +329,18 @@ def auto_number_content( global note_in_table_counter new_line = line note_in_table_counter += 1 if "NOTE" not in line: if ( note_in_table_counter != 1 ): # if it is one the number can be omitted, need to check later new_text = f"| >>> [!note] NOTE {note_in_table_counter}:" text_to_replace = "| >>> [!note]" diff_in_length = len(new_text) - len(text_to_replace) # ensure we keep the table formatting by adding spaces at the end of the line if needed if diff_in_length > 0: text_to_replace = text_to_replace + " " * diff_in_length new_line = line.replace( ">>> [!note]", f">>> [!note] NOTE {note_in_table_counter}:" text_to_replace, new_text ) return new_line Loading Loading @@ -383,14 +390,17 @@ def auto_number_content( new_line = auto_number_table(new_line) if note_in_table_counter >= 1 and first_note_in_table_line_index != -1: note_string = f" NOTE{' 1' if note_in_table_counter > 1 else ''}:" first_index_after_bracket = lines[first_note_in_table_line_index].find( "[!note]" ) lines[first_note_in_table_line_index] = ( lines[first_note_in_table_line_index][:first_index_after_bracket] + note_string + lines[first_note_in_table_line_index][first_index_after_bracket:] note_string = f"| >>> [!note] NOTE{' 1' if note_in_table_counter > 1 else ''}:" note_string_length = len(note_string) text_to_be_replaced = "| >>> [!note]" text_to_be_replaced_length = len(text_to_be_replaced) diff_in_length = note_string_length - text_to_be_replaced_length if diff_in_length > 0: text_to_be_replaced = text_to_be_replaced + " " * diff_in_length lines[first_note_in_table_line_index] = lines[ first_note_in_table_line_index ].replace( text_to_be_replaced, note_string ) note_in_table_counter = 0 first_note_in_table_line_index = -1 Loading @@ -414,14 +424,18 @@ def auto_number_content( lines[first_note_line_index] += f" NOTE{' 1' if note_counter > 1 else ''}:" if note_in_table_counter >= 1 and first_note_in_table_line_index != -1: note_string = f" NOTE{' 1' if note_in_table_counter > 1 else ''}:" first_index_after_bracket = lines[first_note_in_table_line_index].find( "[!note]" ) lines[first_note_in_table_line_index] = ( lines[first_note_in_table_line_index][:first_index_after_bracket] + note_string + lines[first_note_in_table_line_index][first_index_after_bracket:] print(f"note_in_table_counter: {note_in_table_counter}") note_string = f"| >>> [!note] NOTE{' 1' if note_in_table_counter > 1 else ''}:" note_string_length = len(note_string) text_to_be_replaced = "| >>> [!note]" text_to_be_replaced_length = len(text_to_be_replaced) diff_in_length = note_string_length - text_to_be_replaced_length if diff_in_length > 0: text_to_be_replaced = text_to_be_replaced + " " * diff_in_length lines[first_note_in_table_line_index] = lines[ first_note_in_table_line_index ].replace( text_to_be_replaced, note_string ) file_contents = "\n".join(lines) + "\n" Loading