Loading md_to_docx_converter/src/to_md/cleaning.py +22 −21 Original line number Diff line number Diff line Loading @@ -692,34 +692,35 @@ def format_lists(soup: BeautifulSoup): # Add group elements to the list as list items for elem in group: li = soup.new_tag("li") is_first_text = True # Iterate over a copy to avoid modifying the list while iterating for elem_child in elem.contents[:]: if isinstance(elem_child, NavigableString): text = str(elem_child) for elem_child in elem.contents: # Strip bullets/numbering from child text, if applicable if not elem_child.get_text().strip(): continue # No text child_text = elem_child.get_text().strip() index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", child_text) bullet_portion = re.search(r"^(\-\s+)", child_text) match = None # Remove bullet/numbering only from the first text element if is_first_text: # Strip leading whitespace for the first element text = text.lstrip() if index_portion: match = index_portion.group(0) bullet_portion = re.search(r"^(\-\s+)", text) index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", text) elif bullet_portion: match = bullet_portion.group(0) if bullet_portion: text = text[len(bullet_portion.group(0)) :] elif index_portion: text = text[len(index_portion.group(0)) :] # Set child text to the stripped text, if applicable child_text = child_text[len(match) :] if match else child_text is_first_text = False # Add the child to the list item if isinstance(elem_child, NavigableString): li.append(NavigableString(child_text)) # Only append non-empty strings (but preserve whitespace) if text: li.append(NavigableString(text)) elif isinstance(elem_child, Tag): # Extract and preserve HTML structure including spans li.append(elem_child.extract()) is_first_text = False new_list.append(li) Loading Loading
md_to_docx_converter/src/to_md/cleaning.py +22 −21 Original line number Diff line number Diff line Loading @@ -692,34 +692,35 @@ def format_lists(soup: BeautifulSoup): # Add group elements to the list as list items for elem in group: li = soup.new_tag("li") is_first_text = True # Iterate over a copy to avoid modifying the list while iterating for elem_child in elem.contents[:]: if isinstance(elem_child, NavigableString): text = str(elem_child) for elem_child in elem.contents: # Strip bullets/numbering from child text, if applicable if not elem_child.get_text().strip(): continue # No text child_text = elem_child.get_text().strip() index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", child_text) bullet_portion = re.search(r"^(\-\s+)", child_text) match = None # Remove bullet/numbering only from the first text element if is_first_text: # Strip leading whitespace for the first element text = text.lstrip() if index_portion: match = index_portion.group(0) bullet_portion = re.search(r"^(\-\s+)", text) index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", text) elif bullet_portion: match = bullet_portion.group(0) if bullet_portion: text = text[len(bullet_portion.group(0)) :] elif index_portion: text = text[len(index_portion.group(0)) :] # Set child text to the stripped text, if applicable child_text = child_text[len(match) :] if match else child_text is_first_text = False # Add the child to the list item if isinstance(elem_child, NavigableString): li.append(NavigableString(child_text)) # Only append non-empty strings (but preserve whitespace) if text: li.append(NavigableString(text)) elif isinstance(elem_child, Tag): # Extract and preserve HTML structure including spans li.append(elem_child.extract()) is_first_text = False new_list.append(li) Loading