Loading md_to_docx_converter/src/to_md/cleaning.py +22 −21 Original line number Original line Diff line number Diff line Loading @@ -692,34 +692,35 @@ def format_lists(soup: BeautifulSoup): # Add group elements to the list as list items # Add group elements to the list as list items for elem in group: for elem in group: li = soup.new_tag("li") li = soup.new_tag("li") is_first_text = True # Iterate over a copy to avoid modifying the list while iterating for elem_child in elem.contents[:]: if isinstance(elem_child, NavigableString): text = str(elem_child) for elem_child in elem.contents: # Remove bullet/numbering only from the first text element # Strip bullets/numbering from child text, if applicable if is_first_text: if not elem_child.get_text().strip(): # Strip leading whitespace for the first element continue # No text text = text.lstrip() child_text = elem_child.get_text().strip() index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", child_text) bullet_portion = re.search(r"^(\-\s+)", child_text) match = None if index_portion: bullet_portion = re.search(r"^(\-\s+)", text) match = index_portion.group(0) index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", text) elif bullet_portion: if bullet_portion: match = bullet_portion.group(0) text = text[len(bullet_portion.group(0)) :] elif index_portion: text = text[len(index_portion.group(0)) :] # Set child text to the stripped text, if applicable is_first_text = False child_text = child_text[len(match) :] if match else child_text # Add the child to the list item # Only append non-empty strings (but preserve whitespace) if isinstance(elem_child, NavigableString): if text: li.append(NavigableString(child_text)) li.append(NavigableString(text)) elif isinstance(elem_child, Tag): elif isinstance(elem_child, Tag): # Extract and preserve HTML structure including spans li.append(elem_child.extract()) li.append(elem_child.extract()) is_first_text = False new_list.append(li) new_list.append(li) Loading Loading
md_to_docx_converter/src/to_md/cleaning.py +22 −21 Original line number Original line Diff line number Diff line Loading @@ -692,34 +692,35 @@ def format_lists(soup: BeautifulSoup): # Add group elements to the list as list items # Add group elements to the list as list items for elem in group: for elem in group: li = soup.new_tag("li") li = soup.new_tag("li") is_first_text = True # Iterate over a copy to avoid modifying the list while iterating for elem_child in elem.contents[:]: if isinstance(elem_child, NavigableString): text = str(elem_child) for elem_child in elem.contents: # Remove bullet/numbering only from the first text element # Strip bullets/numbering from child text, if applicable if is_first_text: if not elem_child.get_text().strip(): # Strip leading whitespace for the first element continue # No text text = text.lstrip() child_text = elem_child.get_text().strip() index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", child_text) bullet_portion = re.search(r"^(\-\s+)", child_text) match = None if index_portion: bullet_portion = re.search(r"^(\-\s+)", text) match = index_portion.group(0) index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", text) elif bullet_portion: if bullet_portion: match = bullet_portion.group(0) text = text[len(bullet_portion.group(0)) :] elif index_portion: text = text[len(index_portion.group(0)) :] # Set child text to the stripped text, if applicable is_first_text = False child_text = child_text[len(match) :] if match else child_text # Add the child to the list item # Only append non-empty strings (but preserve whitespace) if isinstance(elem_child, NavigableString): if text: li.append(NavigableString(child_text)) li.append(NavigableString(text)) elif isinstance(elem_child, Tag): elif isinstance(elem_child, Tag): # Extract and preserve HTML structure including spans li.append(elem_child.extract()) li.append(elem_child.extract()) is_first_text = False new_list.append(li) new_list.append(li) Loading