Commit 7338f9c2 authored by Marco Cavalli
Browse files

fix: text in nested lists is not removed when importing from DOCX

parent 29a5cc93
Loading
Loading
Loading
Loading
+22 −21
Original line number Diff line number Diff line
@@ -692,34 +692,35 @@ def format_lists(soup: BeautifulSoup):
            # Add group elements to the list as list items
            for elem in group:
                li = soup.new_tag("li")
                is_first_text = True
                # Iterate over a copy to avoid modifying the list while iterating
                for elem_child in elem.contents[:]:
                    if isinstance(elem_child, NavigableString):
                        text = str(elem_child)

                for elem_child in elem.contents:
                    # Strip bullets/numbering from child text, if applicable
                    if not elem_child.get_text().strip():
                        continue  # No text

                    child_text = elem_child.get_text().strip()

                    index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", child_text)
                    bullet_portion = re.search(r"^(\-\s+)", child_text)

                    match = None
                        # Remove bullet/numbering only from the first text element
                        if is_first_text:
                            # Strip leading whitespace for the first element
                            text = text.lstrip()

                    if index_portion:
                        match = index_portion.group(0)
                            bullet_portion = re.search(r"^(\-\s+)", text)
                            index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", text)

                    elif bullet_portion:
                        match = bullet_portion.group(0)
                            if bullet_portion:
                                text = text[len(bullet_portion.group(0)) :]
                            elif index_portion:
                                text = text[len(index_portion.group(0)) :]

                    # Set child text to the stripped text, if applicable
                    child_text = child_text[len(match) :] if match else child_text
                            is_first_text = False

                    # Add the child to the list item
                    if isinstance(elem_child, NavigableString):
                        li.append(NavigableString(child_text))
                        # Only append non-empty strings (but preserve whitespace)
                        if text:
                            li.append(NavigableString(text))

                    elif isinstance(elem_child, Tag):
                        # Extract and preserve HTML structure including spans
                        li.append(elem_child.extract())
                        is_first_text = False

                new_list.append(li)