Commit 7338f9c2 authored by Marco Cavalli
Browse files

fix: text in nested lists is not removed when importing from DOCX

parent 29a5cc93
Loading
Loading
Loading
Loading
+22 −21
Original line number Original line Diff line number Diff line
@@ -692,34 +692,35 @@ def format_lists(soup: BeautifulSoup):
            # Add group elements to the list as list items
            # Add group elements to the list as list items
            for elem in group:
            for elem in group:
                li = soup.new_tag("li")
                li = soup.new_tag("li")
                is_first_text = True
                # Iterate over a copy to avoid modifying the list while iterating
                for elem_child in elem.contents[:]:
                    if isinstance(elem_child, NavigableString):
                        text = str(elem_child)


                for elem_child in elem.contents:
                        # Remove bullet/numbering only from the first text element
                    # Strip bullets/numbering from child text, if applicable
                        if is_first_text:
                    if not elem_child.get_text().strip():
                            # Strip leading whitespace for the first element
                        continue  # No text
                            text = text.lstrip()

                    child_text = elem_child.get_text().strip()

                    index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", child_text)
                    bullet_portion = re.search(r"^(\-\s+)", child_text)

                    match = None


                    if index_portion:
                            bullet_portion = re.search(r"^(\-\s+)", text)
                        match = index_portion.group(0)
                            index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", text)


                    elif bullet_portion:
                            if bullet_portion:
                        match = bullet_portion.group(0)
                                text = text[len(bullet_portion.group(0)) :]
                            elif index_portion:
                                text = text[len(index_portion.group(0)) :]


                    # Set child text to the stripped text, if applicable
                            is_first_text = False
                    child_text = child_text[len(match) :] if match else child_text


                    # Add the child to the list item
                        # Only append non-empty strings (but preserve whitespace)
                    if isinstance(elem_child, NavigableString):
                        if text:
                        li.append(NavigableString(child_text))
                            li.append(NavigableString(text))


                    elif isinstance(elem_child, Tag):
                    elif isinstance(elem_child, Tag):
                        # Extract and preserve HTML structure including spans
                        li.append(elem_child.extract())
                        li.append(elem_child.extract())
                        is_first_text = False


                new_list.append(li)
                new_list.append(li)