fix: text in nested lists is no longer removed when importing from DOCX (7338f9c2) · Commits · CIM - Context Information Management / NGSI-LD API

md_to_docx_converter/src/to_md/cleaning.py

+22 −21

Original line number	Diff line number	Diff line
		@@ -692,34 +692,35 @@ def format_lists(soup: BeautifulSoup):
		# Add group elements to the list as list items
		for elem in group:
		li = soup.new_tag("li")
		is_first_text = True
		# Iterate over a copy to avoid modifying the list while iterating
		for elem_child in elem.contents[:]:
		if isinstance(elem_child, NavigableString):
		text = str(elem_child)

		for elem_child in elem.contents:
		# Strip bullets/numbering from child text, if applicable
		if not elem_child.get_text().strip():
		continue # No text

		child_text = elem_child.get_text().strip()

		index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", child_text)
		bullet_portion = re.search(r"^(\-\s+)", child_text)

		match = None
		# Remove bullet/numbering only from the first text element
		if is_first_text:
		# Strip leading whitespace for the first element
		text = text.lstrip()

		if index_portion:
		match = index_portion.group(0)
		bullet_portion = re.search(r"^(\-\s+)", text)
		index_portion = re.search(r"^([A-Za-z0-9]+[\.\)]\s+)", text)

		elif bullet_portion:
		match = bullet_portion.group(0)
		if bullet_portion:
		text = text[len(bullet_portion.group(0)) :]
		elif index_portion:
		text = text[len(index_portion.group(0)) :]

		# Set child text to the stripped text, if applicable
		child_text = child_text[len(match) :] if match else child_text
		is_first_text = False

		# Add the child to the list item
		if isinstance(elem_child, NavigableString):
		li.append(NavigableString(child_text))
		# Only append non-empty strings (but preserve whitespace)
		if text:
		li.append(NavigableString(text))

		elif isinstance(elem_child, Tag):
		# Extract and preserve HTML structure including spans
		li.append(elem_child.extract())
		is_first_text = False

		new_list.append(li)