Commit 4bfb77b7 authored by ankraft's avatar ankraft
Browse files

Improved support for handling grid tables (reduced width when possible,...

Improved support for handling grid tables (reduced width when possible, support rowspans). However, rogue empty cells cannot always be detected.
parent e884d69b
Loading
Loading
Loading
Loading
+118 −28
Original line number Diff line number Diff line
@@ -9,6 +9,7 @@
import re

colspanMarker = '~~COLSPAN~~'
rowspanMarker = '~~ROWSPAN~~'

def markdownToGrid(markdownLines:list[str]) -> list[str]:
	"""	Convert a markdown table to a grid table. 
@@ -34,7 +35,6 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
		for line in markdownLines
	]


	# Get maximum width for each column 
	colWidths = []
	maxCols = max(len(row) for row in rows)
@@ -45,12 +45,11 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:

	# Process merged cells - combine content with previous cell
	for row in rows:
		for i in range(len(row)-1, 0, -1): # Work backwards to avoid index issues
		for i in range(len(row)-1, -1, -1): # Work backwards to avoid index issues
			if row[i].strip() == colspanMarker:
				row[i-1] = row[i-1] + ' '*(colWidths[i-1] - len(row[i-1]))+ ' '*(colWidths[i]+3) # Merge with empty content
				# row[i] = None 	# type:ignore[call-overload] # Indicate removal

	
	# Pad any rows that are too short
	for row in rows:
		while len(row) < maxCols:
@@ -71,12 +70,34 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
	result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')
	
	# Data rows
	for row in rows[2:]:
	for rowIndex, row in enumerate(rows[2:]):

		# The following code detects if cells in the next row have rowspan marker(s)
		# If so, it will merge the cells with the current one and remove the rowspan marker
		# from that cell
		nextRowCellsMerged:list[bool] = []

		if rowIndex < len(rows)-3:
			for cellIndex, cell in enumerate(rows[rowIndex+3]):
				if cell.strip() == rowspanMarker:
					nextRowCellsMerged.append(True)
					rows[rowIndex+3][cellIndex] = cell.replace(rowspanMarker, ' '*len(rowspanMarker))
				else:	
					nextRowCellsMerged.append(False)
			# nextRowCellsMerged = [ cell.strip() == rowspanMarker for cell in rows[rowIndex+3] ]
		else:
			nextRowCellsMerged = [ False for _ in rows[rowIndex+2] ]

		result.append('|' + '|'.join(
			f' {row[i]:<{colWidths[i]}} ' for i in range(len(row)) if row[i] is not None
			f'{row[i]:<{colWidths[i]}}' 
				if row[i] != rowspanMarker else '' 
				for i in range(len(row)) 
				if row[i] is not None
		) + '|')
		result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')

		# Add separator line, if not merged
		result.append('+' + '+'.join('-' * (w + 2) if not nextRowCellsMerged[cellIndex] else ' ' * (w + 2)
							   for cellIndex, w in enumerate(colWidths)) + '+')
	return result


@@ -90,6 +111,49 @@ def formatGridTable(lines: list[str]) -> list[str]:
	Returns:
		Formatted grid table as list of strings
	"""

	def _getCellsFromRow(row:str) -> list[str]:
		"""Helper function to extract cells from a row.

			This is done by splitting the row string by the '|' character
			and returning the cells as a list. The first and last elements
			are ignored as they are empty strings.

			Args:
				row: The row string to split.

			Returns:
				A list of cells extracted from the row.
		"""
		return row.strip().split('|')[1:-1]
	

	def _guessColumnWidth(columnID:int) -> int:
		"""Guess the width of a column from the content of its cells.

			Every content row (a row starting with '|') of the enclosing
			function's `lines` is inspected. The cell at position `columnID`
			is split into its individual text lines at each backslash-newline
			sequence, and the longest right-stripped line determines the width.
			Lines that only contain the colspan marker are ignored. The value
			may not be accurate if the column contains merged cells, but it
			is a good approximation.

			Args:
				columnID: The index of the column to check.

			Returns:
				The guessed width of the column, or 0 if no row has content
				in that column.
		"""
		width = 0
		for row in lines:
			# Only content rows carry cell text; separator rows start with '+'
			if not row.startswith('|'):
				continue
			rowCells = _getCellsFromRow(row)
			if columnID >= len(rowCells):
				continue
			for cellLine in rowCells[columnID].rstrip().split('\\\n'):
				# Compare the stripped text, as done everywhere else in this module.
				# Otherwise a marker cell with leading whitespace would wrongly
				# be counted as regular content.
				if cellLine.strip() != colspanMarker:
					width = max(width, len(cellLine.rstrip()))
		return width

	
	if not lines or len(lines) < 3:
		return lines

@@ -101,41 +165,67 @@ def formatGridTable(lines: list[str]) -> list[str]:
	for row in lines:
		if row.startswith('|'):
			# Split cells and get their lengths
			rowCells = row.strip().split('|')[1:-1]
			rowCells = _getCellsFromRow(row)
			for i, cell in enumerate(rowCells):
				if i >= len(colWidths):
					continue
				# Calculate maximum width of each line in the cell. Lines could be multilines, so we need to split them.
				cellLines = cell.strip().split('\\\n')
				cellWidth = max(len(line.strip()) if line != colspanMarker else 0
				cellLines = cell.rstrip().split('\\\n')
				requiredCellWidth = max(len(line.rstrip()) if line != colspanMarker else 0
								for line in cellLines)
				if cellWidth > colWidths[i]:
					colWidths[i] = cellWidth

				if requiredCellWidth > colWidths[i]:
					# Check if the next cell or cells are colspan markers
					# If so, then sum the widths of the current and next cells and increase the width
					# only if the required size is still bigger than the current one
					# Check for colspan markers
					overAllCellWidth = colWidths[i]
					nextIdx = i + 1
					while nextIdx < len(rowCells) and rowCells[nextIdx].strip() == colspanMarker:
						cw = colWidths[nextIdx]
						if cw == 0:
							cw = _guessColumnWidth(nextIdx)
						overAllCellWidth += cw
						nextIdx += 1
					if requiredCellWidth > overAllCellWidth:
						# Increase the width of the current cell
						colWidths[i] += requiredCellWidth-overAllCellWidth


	# Process each line
	for line in lines:
		if line.startswith('+-'):
		# Normal separator line can either start with '+ ' or '+-'
		if line.startswith('+-') or line.startswith('+ '):	
			# Get the kind of row separator for each column
			_originalSeparator = [ l[0] for l in line.split('+')[1:-1] ]
			# Separator line - rebuild with correct column widths
			result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')
			result.append('+' + '+'.join(_originalSeparator[colIndex] * (w) 
							   			 for colIndex, w in enumerate(colWidths)
							   			 if colWidths[colIndex] > 0  ) + '+')
			continue
		elif line.startswith('+='):
			# Separator line - rebuild with correct column widths
			result.append('+' + '+'.join('=' * (w + 2) for w in colWidths) + '+')
			result.append('+' + '+'.join('=' * (w) 
										 for colIndex, w in enumerate(colWidths)
										 if colWidths[colIndex] > 0 ) + '+')
			continue
		elif line.startswith('+:='):
			# Separator line - rebuild with correct column widths
			# ATTN: This is a special case. It assumes that all columns are left-aligned.
			result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')
			result.append('+:' + '+:'.join('=' * (w-1) 
								  		   for colIndex, w in enumerate(colWidths)
										   if colWidths[colIndex] > 0 ) + '+')
			continue


		elif line.startswith('|'):
			# Content line
			cells = line.strip().split('|')[1:-1]
			cells = line.rstrip().split('|')[1:-1]
			formattedCells = []
			i = 0
			while i < len(cells):
				cell = cells[i].strip()
				if cell == colspanMarker:
				cell = cells[i].rstrip()
				if cell.strip() == colspanMarker:
					# Skip merged cells - they were handled with previous cell
					i += 1
					continue
@@ -144,12 +234,12 @@ def formatGridTable(lines: list[str]) -> list[str]:
				width = colWidths[i]
				nextIdx = i + 1
				while nextIdx < len(cells) and cells[nextIdx].strip() == colspanMarker:
					width += colWidths[nextIdx] + 3  # +3 for the cell borders
					width += colWidths[nextIdx] + 1
					nextIdx += 1

				# Format the cell content
				formattedCells.append(f'{cell:<{width}}')
				i += 1
				i = nextIdx

			result.append('|' + '|'.join(formattedCells) + '|')

@@ -204,10 +294,10 @@ def handleMultiLineGridTable(lines: list[str]) -> list[str]:
						else:
							# Use the part if available, otherwise empty string
							text = cellParts[line_idx] if line_idx < len(cellParts) else ''
						newCells.append(text.strip())
					new_line = '|' + '|'.join(f' {cell} ' for cell in newCells) + '|'
						newCells.append(text.rstrip())
					newLine = '|' + '|'.join(f'{cell}' for cell in newCells) + '|'
					# Store with original line index as key
					rowLines[i] = rowLines.get(i, []) + [new_line]
					rowLines[i] = rowLines.get(i, []) + [newLine]
			else:
				# No line breaks, keep original line
				rowLines[i] = [line]
+50 −26
Original line number Diff line number Diff line
@@ -28,7 +28,8 @@ from rich import inspect
import configparser, zipfile
from lxml import etree as ET

from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, formatGridTable, colspanMarker
from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, \
	formatGridTable, colspanMarker, rowspanMarker

class Style(IntEnum):
	code = auto()
@@ -84,6 +85,7 @@ _print:Callable = print

# Some predefined tags and attributes
wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
w14ns = 'http://schemas.microsoft.com/office/word/2010/wordml'
_val = f'{{{wns}}}val'

class SectionNumbers(object):
@@ -352,12 +354,12 @@ def processDocuments(documents:list[str],
							 'footnoteRef',
							 'annotationRef',
			)
			
			newParagraphs = 0

			def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
				"""	Recursively parse a document paragraph.
				"""
				nonlocal _ignoredTags
				nonlocal _ignoredTags, newParagraphs

				_result = ''
				tag = strippedTag(element.tag)	# remove namespaces for easier handlings
@@ -497,14 +499,16 @@ def processDocuments(documents:list[str],
			# _print(ET.fromstring(elem._p.xml))
			match elem:
				case Paragraph():	# type: ignore[misc]
					return _parseXML(ET.fromstring(elem._p.xml))
					return _parseXML(ET.fromstring(elem._p.xml)).rstrip()
				case _Cell():		# type: ignore[misc]
					# Iterate over all paragraphs in the cell and parse them
					# Create a list of parsed paragraphs and join them with linebreaks
					return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
										   for p in elem.paragraphs ])
				case ET._Element():
					return _parseXML(elem)
					# return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
					# 					   for p in elem.paragraphs ])
					return '<br />'.join([ _parseXML(elem).rstrip()])
				case _:
					return ''

@@ -769,23 +773,45 @@ def processDocuments(documents:list[str],
						nrRows = 0
						colSpanDetected = False
						for row in elem.rows:
							_row = ET.fromstring(row._tr.xml)
							cells:list[str] = []
							colspanCounter = 0
							for cell in row.cells:
							for cell in _row.findall('.//w:tc', namespaces = { 'w' : wns }):

								colspanCounter = 1  # Default value if no gridspan is specified
								gridspanElem = cell.find('.//w:tcPr/w:gridSpan', namespaces={'w': wns})
								if gridspanElem is not None and _val in gridspanElem.attrib:
									colspanCounter = int(gridspanElem.attrib[_val])
									colSpanDetected = True  # Set flag that colspan was found

								# Vertical merge
								gridspanElem = cell.find('.//w:tcPr/w:vMerge', namespaces={'w': wns})
								if gridspanElem is not None and _val not in gridspanElem.attrib:
									cells.append(rowspanMarker)
								
								else:

									# Extract text from cell
									# Find all paragraphs in the cell
									_pl:list[str] = []
									for p in cell.findall('.//w:p', namespaces={'w': wns}):
										_pl.append(getTextFromXML(p))
									# Add the text to the cell
									if len(_pl) > 0:
										cells.append(_linebreak.join(_pl))
									else:
										cells.append('')

								# Handle colspan formatting
								if not forceMarkdownTables:
									if colspanCounter > 0:
										cells.append(colspanMarker)	# add at least a space
									if colspanCounter >= 1:
										for _ in range(colspanCounter-1):
											cells.append(colspanMarker)
										colspanCounter -= 1
										continue
									if cell._tc.grid_span > 1:
										colSpanDetected = True
										colspanCounter = cell._tc.grid_span - 1
								elif cell._tc.grid_span > 1:
									colSpanDetected = True
								cells.append(f'{getTextFromXML(cell)} ')	# add at least a space
							rows.append(cells)
							nrRows += 1

						# for r in rows:
						# 	_print(r)
						
						# Warning if this is a single-row table
						if nrRows == 1:
@@ -850,8 +876,6 @@ def processDocuments(documents:list[str],
							line = line.replace(ch, f'<mark>Non-ASCII character {ch} / {hex(ord(ch))}</mark>')
							lines[i] = line

		

			#
			#	Remove multiple bold / italics on/off occurrences
			#	Sometimes Word doesn't remove empty bold-on/bold-off (or italics) indicators
@@ -861,6 +885,9 @@ def processDocuments(documents:list[str],
				line = lines[i]
				line = line.replace('__', '')
				line = line.replace('****', '')
				line = line.replace('**  ', '** ')
				line = line.replace('_  ', '_ ')
				line = line.replace('** **', ' ')
				#line = line.replace('  ', ' ')
				lines[i] = line

@@ -966,9 +993,6 @@ def processDocuments(documents:list[str],
				for fid, text in footnotes.items():
					lines.append(f'[^{fid}]: {text}')

			#
			#	List unresolved CAPTION markers
			#
			#
			#	List unresolved CAPTION markers
			#
@@ -980,7 +1004,6 @@ def processDocuments(documents:list[str],
			#
			#	Correct formatting of Grid tables after all other changes have been applied
			#

			if not forceMarkdownTables:
				gridTable:list[str] = []
				result:list[str] = []
@@ -1006,6 +1029,7 @@ def processDocuments(documents:list[str],
				lines = result

				
			
			#
			#	Write produced Markdown file
			#