Improved support for handling grid tables (reduced width when possible,... (4bfb77b7) · Commits · Centre for Testing and Interoperability / Markdown specifications development / spec2md

gridTable.py

+118 −28

Original line number	Diff line number	Diff line
		@@ -9,6 +9,7 @@
		import re

		colspanMarker = '~~COLSPAN~~'
		rowspanMarker = '~~ROWSPAN~~'

		def markdownToGrid(markdownLines:list[str]) -> list[str]:
		""" Convert a markdown table to a grid table.
		@@ -34,7 +35,6 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
		for line in markdownLines
		]


		# Get maximum width for each column
		colWidths = []
		maxCols = max(len(row) for row in rows)
		@@ -45,12 +45,11 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:

		# Process merged cells - combine content with previous cell
		for row in rows:
		for i in range(len(row)-1, 0, -1): # Work backwards to avoid index issues
		for i in range(len(row)-1, -1, -1): # Work backwards to avoid index issues
		if row[i].strip() == colspanMarker:
		row[i-1] = row[i-1] + ' '(colWidths[i-1] - len(row[i-1]))+ ' '(colWidths[i]+3) # Merge with empty content
		# row[i] = None # type:ignore[call-overload] # Indicate removal


		# Pad any rows that are too short
		for row in rows:
		while len(row) < maxCols:
		@@ -71,12 +70,34 @@ def markdownToGrid(markdownLines:list[str]) -> list[str]:
		result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')

		# Data rows
		for row in rows[2:]:
		for rowIndex, row in enumerate(rows[2:]):

		# The following code detects if cells in the next row have rowspan marker(s)
		# If so, it will merge the cells with the current one and remove the rowspan marker
		# from that cell
		nextRowCellsMerged:list[bool] = []

		if rowIndex < len(rows)-3:
		for cellIndex, cell in enumerate(rows[rowIndex+3]):
		if cell.strip() == rowspanMarker:
		nextRowCellsMerged.append(True)
		rows[rowIndex+3][cellIndex] = cell.replace(rowspanMarker, ' '*len(rowspanMarker))
		else:
		nextRowCellsMerged.append(False)
		# nextRowCellsMerged = [ cell.strip() == rowspanMarker for cell in rows[rowIndex+3] ]
		else:
		nextRowCellsMerged = [ False for _ in rows[rowIndex+2] ]

		result.append('\|' + '\|'.join(
		f' {row[i]:<{colWidths[i]}} ' for i in range(len(row)) if row[i] is not None
		f'{row[i]:<{colWidths[i]}}'
		if row[i] != rowspanMarker else ''
		for i in range(len(row))
		if row[i] is not None
		) + '\|')
		result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')

		# Add separator line, if not merged
		result.append('+' + '+'.join('-' * (w + 2) if not nextRowCellsMerged[cellIndex] else ' ' * (w + 2)
		for cellIndex, w in enumerate(colWidths)) + '+')
		return result


		@@ -90,6 +111,49 @@ def formatGridTable(lines: list[str]) -> list[str]:
		Returns:
		Formatted grid table as list of strings
		"""

		def _getCellsFromRow(row:str) -> list[str]:
		"""Helper function to extract the cells from a table row.

		The row is first stripped of surrounding whitespace and then split
		at every '\|' delimiter. The first and the last element of the split
		result are dropped. For a row that starts and ends with the delimiter
		these are the empty strings outside of the table borders.

		Args:
		row: The row string to split.

		Returns:
		A list of cells extracted from the row. The cell contents are
		returned as they are, i.e. they are not stripped.
		"""
		# NOTE(review): the slice drops the outer elements unconditionally, so a row
		# without a leading or trailing delimiter would lose real cells - confirm
		# that callers only pass rows that begin and end with the delimiter.
		return row.strip().split('\|')[1:-1]


		def _guessColumnWidth(columnID:int) -> int:
		"""Helper function to guess the width of a column.

		Every content row (a row that starts with '\|') in `lines` is
		inspected. `lines` is not a parameter of this helper but is taken
		from the surrounding scope. For each of these rows the cell at
		index `columnID` is split into its individual lines, and the length
		of the longest line (without trailing whitespace) is remembered.
		Lines that are equal to the colspan marker are skipped.

		The returned value may not be accurate if the column contains
		merged cells, but it is a good approximation.

		Args:
		columnID: The zero-based index of the column to check.

		Returns:
		The guessed width of the column. This is 0 if no content row
		has a measurable cell at that index.
		"""
		width = 0
		for row in lines:
		# Only content rows are measured, separator rows are ignored
		if row.startswith('\|'):
		rowCells = _getCellsFromRow(row)
		# Rows with fewer cells than columnID+1 do not contribute
		if columnID < len(rowCells):
		# A cell may hold multiple lines, so each line is measured on its own
		cellLines = rowCells[columnID].rstrip().split('\\\n')
		for line in cellLines:
		# NOTE(review): the marker is compared against the unstripped line, so a
		# marker with leading whitespace would be measured like normal text -
		# confirm that this is intended.
		if line != colspanMarker:
		width = max(width, len(line.rstrip()))
		return width


		if not lines or len(lines) < 3:
		return lines

		@@ -101,41 +165,67 @@ def formatGridTable(lines: list[str]) -> list[str]:
		for row in lines:
		if row.startswith('\|'):
		# Split cells and get their lengths
		rowCells = row.strip().split('\|')[1:-1]
		rowCells = _getCellsFromRow(row)
		for i, cell in enumerate(rowCells):
		if i >= len(colWidths):
		continue
		# Calculate maximum width of each line in the cell. Lines could be multilines, so we need to split them.
		cellLines = cell.strip().split('\\\n')
		cellWidth = max(len(line.strip()) if line != colspanMarker else 0
		cellLines = cell.rstrip().split('\\\n')
		requiredCellWidth = max(len(line.rstrip()) if line != colspanMarker else 0
		for line in cellLines)
		if cellWidth > colWidths[i]:
		colWidths[i] = cellWidth

		if requiredCellWidth > colWidths[i]:
		# Check if the next cell or cells are colspan markers
		# If so, then sum the widths of the current and next cells and increase the width
		# only if the required size is still bigger than the current one
		# Check for colspan markers
		overAllCellWidth = colWidths[i]
		nextIdx = i + 1
		while nextIdx < len(rowCells) and rowCells[nextIdx].strip() == colspanMarker:
		cw = colWidths[nextIdx]
		if cw == 0:
		cw = _guessColumnWidth(nextIdx)
		overAllCellWidth += cw
		nextIdx += 1
		if requiredCellWidth > overAllCellWidth:
		# Increase the width of the current cell
		colWidths[i] += requiredCellWidth-overAllCellWidth


		# Process each line
		for line in lines:
		if line.startswith('+-'):
		# Normal separator line can either start with '+ ' or '+-'
		if line.startswith('+-') or line.startswith('+ '):
		# Get the kind of row separator for each column
		_originalSeparator = [ l[0] for l in line.split('+')[1:-1] ]
		# Separator line - rebuild with correct column widths
		result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')
		result.append('+' + '+'.join(_originalSeparator[colIndex] * (w)
		for colIndex, w in enumerate(colWidths)
		if colWidths[colIndex] > 0 ) + '+')
		continue
		elif line.startswith('+='):
		# Separator line - rebuild with correct column widths
		result.append('+' + '+'.join('=' * (w + 2) for w in colWidths) + '+')
		result.append('+' + '+'.join('=' * (w)
		for colIndex, w in enumerate(colWidths)
		if colWidths[colIndex] > 0 ) + '+')
		continue
		elif line.startswith('+:='):
		# Separator line - rebuild with correct column widths
		# ATTN: This is a special case. It assumes that all columns are left-aligned.
		result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')
		result.append('+:' + '+:'.join('=' * (w-1)
		for colIndex, w in enumerate(colWidths)
		if colWidths[colIndex] > 0 ) + '+')
		continue


		elif line.startswith('\|'):
		# Content line
		cells = line.strip().split('\|')[1:-1]
		cells = line.rstrip().split('\|')[1:-1]
		formattedCells = []
		i = 0
		while i < len(cells):
		cell = cells[i].strip()
		if cell == colspanMarker:
		cell = cells[i].rstrip()
		if cell.strip() == colspanMarker:
		# Skip merged cells - they were handled with previous cell
		i += 1
		continue
		@@ -144,12 +234,12 @@ def formatGridTable(lines: list[str]) -> list[str]:
		width = colWidths[i]
		nextIdx = i + 1
		while nextIdx < len(cells) and cells[nextIdx].strip() == colspanMarker:
		width += colWidths[nextIdx] + 3 # +3 for the cell borders
		width += colWidths[nextIdx] + 1
		nextIdx += 1

		# Format the cell content
		formattedCells.append(f'{cell:<{width}}')
		i += 1
		i = nextIdx

		result.append('\|' + '\|'.join(formattedCells) + '\|')

		@@ -204,10 +294,10 @@ def handleMultiLineGridTable(lines: list[str]) -> list[str]:
		else:
		# Use the part if available, otherwise empty string
		text = cellParts[line_idx] if line_idx < len(cellParts) else ''
		newCells.append(text.strip())
		new_line = '\|' + '\|'.join(f' {cell} ' for cell in newCells) + '\|'
		newCells.append(text.rstrip())
		newLine = '\|' + '\|'.join(f'{cell}' for cell in newCells) + '\|'
		# Store with original line index as key
		rowLines[i] = rowLines.get(i, []) + [new_line]
		rowLines[i] = rowLines.get(i, []) + [newLine]
		else:
		# No line breaks, keep original line
		rowLines[i] = [line]

spec2md.py

+50 −26

Original line number	Diff line number	Diff line
		@@ -28,7 +28,8 @@ from rich import inspect
		import configparser, zipfile
		from lxml import etree as ET

		from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, formatGridTable, colspanMarker
		from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, \
		formatGridTable, colspanMarker, rowspanMarker

		class Style(IntEnum):
		code = auto()
		@@ -84,6 +85,7 @@ _print:Callable = print

		# Some predefined tags and attributes
		wns = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
		w14ns = 'http://schemas.microsoft.com/office/word/2010/wordml'
		_val = f'{{{wns}}}val'

		class SectionNumbers(object):
		@@ -352,12 +354,12 @@ def processDocuments(documents:list[str],
		'footnoteRef',
		'annotationRef',
		)

		newParagraphs = 0

		def _parseXML(element:ET.Element, inCell:Optional[bool] = False) -> str:
		""" Recursively parse a document paragraph.
		"""
		nonlocal _ignoredTags
		nonlocal _ignoredTags, newParagraphs

		_result = ''
		tag = strippedTag(element.tag) # remove namespaces for easier handlings
		@@ -497,14 +499,16 @@ def processDocuments(documents:list[str],
		# _print(ET.fromstring(elem._p.xml))
		match elem:
		case Paragraph(): # type: ignore[misc]
		return _parseXML(ET.fromstring(elem._p.xml))
		return _parseXML(ET.fromstring(elem._p.xml)).rstrip()
		case _Cell(): # type: ignore[misc]
		# Iterate over all paragraphs in the cell and parse them
		# Create a list of parsed paragraphs and join them with linebreaks
		return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
		for p in elem.paragraphs ])
		case ET._Element():
		return _parseXML(elem)
		# return '<br />'.join([ _parseXML(ET.fromstring(p._p.xml), True).rstrip()
		# for p in elem.paragraphs ])
		return '<br />'.join([ _parseXML(elem).rstrip()])
		case _:
		return ''

		@@ -769,23 +773,45 @@ def processDocuments(documents:list[str],
		nrRows = 0
		colSpanDetected = False
		for row in elem.rows:
		_row = ET.fromstring(row._tr.xml)
		cells:list[str] = []
		colspanCounter = 0
		for cell in row.cells:
		for cell in _row.findall('.//w:tc', namespaces = { 'w' : wns }):

		colspanCounter = 1 # Default value if no gridspan is specified
		gridspanElem = cell.find('.//w:tcPr/w:gridSpan', namespaces={'w': wns})
		if gridspanElem is not None and _val in gridspanElem.attrib:
		colspanCounter = int(gridspanElem.attrib[_val])
		colSpanDetected = True # Set flag that colspan was found

		# Vertical merge
		gridspanElem = cell.find('.//w:tcPr/w:vMerge', namespaces={'w': wns})
		if gridspanElem is not None and _val not in gridspanElem.attrib:
		cells.append(rowspanMarker)

		else:

		# Extract text from cell
		# Find all paragraphs in the cell
		_pl:list[str] = []
		for p in cell.findall('.//w:p', namespaces={'w': wns}):
		_pl.append(getTextFromXML(p))
		# Add the text to the cell
		if len(_pl) > 0:
		cells.append(_linebreak.join(_pl))
		else:
		cells.append('')

		# Handle colspan formatting
		if not forceMarkdownTables:
		if colspanCounter > 0:
		cells.append(colspanMarker) # add at least a space
		if colspanCounter >= 1:
		for _ in range(colspanCounter-1):
		cells.append(colspanMarker)
		colspanCounter -= 1
		continue
		if cell._tc.grid_span > 1:
		colSpanDetected = True
		colspanCounter = cell._tc.grid_span - 1
		elif cell._tc.grid_span > 1:
		colSpanDetected = True
		cells.append(f'{getTextFromXML(cell)} ') # add at least a space
		rows.append(cells)
		nrRows += 1

		# for r in rows:
		# _print(r)

		# Warning if this is a single-row table
		if nrRows == 1:
		@@ -850,8 +876,6 @@ def processDocuments(documents:list[str],
		line = line.replace(ch, f'<mark>Non-ASCII character {ch} / {hex(ord(ch))}</mark>')
		lines[i] = line



		#
		# Remove multiple bold / italics on/off occurrences
		# Sometimes Word doesn't remove empty bold-on/bold-off (or italics) indicators
		@@ -861,6 +885,9 @@ def processDocuments(documents:list[str],
		line = lines[i]
		line = line.replace('__', '')
		line = line.replace('****', '')
		line = line.replace(' ', ' ')
		line = line.replace('_ ', '_ ')
		line = line.replace(' ', ' ')
		#line = line.replace(' ', ' ')
		lines[i] = line

		@@ -966,9 +993,6 @@ def processDocuments(documents:list[str],
		for fid, text in footnotes.items():
		lines.append(f'[^{fid}]: {text}')

		#
		# List unresolved CAPTION markers
		#
		#
		# List unresolved CAPTION markers
		#
		@@ -980,7 +1004,6 @@ def processDocuments(documents:list[str],
		#
		# Correct formatting of Grid tables after all other changes have been applied
		#

		if not forceMarkdownTables:
		gridTable:list[str] = []
		result:list[str] = []
		@@ -1006,6 +1029,7 @@ def processDocuments(documents:list[str],
		lines = result



		#
		# Write produced Markdown file
		#

Admin message