Support for grid table generation. This is now the default when encountering tables with colspans (59f07be5) · Commits · Centre for Testing and Interoperability / Markdown specifications development / spec2md

README.md

+33 −0

Original line number	Diff line number	Diff line
		@@ -20,6 +20,32 @@ python3 -m pip install -r requirements.txt
		python3 spec2md.py <path-to-word-document>
		```

		### Command Line Options

		```
		usage: spec2md.py [-h] [--outdir <output directory>] [--skip-image-conversion] [--force-markdown-tables]
		document [document ...]

		positional arguments:
		document documents to parse

		options:
		-h, --help show this help message and exit
		--outdir <output directory>, -o <output directory>
		specify output directory (default: out)
		--skip-image-conversion, -sic
		skip image conversion step (default: False)
		--force-markdown-tables, -mdt
		Force markdown instead of grid format for tables with colspans (default: False)

		```

		- `--outdir` or `-o` specifies the output directory. The default is `out`.
		- `--skip-image-conversion` or `-sic` skips the image conversion step. The default is to convert images, but this may not be necessary if the images have already been converted.
		- `--force-markdown-tables` or `-mdt` forces the converter to generate markdown tables instead of grid tables. The default is to generate grid tables for tables with colspans. This option is useful to generate a first version of the table that can be manually adjusted later.



		## FAQ

		### The converter doesn't seem to generate image files.
		@@ -60,9 +86,16 @@ Lists in table cells are also not possible. One may use html lists for this, but
		```


		### How to convert a table with colspans?

		The converter tries to convert tables with colspans to grid tables. If the `--force-markdown-tables` option is used, the table is converted to a normal markdown table instead. In that case the content of a cell that spans several columns is simply repeated to fill the table row.

		This may not be the desired result, but normal markdown tables don't support colspans. The solution is to use grid tables instead.


		## Changes

		- 2025-01-15 - Improved handling of tables with colspans (converting them to simple grid tables). Improved error messages (added line numbers). Improved error detection for tables.
		- 2024-01-09 - Added support for merging consecutive code paragraphs into a single code block.
		- 2023-08-18 - Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.
		- 2023-07-27 - Added converting bold and italic text in paragraphs, headers and tables.
		No newline at end of file

gridTable.py

0 → 100644

+233 −0

Original line number	Diff line number	Diff line
		#
		# gridTable.py
		#
		# Grid Table support functions for markdown conversion.
		#
		# (c) 2025 by Andreas Kraft
		# License: BSD 3-Clause License. See the LICENSE file for further details.
		#
		import re

		# Placeholder cell text: a table cell that contains only this marker is
		# treated as part of the cell to its left (i.e. a column span).
		colspanMarker = '~~COLSPAN~~'

		def markdownToGrid(markdownLines:list[str]) -> list[str]:
			""" Convert a markdown table to a grid table.

			The first line is used as the header row, the second line (the markdown
			separator row) is dropped, and every further line becomes a data row
			followed by a border line.

			A cell that contains only the colspan marker stays in the row as a
			marker cell. The cell to its left is padded with blanks to its column
			width plus the width of the marker's column and 3 border characters.

			Args:
				markdownLines: The markdown lines to convert.

			Return:
				The converted grid table. Inputs with fewer than three lines are
				returned unchanged.
			"""

			# Check if there are enough lines to create a table
			if not markdownLines or len(markdownLines) < 3:
				return markdownLines

			# Replace all <br> variants with <br /> in all lines
			markdownLines = [ re.sub(r'<br\s*/?>', '<br />', line) for line in markdownLines ]

			# Split each line into cells and clean whitespace
			rows = [
				[ cell.strip() for cell in line.strip('|').split('|') ]
				for line in markdownLines
			]

			# Get maximum width for each column (short rows count as width 0)
			maxCols = max(len(row) for row in rows)
			colWidths = [
				max(len(row[col]) if col < len(row) else 0 for row in rows)
				for col in range(maxCols)
			]

			# Process merged cells - widen the cell left of each colspan marker.
			# Work backwards so that a run of markers is resolved right-to-left.
			for row in rows:
				for i in range(len(row) - 1, 0, -1):
					if row[i].strip() == colspanMarker:
						row[i-1] = row[i-1] + ' ' * (colWidths[i-1] - len(row[i-1])) + ' ' * (colWidths[i] + 3)

			# Pad any rows that are too short
			for row in rows:
				row.extend([''] * (maxCols - len(row)))

			border = '+' + '+'.join('-' * (w + 2) for w in colWidths) + '+'

			def contentLine(row:list[str]) -> str:
				# One left-aligned cell per column, one blank on each side
				return '|' + '|'.join(f' {cell:<{w}} ' for cell, w in zip(row, colWidths)) + '|'

			# Top border, header row and header separator.
			# The leading ':' in the separator marks every column as left-aligned.
			result = [
				border,
				contentLine(rows[0]),
				'+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+',
			]

			# Data rows. rows[1] is the markdown separator row and is skipped.
			for row in rows[2:]:
				result.append(contentLine(row))
				result.append(border)

			return result


		def formatGridTable(lines: list[str]) -> list[str]:
			""" Format a grid table by adjusting column widths and alignments.

			Supports merged cells marked with the colspan marker: a cell that is
			followed by marker cells is widened over those columns and the marker
			cells themselves are removed from the output row.

			Args:
				lines: List of strings containing a grid table.

			Return:
				Formatted grid table as list of strings. Inputs with fewer than
				three lines are returned unchanged.
			"""
			if not lines or len(lines) < 3:
				return lines

			# Get the initial column widths from the first border line. A border
			# segment is two characters wider than the cell content (one blank of
			# padding on each side), so subtract them to get the content width.
			colWidths = [ max(len(col.strip()) - 2, 0) for col in lines[0].split('+')[1:-1] ]

			# Adjust column widths if any cell is longer
			for row in lines:
				if not row.startswith('|'):
					continue
				rowCells = row.strip().split('|')[1:-1]
				# A row may have more cells than the border line announces; add
				# columns for them so that formatting below cannot run out of widths.
				if len(rowCells) > len(colWidths):
					colWidths.extend([0] * (len(rowCells) - len(colWidths)))
				for i, cell in enumerate(rowCells):
					colWidths[i] = max(colWidths[i], len(cell.strip()))

			result:list[str] = []

			# Process each line
			for line in lines:
				if line.startswith('+-'):
					# Row border - rebuild with correct column widths
					result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')

				elif line.startswith('+='):
					# Header separator - rebuild with correct column widths
					result.append('+' + '+'.join('=' * (w + 2) for w in colWidths) + '+')

				elif line.startswith('+:='):
					# Header separator with alignment - rebuild with correct column widths
					# ATTN: This is a special case. It assumes that all columns are left-aligned.
					result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')

				elif line.startswith('|'):
					# Content line
					cells = [ cell.strip() for cell in line.strip().split('|')[1:-1] ]
					formattedCells = []
					for i, cell in enumerate(cells):
						if cell == colspanMarker:
							# Skip merged cells - they are covered by the cell to their left
							continue

						# Calculate width for potentially merged cells
						width = colWidths[i]
						nextIdx = i + 1
						while nextIdx < len(cells) and cells[nextIdx] == colspanMarker:
							width += colWidths[nextIdx] + 3	# +3 for the cell borders
							nextIdx += 1

						# Format the cell content
						formattedCells.append(f' {cell:<{width}} ')

					result.append('|' + '|'.join(formattedCells) + '|')

				else:
					# Unknown line type - keep it unchanged instead of dropping it
					result.append(line)

			return result


		def handleMultiLineGridTable(lines: list[str]) -> list[str]:
			""" Handle multiline cells in a grid table by splitting cells with <br /> markers.

			A content line with at least one ``<br />`` in a cell is replaced by
			several content lines, one per cell fragment. Every fragment except
			the last one of a cell gets a trailing backslash. Cells with fewer
			fragments are filled with empty text, and colspan marker cells are
			repeated on every generated line. All other lines are kept as they are.

			Args:
				lines: List of strings containing a grid table.

			Return:
				List of strings with multiline cells split over several content lines.
			"""
			result:list[str] = []

			for line in lines:

				# Border lines are kept as is
				if not line.startswith('|'):
					result.append(line)
					continue

				# Split the content line into cells and each cell into its fragments
				maxLines = 1
				splitCells:list[list[str]] = []
				for cell in line.strip().split('|')[1:-1]:
					if cell.strip() == colspanMarker:
						# Colspan cells are never split
						splitCells.append([ colspanMarker ])
						continue

					parts = cell.split('<br />')
					# Add "\" to each part except the last
					lastIdx = len(parts) - 1
					parts = [ part + '\\' if idx < lastIdx else part
							  for idx, part in enumerate(parts) ]
					splitCells.append(parts)
					maxLines = max(maxLines, len(parts))

				# No line breaks, keep original line
				if maxLines == 1:
					result.append(line)
					continue

				# Line breaks found, create one content line per fragment index
				for lineIdx in range(maxLines):
					newCells = []
					for parts in splitCells:
						if parts == [ colspanMarker ]:
							# For colspan cells, always use the marker
							text = colspanMarker
						else:
							# Use the part if available, otherwise empty string
							text = parts[lineIdx] if lineIdx < len(parts) else ''
						newCells.append(text.strip())
					result.append('|' + '|'.join(f' {cell} ' for cell in newCells) + '|')

			return result



		def isGridTableStart(line: str) -> bool:
			""" Check if a line marks the start of a grid table.

			Args:
				line: The line to check.

			Return:
				True if the line starts and ends with '+', contains at least one
				'-' and no '=' (which would make it a header separator), False
				otherwise.
			"""
			return line.startswith('+') and line.endswith('+') and '-' in line and '=' not in line

spec2md.py

+115 −12

Original line number	Diff line number	Diff line
		@@ -28,6 +28,8 @@ from rich import inspect
		import configparser, zipfile
		from lxml import etree as ET

		from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, formatGridTable, colspanMarker

		class Style(IntEnum):
		code = auto()
		example = auto()
		@@ -208,9 +210,34 @@ class DocumentConfiguration(object):
		self.emfConverterSvg = config.get('media', 'emfConverterSvg', fallback = None)


		def richString(text:str) -> str:
			""" Return a rich string for the console output.

			Every opening square bracket is prefixed with a backslash.

			Args:
				text: The text to convert to a rich string.

			Return:
				The converted text.
			"""
			return text.replace('[', '\\[')


		def linenumber(idx:int, digits:int = 5) -> str:
			""" Return the formatted line number.

			Args:
				idx: The zero-based index to get the line number for.
				digits: The minimum number of digits. Shorter numbers are padded
					with leading zeros, longer numbers are not truncated.

			Return:
				The one-based line number (idx + 1) with leading zeros.
			"""
			return f'{idx + 1:0{digits}}'


		def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None:
		def processDocuments(documents:list[str],
		outDirectory:str,
		skipImageConversion:bool,
		forceMarkdownTables:bool) -> None:
		docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]] = {}
		ptasks = {}
		mediaRelations:Dict[str, str] = {}
		@@ -590,6 +617,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		# Processing the document
		lines:list[str] = []
		imageIndex = 1
		lastTableCaption:str = '<unknown caption>'

		for elem in docItems:
		paragraphNr += 1
		@@ -672,6 +700,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		caption = replaceNL(text).strip()
		anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
		lines.append(f'{caption}{anchor}')
		lastTableCaption = caption

		# Image Caption
		elif style in docConfig.imagecaption:
		@@ -724,38 +753,76 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:

		# Print Unhandled tokens also to the console
		else:
		_print(f'{paragraphNr} {style}: {elem.style}: {text}')
		_print(f'[yellow]({linenumber(len(lines))}) Undefined paragraph style "{elem.style.name}":[/yellow] [grey39]{text}')
		lines.append(text)


		case 'Table':
		rows:list[list[str]] = []
		nrRows = 0
		colSpanDetected = False
		for row in elem.rows:
		cells:list[str] = []
		colspanCounter = 0
		for cell in row.cells:
		if not forceMarkdownTables:
		if colspanCounter > 0:
		cells.append(colspanMarker) # add at least a space
		colspanCounter -= 1
		continue
		if cell._tc.grid_span > 1:
		colSpanDetected = True
		colspanCounter = cell._tc.grid_span - 1
		elif cell._tc.grid_span > 1:
		colSpanDetected = True
		cells.append(f'{getTextFromXML(cell)} ') # add at least a space
		rows.append(cells)
		nrRows += 1


		# Warning if this is a single-row table
		if nrRows == 1:
		_print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)
		_print(f'[red]({linenumber(len(lines)+2)}) Single-row table found. Such tables cannot be converted to markdown.[/red]Consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)

		lines.append('') # Add an empty line before a table
		# Warning if a table with colspans is detected
		if colSpanDetected:
		if forceMarkdownTables:
		_print(f'[yellow]({linenumber(len(lines)+2)}) Table with colspans found: [/yellow][grey39]{richString(lastTableCaption)}[/grey39]\nConsider to convert it manually to a grid table', highlight = False)

		tableLines:list[str] = []

		errorDetected:bool = False
		for idx, row in enumerate(rows):

		# Check for a table caption and add separator line
		if idx == 1:
		lines.append('-'.join('\|' * (len(row) + 1) ))
		tableLines.append('-'.join('\|' * (len(row) + 1) ))

		# # Check if the number of columns is the same as the previous row and add cells if smaller

		if idx > 0 and len(row) != len(rows[idx-1]):
		_print(f'[red]({linenumber(len(lines))}) Number of columns in table row {idx} does not match the previous row.[/red]\nTable may need extra attention', highlight = False)
		errorDetected = True

		# Add table row
		lines.append(f'\|{"\|".join(row)}\|'
		tableLines.append(f'\|{"\|".join(row)}\|'
		.replace('\n', _linebreak)) # replace line breaks in cells

		# if colSpanDetected and gridTableForColspan then convert to grid table
		if colSpanDetected and not forceMarkdownTables and not errorDetected:
		lines.append('') # Add an empty line before a table
		lines.append('<mark>Table with colspans converted to grid table. Please check and adjust manually if necessary.</mark>')
		tableLines = markdownToGrid(tableLines)

		lines.append('') # Add an empty line before a table
		if errorDetected:
		lines.append('<mark>The table below caused an error during conversion and may need extra attention</mark>')
		lines.append('') # Add an empty line before a table
		lines.extend(tableLines)
		lines.append('') # Add another empty line after a table

		case _:
		_print('[blue] {type(elem).__name__}')
		_print(f'[blue]({linenumber(len(lines))}) {type(elem).__name__}')

		#
		# Replace non-ascii characters
		@@ -772,7 +839,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		line = line.replace(ch, rch) # we need the line for further replacements
		lines[i] = line
		else:
		_print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
		_print(f'[yellow]({linenumber(i)}) Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')


		#
		@@ -892,10 +959,42 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
		#
		# List unresolved CAPTION markers
		#
		for i in range(len(lines)):
		line = lines[i]
		#
		# List unresolved CAPTION markers
		#
		for i, line in enumerate(lines):
		if _captionMarker in line:
		_print(f'[yellow]Unresolved / unreferenced figure caption : \[{i}] "{line}"')
		_print(f'[yellow]({linenumber(i)}) Unresolved / unreferenced figure caption: "{line}"[/yellow]')


		#
		# Correct formatting of Grid tables after all other changes have been applied
		#

		if not forceMarkdownTables:
		gridTable:list[str] = []
		result:list[str] = []
		for i, line in enumerate(lines):

		# Check for grid table start
		if isGridTableStart(line) and not gridTable:
		gridTable = [ line ]
		continue
		# Are we in a grid table?
		if gridTable:
		# Is the current line still part of the grid table?
		if line.startswith(('\|', '+')):
		gridTable.append(line)
		continue
		# grid table finished. Assign and clear
		gridTable = handleMultiLineGridTable(gridTable)
		result.extend(formatGridTable(gridTable))
		gridTable = []
		continue
		# not in grid table
		result.append(line)
		lines = result


		#
		# Write produced Markdown file
		@@ -954,6 +1053,7 @@ if __name__ == '__main__':
		parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
		parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>', help = 'specify output directory')
		parser.add_argument('--skip-image-conversion', '-sic', action='store_true', dest='skipImageConversion', help = 'skip image conversion step')
		parser.add_argument('--force-markdown-tables', '-mdt', action='store_true', dest='forceMarkdownTables', help = 'Force markdown instead of grid format for tables with colspans')

		parser.add_argument('document', nargs = '+', help = 'documents to parse')
		args = parser.parse_args()
		@@ -961,5 +1061,8 @@ if __name__ == '__main__':
		# Process documents and print output
		os.makedirs(args.outDirectory, exist_ok = True)

		processDocuments(sorted(args.document), args.outDirectory, args.skipImageConversion)
		processDocuments(sorted(args.document),
		args.outDirectory,
		args.skipImageConversion,
		args.forceMarkdownTables)

Admin message