Commit 59f07be5 authored by ankraft's avatar ankraft
Browse files

Support for grid table generation. This is now the default when encountering tables with colspans

parent c43a6753
Loading
Loading
Loading
Loading
+33 −0
Original line number Diff line number Diff line
@@ -20,6 +20,32 @@ python3 -m pip install -r requirements.txt
python3 spec2md.py <path-to-word-document>
```

### Command Line Options

```
usage: spec2md.py [-h] [--outdir <output directory>] [--skip-image-conversion] [--force-markdown-tables]
                  document [document ...]

positional arguments:
  document              documents to parse

options:
  -h, --help            show this help message and exit
  --outdir <output directory>, -o <output directory>
                        specify output directory (default: out)
  --skip-image-conversion, -sic
                        skip image conversion step (default: False)
  --force-markdown-tables, -mdt
                        Force markdown instead of grid format for tables with colspans (default: False)

```

- `--outdir` or `-o` specifies the output directory. The default is `out`.
- `--skip-image-conversion` or `-sic` skips the image conversion step. The default is to convert images, but this may not be necessary if the images have already been converted.
- `--force-markdown-tables` or `-mdt` forces the converter to generate markdown tables instead of grid tables. The default is to generate grid tables for tables with colspans. This option is useful to generate a first version of the table that can be manually adjusted later.



## FAQ

### The converter doesn't seem to generate image files.
@@ -60,9 +86,16 @@ Lists in table cells are also not possible. One may use html lists for this, but
```


### How to convert a table with colspans?

The converter will try to convert tables with colspans to grid tables. If the `--force-markdown-tables` option is used, then the table will be converted to a normal markdown table. If the table has colspans, then the cells will just be repeated to fill a table row.

This may not be the desired result, but markdown doesn't support colspans. A solution is to use grid tables instead.  


## Changes

- **2025-01-15** - Improved handling of tables with colspans (converting them to simple grid tables). Improved error messages (added line numbers). Improved error detection for tables.
- **2024-01-09** - Added support for merging consecutive code paragraphs into a single code block.
- **2023-08-18** - Improved handling of sometimes broken inline formatting in table cells. Adding more default heading formats.
- **2023-07-27** - Added converting bold and italic text in paragraphs, headers and tables.
 No newline at end of file

gridTable.py

0 → 100644
+233 −0
Original line number Diff line number Diff line
#
#	gridTable.py
#
#	Grid Table support functions for markdown conversion.
#
#	(c) 2025 by Andreas Kraft
#	License: BSD 3-Clause License. See the LICENSE file for further details.
#
import re

colspanMarker = '~~COLSPAN~~'

def markdownToGrid(markdownLines:list[str]) -> list[str]:
	"""	Convert a markdown (pipe) table to a grid table.

		The first line is used as the header row, the second line (the markdown
		header separator) is skipped, and every following line becomes a data
		row that is followed by a border line.

		Cells that only contain the colspan marker are kept in the output, and
		the cell in front of them is padded to cover the spanned width. The
		markers are resolved later by `formatGridTable`.

		Args:
			markdownLines: The markdown lines to convert.
		
		Return:
			The converted grid table. Input with fewer than three lines is
			returned unchanged.
	"""

	# A table needs at least a header, a separator and one data row
	if not markdownLines or len(markdownLines) < 3:
		return markdownLines

	def splitRow(line:str) -> list[str]:
		"""	Split a single markdown table line into its stripped cells. """
		# Normalize all <br> variants to <br />
		text = re.sub(r'<br\s*/?>', '<br />', line).strip()
		# Remove exactly ONE outer pipe on each side. Stripping all of them
		# would swallow empty first or last cells and shift the columns.
		text = text.removeprefix('|').removesuffix('|')
		return [ cell.strip() for cell in text.split('|') ]

	rows = [ splitRow(line) for line in markdownLines ]

	# Pad any rows that are too short, so that all rows have the same length
	maxCols = max(len(row) for row in rows)
	for row in rows:
		row.extend([''] * (maxCols - len(row)))

	# Get the maximum width for each column (before cells are merged)
	colWidths = [ max(len(cell) for cell in column) for column in zip(*rows) ]

	# Process merged cells - widen the previous cell by the spanned column.
	# The marker cell itself stays in place on purpose.
	for row in rows:
		for i in range(len(row)-1, 0, -1): # Work backwards to avoid index issues
			if row[i].strip() == colspanMarker:
				# +3 accounts for the cell border and its two padding spaces
				row[i-1] = row[i-1].ljust(colWidths[i-1]) + ' ' * (colWidths[i] + 3)

	def formatRow(row:list[str]) -> str:
		"""	Build a content line with left-aligned, padded cells. """
		return '|' + '|'.join(f' {cell:<{width}} ' for cell, width in zip(row, colWidths)) + '|'

	border = '+' + '+'.join('-' * (w + 2) for w in colWidths) + '+'
	# The leading colon marks every column as left-aligned
	headerSeparator = '+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+'

	# Top border, header row and header separator
	result = [ border, formatRow(rows[0]), headerSeparator ]

	# Data rows. rows[1] is the markdown separator line and is skipped.
	for row in rows[2:]:
		result.append(formatRow(row))
		result.append(border)

	return result


def formatGridTable(lines: list[str]) -> list[str]:
	"""Format a grid table by adjusting column widths.
	Supports merged cells marked with ~~COLSPAN~~.

	The initial column widths are taken from the first line, which is expected
	to be the top border of the table. Each width is then enlarged to fit the
	longest cell of that column. Border lines are rebuilt with the final
	widths, and content lines are padded. A cell that is followed by colspan
	marker cells is extended over the width of these columns, and the marker
	cells are removed from the output.

	Args:
		lines: List of strings containing a grid table
		
	Returns:
		Formatted grid table as list of strings. Input with fewer than three
		lines is returned unchanged.
	"""
	if not lines or len(lines) < 3:
		return lines

	# Get column widths from first separator line. A border segment is two
	# characters wider than the cell content (one padding space per side).
	# Without subtracting them, every formatting pass would widen each column.
	colWidths = [ max(len(col.strip()) - 2, 0) for col in lines[0].split('+')[1:-1] ]

	# Adjust column widths if any cell is longer
	for row in lines:
		if row.startswith('|'):
			for i, cell in enumerate(row.strip().split('|')[1:-1]):
				# Cells beyond the known columns do not influence the widths
				if i < len(colWidths):
					colWidths[i] = max(colWidths[i], len(cell.strip()))

	def columnWidth(idx:int) -> int:
		"""Return the width of a column, or 0 for cells beyond the known columns."""
		# A row may contain more cells than the top border has columns, for
		# example when a cell text contains a '|'. Such cells are not padded.
		return colWidths[idx] if idx < len(colWidths) else 0

	result = []

	# Process each line
	for line in lines:
		if line.startswith('+-'):
			# Separator line - rebuild with correct column widths
			result.append('+' + '+'.join('-' * (w + 2) for w in colWidths) + '+')

		elif line.startswith('+='):
			# Header separator line - rebuild with correct column widths
			result.append('+' + '+'.join('=' * (w + 2) for w in colWidths) + '+')

		elif line.startswith('+:='):
			# Header separator line with alignment - rebuild with correct column widths
			# ATTN: This is a special case. It assumes that all columns are left-aligned.
			result.append('+:' + '+:'.join('=' * (w + 1) for w in colWidths) + '+')

		elif line.startswith('|'):
			# Content line
			cells = [ cell.strip() for cell in line.strip().split('|')[1:-1] ]
			formattedCells = []
			for i, cell in enumerate(cells):
				if cell == colspanMarker:
					# Skip merged cells - they were handled with previous cell
					continue

				# Calculate width for potentially merged cells
				width = columnWidth(i)
				nextIdx = i + 1
				while nextIdx < len(cells) and cells[nextIdx] == colspanMarker:
					width += columnWidth(nextIdx) + 3  # +3 for the cell borders
					nextIdx += 1

				# Format the cell content
				formattedCells.append(f' {cell:<{width}} ')

			result.append('|' + '|'.join(formattedCells) + '|')

		else:
			# Unknown line type. Keep it unchanged instead of silently dropping it.
			result.append(line)

	return result


def handleMultiLineGridTable(lines: list[str]) -> list[str]:
	"""Handle multiline cells in a grid table by splitting cells with <br /> markers.

	A content line (a line starting with '|') that has at least one cell
	containing '<br />' is replaced by one content line per text fragment.
	Every fragment except the last one of a cell gets a trailing backslash.
	Cells with fewer fragments are filled with empty text, and colspan marker
	cells are repeated on every generated line. All other lines are kept
	unchanged.
	
	Args:
		lines: List of strings containing a grid table
		
	Returns:
		List of strings with multiline cells properly formatted
	"""
	result:list[str] = []

	for line in lines:

		# Border lines (and anything else that is not a content line) are kept as is
		if not line.startswith('|'):
			result.append(line)
			continue

		# Split every cell of the content line into its text fragments
		splitCells:list[list[str]] = []
		for cell in line.strip().split('|')[1:-1]:
			if cell.strip() == colspanMarker:
				# Colspan cells are never split
				splitCells.append([colspanMarker])
				continue
			parts = cell.split('<br />')
			# Add "\" to each part except the last
			splitCells.append([ part + '\\' for part in parts[:-1] ] + parts[-1:])

		# Determine the number of output lines from the collected cells only.
		# Colspan cells always count as a single fragment.
		maxLines = max((len(cellParts) for cellParts in splitCells), default = 1)

		# No line breaks, keep original line
		if maxLines <= 1:
			result.append(line)
			continue

		# We found line breaks, create multiple content lines
		for lineIdx in range(maxLines):
			newCells = []
			for cellParts in splitCells:
				if len(cellParts) == 1 and cellParts[0].strip() == colspanMarker:
					# For colspan cells, always use the marker
					text = colspanMarker
				else:
					# Use the part if available, otherwise empty string
					text = cellParts[lineIdx] if lineIdx < len(cellParts) else ''
				newCells.append(text.strip())
			result.append('|' + '|'.join(f' {cell} ' for cell in newCells) + '|')

	return result



def isGridTableStart(line: str) -> bool:
	"""Tell whether a line looks like the top border of a grid table.

	A start line begins and ends with '+', contains at least one '-' and
	does not contain any '='.

	Args:
		line: The line to check.
	
	Returns:
		True if this is a table start line, False otherwise.
	"""
	# Both ends of a border line must be a '+'
	if not (line.startswith('+') and line.endswith('+')):
		return False
	# Dashes are required, and a '=' disqualifies the line
	return '-' in line and '=' not in line
+115 −12
Original line number Diff line number Diff line
@@ -28,6 +28,8 @@ from rich import inspect
import configparser, zipfile
from lxml import etree as ET

from gridTable import markdownToGrid, isGridTableStart, handleMultiLineGridTable, formatGridTable, colspanMarker

class Style(IntEnum):
	code = auto()
	example = auto()
@@ -208,9 +210,34 @@ class DocumentConfiguration(object):
		self.emfConverterSvg = config.get('media', 'emfConverterSvg', fallback = None)


def richString(text:str) -> str:
	"""	Escape a text for the console output.

		Every opening square bracket is prefixed with a backslash.
		NOTE(review): presumably this prevents the console library from
		interpreting bracketed text as markup - confirm.

		Args:
			text: The text to escape.
		
		Return:
			The text with every '[' replaced by '\\['.
	"""
	# Splitting at '[' and joining with the escaped form replaces all occurrences
	return '\\['.join(text.split('['))


def linenumber(idx:int) -> str:
	"""	Format a zero-based index as a one-based line number.

		Args:
			idx: The zero-based index to get the line number for.
		
		Return:
			The line number (idx + 1), zero-padded to a minimum of 5 characters.
	"""
	return format(idx + 1, '05')	# currently 5 digits


def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:bool) -> None:
def processDocuments(documents:list[str], 
					 outDirectory:str, 
					 skipImageConversion:bool,
					 forceMarkdownTables:bool) -> None:
	docs:Dict[str, Tuple[Document, DocumentConfiguration, Any]]		= {}
	ptasks 															= {}
	mediaRelations:Dict[str, str] 									= {}
@@ -590,6 +617,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
			# 	Processing the document			
			lines:list[str] = []
			imageIndex = 1
			lastTableCaption:str = '<unknown caption>'

			for elem in docItems:
				paragraphNr += 1
@@ -672,6 +700,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
							caption = replaceNL(text).strip()
							anchor = f'<a name="table_{caption[6:].split(":")[0].strip()}"></a>' if caption.startswith('Table ') and ':' in caption else ''
							lines.append(f'**{caption}**{anchor}')
							lastTableCaption = caption

						#	Image Caption
						elif style in docConfig.imagecaption:
@@ -724,38 +753,76 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:

						# Print Unhandled tokens also to the console
						else:
							_print(f'{paragraphNr} {style}: {elem.style}: {text}')
							_print(f'[yellow]({linenumber(len(lines))}) Undefined paragraph style "{elem.style.name}":[/yellow] [grey39]{text}')
							lines.append(text)


					case 'Table':
						rows:list[list[str]] = []
						nrRows = 0
						colSpanDetected = False
						for row in elem.rows:
							cells:list[str] = []
							colspanCounter = 0
							for cell in row.cells:
								if not forceMarkdownTables:
									if colspanCounter > 0:
										cells.append(colspanMarker)	# add at least a space
										colspanCounter -= 1
										continue
									if cell._tc.grid_span > 1:
										colSpanDetected = True
										colspanCounter = cell._tc.grid_span - 1
								elif cell._tc.grid_span > 1:
									colSpanDetected = True
								cells.append(f'{getTextFromXML(cell)} ')	# add at least a space
							rows.append(cells)
							nrRows += 1
						
						
						# Warning if this is a single-row table
						if nrRows == 1:
							_print(f'[red]Single-row table found. Such tables cannot be converted to markdown.[/red] Please consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)
							_print(f'[red]({linenumber(len(lines)+2)}) Single-row table found. Such tables cannot be converted to markdown.[/red]Consider to change the following table in the original document:\n[grey39]{rows[0]}', highlight = False)

						lines.append('')	# Add an empty line before a table
						# Warning if a table with colspans is detected
						if colSpanDetected:
							if forceMarkdownTables:
								_print(f'[yellow]({linenumber(len(lines)+2)}) Table with colspans found: [/yellow][grey39]{richString(lastTableCaption)}[/grey39]\nConsider to convert it manually to a grid table', highlight = False)

						tableLines:list[str] = []

						errorDetected:bool = False
						for idx, row in enumerate(rows):

							# Check for a table caption and add separator line
							if idx == 1:
								lines.append('-'.join('|' * (len(row) + 1) ))
								tableLines.append('-'.join('|' * (len(row) + 1) ))

							# # Check if the number of columns is the same as the previous row and add cells if smaller

							if idx > 0 and len(row) != len(rows[idx-1]):
								_print(f'[red]({linenumber(len(lines))}) Number of columns in table row {idx} does not match the previous row.[/red]\nTable may need extra attention', highlight = False)
								errorDetected = True
							
							# Add table row
							lines.append(f'|{"|".join(row)}|'
							tableLines.append(f'|{"|".join(row)}|'
										 .replace('\n', _linebreak))	# replace line breaks in cells
						
						# if colSpanDetected and gridTableForColspan then convert to grid table
						if colSpanDetected and not forceMarkdownTables and not errorDetected:
							lines.append('')	# Add an empty line before a table
							lines.append('<mark>Table with colspans converted to grid table. Please check and adjust manually if necessary.</mark>')
							tableLines = markdownToGrid(tableLines)
						
						lines.append('')	# Add an empty line before a table
						if errorDetected:
							lines.append('<mark>The table below caused an error during conversion and may need extra attention</mark>')
							lines.append('')	# Add an empty line before a table
						lines.extend(tableLines)
						lines.append('')	# Add another empty line after a table
					
					case _:
						_print('[blue] {type(elem).__name__}')
						_print(f'[blue]({linenumber(len(lines))}) {type(elem).__name__}')

			#
			#	Replace non-ascii characters
@@ -772,7 +839,7 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
							line = line.replace(ch, rch)	# we need the line for further replacements
							lines[i] = line
						else:
							_print(f'[yellow]Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
							_print(f'[yellow]({linenumber(i)}) Non-ASCII character (consider to add a replacement in the config.ini file): "{ch}" / {ord(ch)} / {hex(ord(ch))}')
		

			#
@@ -892,10 +959,42 @@ def processDocuments(documents:list[str], outDirectory:str, skipImageConversion:
			#
			#	List unresolved CAPTION markers
			#
			for i in range(len(lines)):
				line = lines[i]
			#
			#	List unresolved CAPTION markers
			#
			for i, line in enumerate(lines):
				if _captionMarker in line:
					_print(f'[yellow]Unresolved / unreferenced figure caption : \[{i}] "{line}"')
					_print(f'[yellow]({linenumber(i)}) Unresolved / unreferenced figure caption: "{line}"[/yellow]')
			

			#
			#	Correct formatting of Grid tables after all other changes have been applied
			#

			if not forceMarkdownTables:
				gridTable:list[str] = []
				result:list[str] = []
				for i, line in enumerate(lines):

					# Check for grid table start
					if isGridTableStart(line) and not gridTable:
						gridTable = [ line ]
						continue
					# Are we in a grid table?
					if gridTable:
						# Is the current line still part of the grid table?
						if line.startswith(('|', '+')):
							gridTable.append(line)
							continue
						# grid table finished. Assign and clear
						gridTable = handleMultiLineGridTable(gridTable)
						result.extend(formatGridTable(gridTable))
						gridTable = []
						continue
					# not in grid table
					result.append(line)
				lines = result
				
			
			#
			#	Write produced Markdown file
@@ -954,6 +1053,7 @@ if __name__ == '__main__':
	parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
	parser.add_argument('--outdir', '-o', action='store', dest='outDirectory', default = 'out', metavar = '<output directory>',  help = 'specify output directory')
	parser.add_argument('--skip-image-conversion', '-sic', action='store_true', dest='skipImageConversion',  help = 'skip image conversion step')
	parser.add_argument('--force-markdown-tables', '-mdt', action='store_true', dest='forceMarkdownTables',  help = 'Force markdown instead of grid format for tables with colspans')

	parser.add_argument('document', nargs = '+', help = 'documents to parse')
	args = parser.parse_args()
@@ -961,5 +1061,8 @@ if __name__ == '__main__':
		# Process documents and print output
	os.makedirs(args.outDirectory, exist_ok = True)

	processDocuments(sorted(args.document), args.outDirectory, args.skipImageConversion)
	processDocuments(sorted(args.document), 
				  	 args.outDirectory, 
					 args.skipImageConversion,
					 args.forceMarkdownTables)