Commit 9d0a1d23 authored by ankraft's avatar ankraft
Browse files

Moved functions in different modules. Some optimizations

parent d2bf6d75
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
BSD 3-Clause License

Copyright (c) 2024, Miguel Angel Reina Ortega
Copyright (c) 2024, Miguel Angel Reina Ortega & Andreas Kraft

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
+503 −0
Original line number Diff line number Diff line
#
#	gridTableTools.py
#
#	(c) 2025 by Miguel Angel Reina Ortega & Andreas Kraft
#	License: BSD 3-Clause License. See the LICENSE file for further details.
#
""" Tools for working with grid tables in markdown files. """

from typing import Optional
from regexMatches import *



class GridCell:
	"""	One cell of a grid table, holding its text plus span and layout metadata. """

	def __init__(self) -> None:
		"""	Create an empty cell: no content, zero spans, centered alignment, no position.
		"""
		self.content:Optional[str] = None		# Cell text, None until content is assigned
		self.rowspan:int = 0
		self.colspan:int = 0
		self.colspanAdjusted:bool = False		# Set to True once the colspan has been adjusted
		self.alignment:str = 'align="center"'	# HTML alignment attribute
		self.position:Optional[int] = None		# Must be set before calculateAndSetAlignment() is called
		self.listFlag:bool = False
		self.auxiliarIndex:int = 0


	def calculateAndSetAlignment(self, headerDelimiterPositions:list[int], defaultAlignments:list[str]) -> None:
		"""	Pick the cell's alignment from the header column it falls into.

			The cell takes the alignment of the first column whose header delimiter
			position is not smaller than the cell's own position.

			Args:
				headerDelimiterPositions: Delimiter positions of the header separator line, one per column.
				defaultAlignments: Alignment attribute strings, one per column.

			Raises:
				ValueError: The position is not set, or it lies beyond every header delimiter.
		"""
		if self.position is None:
			raise ValueError('Cell position must be set before calculating alignment.')

		for column, columnAlignment in enumerate(defaultAlignments):
			if self.position <= headerDelimiterPositions[column]:
				self.alignment = columnAlignment
				return

		# No header column reaches as far as this cell
		raise ValueError('Invalid table formatting')


	def __str__(self):
		"""	Return a one-line summary of the cell's fields. """
		fields = (
			f'Content: {self.content}',
			f'Rowspan: {self.rowspan}',
			f'Colspan: {self.colspan}',
			f'Alignment: {self.alignment}',
			f'Position: {self.position}',
			f'ListFlag: {self.listFlag}',
			f'AuxiliarIndex: {self.auxiliarIndex}',
		)
		return '(' + ', '.join(fields) + ')'


	def __repr__(self):
		"""	Same text as __str__. """
		return str(self)


class GridRow():
	"""	A row of a grid table: an indexable container of GridCell objects.

		Cells are read and replaced with the subscript syntax, e.g. ``row[2]``.
	"""
	# Annotation only. The list itself is created per instance in __init__, so
	# no mutable list object is shared between instances through the class.
	cells:list[GridCell]


	def __init__(self, length:int = 1) -> None:
		"""	Create a row of `length` new, empty cells.

			Args:
				length: Number of cells in the row. Defaults to 1.
		"""
		self.cells = [GridCell() for _ in range(length)]


	def __getitem__(self, item):
		"""	Return the cell(s) stored at index or slice `item`. """
		return self.cells[item]


	def __setitem__(self, key, value) -> None:
		"""	Replace the cell(s) stored at index or slice `key` with `value`. """
		self.cells[key] = value


	def __str__(self) -> str:
		"""	Return the string form of the list of cells. """
		return str(self.cells)


	def __repr__(self) -> str:
		"""	Same text as __str__. """
		return self.__str__()


class GridRowsTracker():
	"""	Fixed-size list of integer counters that all start at 0.

		parseGridTableWithSpans() creates one with a slot per table column and uses
		the value stored in a slot as an index into its list of rows.
	"""
	def __init__(self, size:int) -> None:
		"""	Create `size` counters, each initialised to 0. """
		self.gridRowTracker = [0] * size


	def __getitem__(self, item:int) -> int:
		"""	Return the counter stored in slot `item`. """
		return self.gridRowTracker[item]


	def __setitem__(self, key:int, value:int) -> None:
		"""	Store `value` in slot `key`. """
		self.gridRowTracker[key] = value


	def __str__(self):
		"""	Return the string form of the list of counters. """
		return f'{self.gridRowTracker}'


	def __repr__(self):
		"""	Same text as __str__. """
		return str(self)


# Type aliases used for the signature and the result lists of parseGridTableWithSpans()
GridTableRow = list[GridCell]			# The cells of a single table row
GridTableRowList = list[GridTableRow]	# A list of table rows

def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableRowList]:
	"""	Parse a Pandoc-style grid table into header and data rows that carry rowspan and colspan metadata.

		Args:
			gridTable: The Pandoc-style grid table as a single string.

		Returns:
			A tuple (headerRows, dataRows). Each element is a list of rows, and each row is a list of GridCell objects.

		Raises:
			ValueError: No separator line was found, a cell or row is malformed, no rows could be
				extracted, or the spans of a row do not add up to the number of columns.
	"""

	# Split the input into lines
	lines:list[str] = [line.strip() for line in gridTable.strip().split('\n')]


	# Detect separator lines by pattern (it does not take into account partial separators)
	def isSeparator(line:str) -> bool:
		return matchGridTableSeparator.match(line) is not None


	# Set content on the cell - concatenating multilines, flagging lists
	def handleCellContent(cell:GridCell, content:str) -> None:
		_c = content.strip()

		if cell.content is None:	# Previous empty cell
			# First content for this cell: it now occupies (at least) one row and one column
			cell.rowspan += 1
			cell.colspan += 1
			if _c.startswith('- '):  # List in a cell
				cell.listFlag = True
				cell.content = _c + '\n'  # Add newline to know when the list element ends
			
			elif cell.listFlag and len(_c) > 0:  # any other content when handling list is concatenated to the last list element
				cell.content = _c + '\n'

			elif not _c:  # separation between list and other paragraph
				cell.listFlag = False
				cell.content = '\n' #if not cell['content'].endswith("\n") else ""
			else:
				# A trailing backslash (plus optional whitespace) becomes a line break
				cell.content = re.sub(r'\\\s*$', '\n', _c)
		else: # Cell has content
			if _c.startswith('- '):  # List
				if not cell.listFlag:
					cell.content += '\n'
					#cell['content'] = cell['content'].strip("\n")
				cell.listFlag = True
				cell.content += _c + '\n'  # Add newline to know when the list element ends
			elif cell.listFlag and _c:  # any other content when handling list is concatenated to the last list element
				cell.content = cell.content.strip('\n') + ' ' + _c + '\n'
			elif len(_c) == 0:  # separation between list and other paragraph
				cell.listFlag = False
				#content = re.sub(r'\\\s*$', "\n", content.strip())
				cell.content += '\n' if not cell.content.endswith('\n') else ''
			else:
				# Continuation line: append with a space; a trailing backslash becomes a line break
				cell.content += ' ' + re.sub(r'\\\s*$', '\n', _c)

	# Adjust colspan of a cell
	def adjustColspan(row:GridRow, columnIndex:int, numberOfParts:int, line, numberOfColumns:int, delimiterPositions:list[int]) -> None:
		for j in range(columnIndex, numberOfParts):
			# Start searching right after the closest previous cell that has a position set
			delimiterStart:Optional[int] = None
			colI = columnIndex
			while delimiterStart == None:
				delimiterStart = row[colI - 1].position if colI > 0 else 0
				colI -= 1
			# Position of the next '|' or '+' in the line after delimiterStart, -1 if there is none
			positions = [line.find(delimiter, delimiterStart + 1) for delimiter in "|+" if delimiter in line[delimiterStart + 1:]]
			position = min(positions) if positions else -1
			if position > delimiterPositions[j]:  # Colspan to be increased
				row[columnIndex].colspan += 1
				if position == delimiterPositions[len(delimiterPositions) - 1]:  # last cell in row, adjust colspan to get max number columns
					colspan_allocated = row[columnIndex].colspan
					#for cell_index in range(number_of_parts):
					#	colspan_allocated += row[cell_index].colspan
					row[columnIndex].colspan += numberOfColumns - colspan_allocated - columnIndex
			elif position < delimiterPositions[j]:
				raise ValueError("Wrong cell formatting")
			else:
				break

		row[columnIndex].colspanAdjusted = True	# Mark cell as adjusted


	# Indices of all full separator lines
	separatorIndices = [i for i, line in enumerate(lines) if isSeparator(line)]

	if not separatorIndices:
		raise ValueError('No valid separators found in the provided grid table.')

	# Calculate max number of columns
	delimiterPositions:list[int] = []
	numberOfColumns = 0

	# The separator line with the most '+' defines the number of columns and the column delimiter positions
	for separatorIndex in separatorIndices:
		if (_cnt := lines[separatorIndex].count('+') - 1) > numberOfColumns:
			numberOfColumns = _cnt
			delimiterPositions = []
			for rowIndex in range(numberOfColumns):
				delimiterPositionsStart = delimiterPositions[rowIndex - 1] if rowIndex != 0 else 0
				delPositions = [lines[separatorIndex].find(delimiter, delimiterPositionsStart + 1) for delimiter in '+' if delimiter in lines[separatorIndex][delimiterPositionsStart + 1:]]
				delimiterPositions.append(min(delPositions) if delPositions else -1)
	

	# Determine delimiter positions and alignments
	hasHeader = False
	headerDelimiterPositions:list[int] = []
	headerRows:GridTableRowList = []
	dataRows:GridTableRowList = []
	defaultAlignments:list[str] = []

	for index in separatorIndices:
		if matchGridTableHeaderSeparator.match(lines[index]):
			hasHeader = True
			headerSeparatorIndex = index
			parts = re.split(r'\+', lines[index].strip('+'))
			#Calculate default alignments and positions of delimiters
			for partIndex in range(len(parts)):
				if parts[partIndex].startswith(':') and not parts[partIndex].endswith(':'):	# Left alignment
					defaultAlignments.append('align="left"')
				elif not parts[partIndex].startswith(":") and parts[partIndex].endswith(":"): # Right alignment
					defaultAlignments.append('align="right"')
				else:
					defaultAlignments.append('align="center"')	# Center alignment
				# Delimiter position
				delimiterPositionsStart = delimiterPositions[partIndex - 1] if partIndex != 0 else 0
				delPositions = [lines[index].find(delimiter, delimiterPositionsStart + 1) for delimiter in '+' if delimiter in lines[index][delimiterPositionsStart + 1:]]
				headerDelimiterPositions.append(min(delPositions) if delPositions else -1)


	# Process the lines between each pair of consecutive full separator lines
	for rowNumber in range(len(separatorIndices) - 1):
		rows:list[GridRow] = []
		rowsTracker:GridRowsTracker
		inDataRow = False
		start, end = separatorIndices[rowNumber], separatorIndices[rowNumber + 1]
		rowLines = lines[start:end]  # Lines between separators including separator line start as it gives information about the number of columns of the row
		if rowLines:
			# Combine multiline content into single strings for each cell
			for line in rowLines:
				if isSeparator(line) and not inDataRow:
					# Opening separator line: create the first row and set position and alignment of its cells
					inDataRow = True
					parts = re.split(r'\s*\+\s*', line.strip('+'))
					delimiterIndex = 0

					rows.append(GridRow(numberOfColumns))
					rowsTracker = GridRowsTracker(numberOfColumns)
					columnIndex = 0

					for rowIndex in range(len(parts)):
						if columnIndex in range(numberOfColumns):
							delimiterIndex += len(parts[rowIndex]) + 1
							cell = rows[-1][columnIndex]
							
							# Set position
							cell.position = delimiterIndex # Position of cell delimiter +
							
							# Set alignment as defined by header separator line
							cell.calculateAndSetAlignment(headerDelimiterPositions, defaultAlignments)

							# Skip the columns that are covered by this cell
							while delimiterIndex > delimiterPositions[columnIndex]:
								columnIndex += 1
							columnIndex += 1

				elif inDataRow:
					# Regular data row or partial separator
					if matchGridTableBodySeparator.match(line): # Partial separator
						cellsContent = re.split(r"[\|\+]", line.strip("|").strip("+"))  # (?<!\\)[\|\+]
						#Add another row, set delimiters for each cell
						rows.append(GridRow(numberOfColumns))
						auxDelimiterIndex = 0
						auxiliarCellIndex = 0

						for columnIndex, content in enumerate(cellsContent):
							if auxiliarCellIndex in range(numberOfColumns):
								auxDelimiterIndex += len(content) + 1
								cell = rows[-1][auxiliarCellIndex]
								cell.position = auxDelimiterIndex  # Position of cell delimiter +
								cell.calculateAndSetAlignment(headerDelimiterPositions, defaultAlignments)
								while auxDelimiterIndex > delimiterPositions[auxiliarCellIndex]:
									auxiliarCellIndex += 1
								auxiliarCellIndex += 1

						if len(cellsContent) <= numberOfColumns: # Colspan: Positions of | with respect to + need to be determined
							columnCellIndex = 0

							# Go through all cells in a column
							for columnIndex, content in enumerate(cellsContent):
								rowIndex = rowsTracker[columnCellIndex]
								cell = rows[rowIndex][columnCellIndex]

								# Check whether a cell contains a header separator
								if matchGridTableBodySeparatorLine.match(content):  # A new row is to be added
									rowsTracker[columnCellIndex] += 1	# That actual row will have more than one row
									cell.listFlag = False
									columnForward = 0
								
									for delIndex in range(columnCellIndex, len(delimiterPositions)):
										rowIndex = rowsTracker[columnCellIndex]	# Correcting the rowIndex. Might have been changed by a previous iteration
										if rows[rowIndex][columnCellIndex].position >= delimiterPositions[delIndex]:
											columnForward += 1
											rowsTracker[columnCellIndex + columnForward - 1] += 1 if columnForward > 1 else 0
									columnCellIndex += columnForward

									continue

								else:
									# Handle content of the cell
									handleCellContent(cell, cellsContent[columnIndex])
									cell.rowspan += 1
									if not cell.colspanAdjusted:
										# TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
										adjustColspan(rows[rowIndex], columnCellIndex, numberOfColumns, line, numberOfColumns, delimiterPositions)

									if cell.position >= delimiterPositions[columnCellIndex]:
										columnCellIndex += cell.colspan if cell.colspan != 0 else 1
									continue

						else:
							raise ValueError("More cells than columns found")
						
					else: # Data row
						cellsContent = re.split(r'\s*\|\s*', line.strip('|'))
						columnCellIndex = 0
						if len(cellsContent) < numberOfColumns: # Colspan: Positions of | with respect to + need to be determined
							for columnIndex, content in enumerate(cellsContent):
								row = rows[rowsTracker[columnCellIndex]]
								cell = row[columnCellIndex]
								# Handle content of the cell
								handleCellContent(cell, content)
								if not cell.colspanAdjusted:
									#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
									adjustColspan(row, columnCellIndex, numberOfColumns, line, numberOfColumns, delimiterPositions)
								if cell.position >= delimiterPositions[columnCellIndex]:
									columnCellIndex += cell.colspan  # Move forward index i

						elif len(cellsContent) == numberOfColumns: # Simple row
							for columnIndex, content in enumerate(cellsContent):
								rowIndex = rowsTracker[columnIndex]
								handleCellContent(rows[rowIndex][columnIndex], content)
						else:
							raise ValueError("More cells than columns found")
				else:
					raise ValueError("No separator line found for row starting")

			# NOTE(review): rows are only collected when a header separator line was found (both branches require hasHeader)
			if hasHeader and start >= headerSeparatorIndex: # table_row and auxiliar_row are part of data_rows
				for row in rows:
					dataRows.append(row.cells)
			elif hasHeader and start < headerSeparatorIndex: # table_row and auxiliar_row are part of header_rows
				for row in rows:	# header rows
					headerRows.append(row.cells)

	# Check if there are any data rows
	if not dataRows and not headerRows:
		raise ValueError('No valid rows found in the provided grid table.')

	# Format text
	for gridRows in [headerRows, dataRows]:
		for gridRow in gridRows:
			for cell in gridRow:
				if cell.content is not None:
					# Replacing "<" by &lt;
					cell.content = cell.content.replace("<", "&lt;")

					# Bold replacements
					# Regex to detect markdown bold formatting in cell content
					if cell.content is not None:
						cell.content = matchBold.sub(r'<strong>\g<text></strong>', cell.content)
					
					# Italic replacements
					# Regex to detect markdown italic formatting in cell content
					if cell.content is not None:
						cell.content = matchItalic.sub(r'<i>\g<text></i>', cell.content)


	# Replace newline characters by HTML line breaks
	for headerRow in headerRows:
		for cell in headerRow:
			cell.content = cell.content.replace('\n', '<br />') if cell.content is not None else None
	for dataRow in dataRows:
		for cell in dataRow:
			cell.content = cell.content.replace('\n', '<br />') if cell.content is not None else None

	#
	# Checking that the grid is correct. Not too much tested - need to take into account rowspan of previous rows
	#

	# Checking the header rows
	# forwardRowspan[i] holds the number of following rows still covered by a rowspan that started in column i
	forwardRowspan:list[int] = []
	for idx, headerRow in enumerate(headerRows):
		if len(forwardRowspan) == 0:
			forwardRowspan = [0] * len(headerRows[idx])
		sum = 0

		for cellIndex, cell in enumerate(headerRow):
			sum += cell.colspan
			if idx > 0 and cell.colspan == 0:
				if forwardRowspan[cellIndex] > 0:
					sum += 1
				forwardRowspan[cellIndex] -= 1
			if forwardRowspan[cellIndex] == 0 and cell.rowspan > 1:
				forwardRowspan[cellIndex] = cell.rowspan -1

		# The colspans of a row (plus columns covered by rowspans from above) must add up to the number of columns
		if not sum == numberOfColumns:
			raise ValueError('Grid table not converted properly')

	# Checking the data rows (same check as for the header rows)
	forwardRowspan = []
	for idx, dataRow in enumerate(dataRows):
		if len(forwardRowspan) == 0:
			forwardRowspan = [0] * len(dataRows[idx])
		sum = 0

		for cellIndex, cell in enumerate(dataRows[idx]):
			sum += cell.colspan
			if idx > 0 and cell.colspan == 0:
				if forwardRowspan[cellIndex] > 0:
					sum += 1
				forwardRowspan[cellIndex] -= 1
			if forwardRowspan[cellIndex] == 0 and cell.rowspan > 1:
				forwardRowspan[cellIndex] = cell.rowspan - 1
		if not sum == numberOfColumns:
			raise ValueError('Grid table not converted properly')

	return headerRows, dataRows


# Patterns for detecting lists in cell content whose line breaks are already '<br />'.
# A single list item: a marker ('-', '*', '+' or a number followed by '.'),
# whitespace, and the item text up to the next '<br />'.
_matchListItemInCell = re.compile(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>")
# A run of one or more directly consecutive list items.
_matchListRunInCell = re.compile(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+")


def _renderListsInCell(cell:'GridCell') -> None:
	"""	Replace every run of list items in the cell's content by an HTML `<ul>` list.

		Each run is replaced by a list of its own items only. Numbered items are
		rendered as `<ul>` as well. A cell that contains a list is forced to left
		alignment. Cells without content or without list items are left untouched.

		Args:
			cell: The cell to update in place.
	"""
	if cell.content is None:
		return

	def _runToHtmlList(run:'re.Match') -> str:
		items = ''.join(f'<li>{text}</li>' for _, text in _matchListItemInCell.findall(run.group(0)))
		return f'<ul>{items}</ul>'

	# A function is used as replacement (instead of a string) so that backslashes
	# in the item text are not interpreted as escapes of a replacement template.
	content, numberOfLists = _matchListRunInCell.subn(_runToHtmlList, cell.content)
	if numberOfLists:
		cell.content = content
		# Enforce left alignment if cell contains a list
		cell.alignment = "align=\"left\""


def _renderTableRows(rows:list, tag:str) -> str:
	"""	Render rows as HTML `<tr>` elements.

		Cells with a rowspan or colspan of 0 are skipped, because they are covered
		by the span of another cell.

		Args:
			rows: The rows to render, each a list of cells.
			tag: The HTML element name for the cells, 'th' or 'td'.

		Returns:
			The HTML for all rows.
	"""
	parts:list[str] = []
	for row in rows:
		parts.append("        <tr>\n")
		for cell in row:
			if cell.rowspan == 0 or cell.colspan == 0:
				continue
			_renderListsInCell(cell)
			rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else ""
			colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else ""
			parts.append(f"            <{tag}{rowspan}{colspan} {cell.alignment}>{cell.content}</{tag}>\n")
		parts.append("        </tr>\n")
	return ''.join(parts)


def generateHtmlTableWithSpans(gridTable:str) -> str:
	"""	Generate an HTML table from a Pandoc-style grid table with row and column spans.

		Args:
			gridTable: The Pandoc-style grid table.

		Returns:
			The HTML table in string format. If the grid table cannot be parsed, the
			traceback is printed and an error text (not HTML) is returned instead.
	"""
	try:
		gridHeader, gridBody = parseGridTableWithSpans(gridTable)
	except Exception as e:
		# Deliberate best-effort boundary: report the problem in the output instead of aborting the conversion
		import traceback
		traceback.print_exc()
		return f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS. {e}'

	# A <thead> is only emitted when at least one header cell is actually rendered
	hasHeader = any(cell.rowspan != 0 and cell.colspan != 0 for row in gridHeader for cell in row)

	html = '<table>\n'
	if hasHeader:
		html += '    <thead>\n'
		html += _renderTableRows(gridHeader, 'th')
		html += "    </thead>\n"

	html += "    <tbody>\n"
	html += _renderTableRows(gridBody, 'td')
	html += "    </tbody>\n"
	html += "</table>"
	return html
+494 −0

File added.

Preview size limit exceeded, changes collapsed.

+40 −0
Original line number Diff line number Diff line
#
# 	regexMatches.py
#
#	(c) 2025 by Andreas Kraft & Miguel Angel Reina Ortega
#	License: BSD 3-Clause License. See the LICENSE file for further details.

#
""" This module contains the regular expressions used in the markdown processing.
"""

import re


# Regular expressions

# A line starting with exactly two whitespace characters followed by '-'
match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
# Footnote definition '[^label]:' - group 1 is the label
matchFootnote = re.compile(r'\[\^([^\]]*)\]:', re.IGNORECASE)
# HTML anchor '<a name="...">...</a>' - group 1 is the name
matchHtmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
# HTML link '<a href="...">...</a>' - group 1 is the target
matchHtmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
# Any HTML tag
matchHtmlTag = re.compile(r'<[^>]*>', re.IGNORECASE)
# Footnote reference '[^label]' - group 1 is the label
MatchInlineFootnote = re.compile(r'\[\^([^\]]*)\]', re.IGNORECASE)
# Markdown link to an anchor '[text](#anchor)' that is not preceded by '!' - group 1 is '#anchor'
markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
# Code fence lines
matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
# Grid table lines: start of a table, body ('-') and header ('=') separators,
# the content of a single separator segment, and full separator lines
matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)
# Markdown header - group 1 is the run of '#', group 2 the header text
matchHeader = re.compile(r'(#+)\s+(.*)', re.IGNORECASE)
matchHeaderNumber = re.compile(r'\b[A-Za-z0-9]\d*(\.\d+)*\b', re.IGNORECASE)
# List item - named groups 'marker' ('-', '*', '+' or a number followed by '.') and 'content'.
# The named groups need the '(?P<name>...)' syntax; without the '?' the pattern
# would look for the literal text 'P<marker>' and define no named groups.
matchListInContent = re.compile(r'^(?:\s*(?P<marker>[-*+]|\s*\d+\.))\s+(?P<content>.+)$', re.IGNORECASE)
# Block quote / note lines starting with '>'
matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
matchNoteStart = re.compile(r'^\s*>\s*(note)?\s*[:]?\s*', re.IGNORECASE)
# Image at the start of a line '![alt](path)' - group 1 is the path
matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE)
# Pipe table row and pipe table separator line
matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE)
matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)

# Bold ('**text**' or '__text__') and italic ('*text*' or '_text_') spans that are
# delimited by whitespace or the start/end of the string - named group 'text' is the inner text
matchBold = re.compile(r'(?<!\S)(\*\*|__)(?P<text>.+?)(?<!\\)\1(?!\S)')
matchItalic = re.compile(r'(?<!\S)(\*|_)(?P<text>.+?)(?<!\\)\1(?!\S)')
+94 −1001

File changed.

Preview size limit exceeded, changes collapsed.