Commit 6c8a9ddc authored by Miguel Angel Reina Ortega
Browse files

Some improvements for grid tables conversion

parent 9d0a1d23
Loading
Loading
Loading
Loading
+146 −38
Original line number Diff line number Diff line
@@ -27,12 +27,13 @@ class GridCell:
		self.auxiliarIndex:int = 0


	def calculateAndSetAlignment(self, headerDelimiterPositions:list[int], defaultAlignments:list[str]) -> None:
	def calculateAndSetAlignment(self) -> None:
		"""	Set the alignment of the cell based on the position of the delimiter. 
		"""
		if self.position is None:
			raise ValueError('Cell position must be set before calculating alignment.')
		
		if hasHeader:
			headerDelimiterIndex = 0
			while headerDelimiterIndex < len(defaultAlignments) and self.position > headerDelimiterPositions[headerDelimiterIndex]:
				headerDelimiterIndex += 1
@@ -44,6 +45,18 @@ class GridCell:
					headerDelimiterIndex += 1
			else:
				raise ValueError('Invalid table formatting')
		else:
			body_delimiter_index = 0
			while body_delimiter_index in range(len(defaultAlignments)) and self.position > delimiterPositions[body_delimiter_index]:
				body_delimiter_index += 1
			if body_delimiter_index in range(len(defaultAlignments)):
				if self.position < delimiterPositions[body_delimiter_index]:
					self.alignment = defaultAlignments[body_delimiter_index]
				elif self.position == delimiterPositions[body_delimiter_index]:
					self.alignment = defaultAlignments[body_delimiter_index]
					body_delimiter_index += 1
			else:
				raise ValueError("Invalid table formatting")

	
	def __str__(self):
@@ -78,6 +91,48 @@ class GridRow():
	def __repr__(self):
		return self.__str__()

	def check_delimiter_alignment(line: str, delimiters: str = "|+") -> bool:
		"""
		Check if delimiters in a row align with expected positions.

		A line is accepted when it starts with a delimiter valid for its kind,
		the last expected position is occupied by a delimiter, and every
		delimiter found after column 0 sits on one of the expected positions.

		Args:
			line: The line of text to check.
			delimiters: String containing valid delimiter characters for lines
				that mix '+' and '|' (default: "|+"). The call sites in this
				module pass the list of expected delimiter positions in this
				slot; a non-string value is therefore used as the expected
				positions and the delimiter characters fall back to "|+".
				When a string is given, the expected positions are read from
				the module-level ``delimiterPositions``.

		Returns:
			bool: True if delimiters align correctly, False otherwise
			(including an empty line or an empty list of expected positions).
		"""
		# NOTE(review): this function takes no `self`, and the visible call
		# sites invoke it as a plain function -- confirm where it should live.
		if not line:
			return False

		if isinstance(delimiters, str):
			expectedPositions = delimiterPositions
			delimiterChars = delimiters
		else:
			# Second positional argument carries the expected positions.
			expectedPositions = list(delimiters)
			delimiterChars = "|+"

		if not expectedPositions:
			return False

		print(f"\nChecking line: '{line}'")
		print(f"Expected delimiter positions: {expectedPositions}")

		if '+' in line and '|' not in line:
			# Full separator line (only +)
			validStarts = ("+",)
			currentPositions = [i for i, char in enumerate(line) if (char == '+' and i != 0)]
			print(f"Full separator line - Found + at positions: {currentPositions}")
		elif '|' in line and '+' not in line:
			# Data line (only |)
			validStarts = ("|",)
			currentPositions = [i for i, char in enumerate(line) if (char == '|' and i != 0)]
			print(f"Data line - Found | at positions: {currentPositions}")
		else:
			# Partial separator (mix of + and |)
			validStarts = ("+", "|")
			currentPositions = [i for i, char in enumerate(line) if (char in delimiterChars and i != 0)]
			print(f"Partial separator - Found delimiters at positions: {currentPositions}")
			print(f"Characters at those positions: {[line[pos] for pos in currentPositions]}")

		# The start and last-position checks are evaluated outside all() so
		# that a line with no delimiter after column 0 is rejected instead of
		# passing vacuously on an empty sequence.
		expected = set(expectedPositions)
		return (
			line.startswith(validStarts)
			and expectedPositions[-1] in currentPositions
			and all(pos in expected for pos in currentPositions)
		)


class GridRowsTracker():
	"""	Represents the document object. """
@@ -112,6 +167,14 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
	:param pandoc_table: String of the Pandoc-style grid table.
	:return: List of lists representing the table with metadata for spans.
	"""
	global hasHeader, defaultAlignments, headerDelimiterPositions, delimiterPositions, nextListElementMark
	
	# Initialize globals
	hasHeader = False
	defaultAlignments:list[str] = []
	headerDelimiterPositions:list[int] = []
	delimiterPositions:list[int] = []
	nextListElementMark = '@'
	
	# Split the input into lines
	lines:list[str] = [line.strip() for line in gridTable.strip().split('\n')]
@@ -131,14 +194,13 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
			cell.colspan += 1
			if _c.startswith('- '):  # List in a cell
				cell.listFlag = True
				cell.content = _c + '\n'  # Add newline to know when the list element ends
			
				_c = re.sub(r'\\\s*$', '\n', _c)
				cell.content = _c + nextListElementMark  # Add list element end mark to know when the list element ends		
			elif cell.listFlag and len(_c) > 0:  # any other content when handling list is concatenated to the last list element
				cell.content = _c + '\n'

				_c = re.sub(r'\\\s*$', '\n', _c)
				cell.content += _c + nextListElementMark #add the list element end mark
			elif not _c:  # separation between list and other paragraph
				cell.listFlag = False
				cell.content = '\n' #if not cell['content'].endswith("\n") else ""
				cell.content += '\n' if not cell['content'].endswith('\n') else ""
			else:
				cell.content = re.sub(r'\\\s*$', '\n', _c)
		else: # Cell has content
@@ -147,11 +209,16 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
					cell.content += '\n'
					#cell['content'] = cell['content'].strip("\n")
				cell.listFlag = True
				cell.content += _c + '\n'  # Add newline to know when the list element ends
			elif cell.listFlag and _c:  # any other content when handling list is concatenated to the last list element
				cell.content = cell.content.strip('\n') + ' ' + _c + '\n'
				_c = re.sub(r'\\\s*$', '\n', _c)
				cell.content += _c + nextListElementMark  # Add list element end mark to know when the list element ends
			elif cell.listFlag and len(_c) > 0:  # any other content when handling list is concatenated to the last list element
				cell.content = cell.content.strip(nextListElementMark) #remove list element end mark
				_c = re.sub(r'\\\s*$', '\n', _c)
				cell.content += " " + _c + nextListElementMark #add list element end mark
			elif len(_c) == 0:  # separation between list and other paragraph
				cell.listFlag = False
				if cell.list_flag:
					cell.list_flag = False
					cell.content += '\n\n' #end list by \n
				#content = re.sub(r'\\\s*$', "\n", content.strip())
				cell.content += '\n' if not cell.content.endswith('\n') else ''
			else:
@@ -202,11 +269,8 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
	

	# Determine delimiter positions and alignments
	hasHeader = False
	headerDelimiterPositions:list[int] = []
	headerRows:GridTableRowList = []
	dataRows:GridTableRowList = []
	defaultAlignments:list[str] = []

	for index in separatorIndices:
		if matchGridTableHeaderSeparator.match(lines[index]):
@@ -217,7 +281,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
			for partIndex in range(len(parts)):
				if parts[partIndex].startswith(':') and not parts[partIndex].endswith(':'):	# Left alignment
					defaultAlignments.append('align="left"')
				elif not parts[partIndex].startswith(":") and parts[partIndex].endswith(":"): # Right alignment
				elif not parts[partIndex].startswith(':') and parts[partIndex].endswith(':'): # Right alignment
					defaultAlignments.append('align="right"')
				else:
					defaultAlignments.append('align="center"')	# Center alignment
@@ -226,6 +290,18 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
				delPositions = [lines[index].find(delimiter, delimiterPositionsStart + 1) for delimiter in '+' if delimiter in lines[index][delimiterPositionsStart + 1:]]
				headerDelimiterPositions.append(min(delPositions) if delPositions else -1)

	if not hasHeader:
		#Set default alignments from the first separator
		parts = re.split(r'\+', lines[0].strip('+'))
		default_alignments = []
		# Calculate default alignments and positions of delimiters
		for part_index in range(len(parts)):
			if parts[part_index].startswith(':') and not parts[part_index].endswith(':'):
				default_alignments.append('align="left"')
			elif not parts[part_index].startswith(':') and parts[part_index].endswith(':'):
				default_alignments.append('align="right"')
			else:
				default_alignments.append('align="center"')

	for rowNumber in range(len(separatorIndices) - 1):
		rows:list[GridRow] = []
@@ -238,6 +314,10 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
			for line in rowLines:
				if isSeparator(line) and not inDataRow:
					inDataRow = True
					# Add delimiter alignment check for separator lines
					if not check_delimiter_alignment(line, delimiterPositions):
						raise ValueError(f"Misaligned delimiters in separator row: {line}")
					
					parts = re.split(r'\s*\+\s*', line.strip('+'))
					delimiterIndex = 0

@@ -254,7 +334,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
							cell.position = delimiterIndex # Position of cell delimiter +
							
							# Set alignment as defined by header separator line
							cell.calculateAndSetAlignment(headerDelimiterPositions, defaultAlignments)
							cell.calculateAndSetAlignment()

							while delimiterIndex > delimiterPositions[columnIndex]:
								columnIndex += 1
@@ -263,7 +343,11 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
				elif inDataRow:
					# Regular data row or partial separator
					if matchGridTableBodySeparator.match(line): # Partial separator
						cellsContent = re.split(r"[\|\+]", line.strip("|").strip("+"))  # (?<!\\)[\|\+]
						# Add delimiter alignment check for partial separators
						if not check_delimiter_alignment(line, delimiterPositions):
							raise ValueError(f"Misaligned delimiters in partial separator: {line}")

						cellsContent = re.split(r"[\|\+]", line.strip('|').strip('+'))  # (?<!\\)[\|\+]
						#Add another row, set delimiters for each cell
						rows.append(GridRow(numberOfColumns))
						auxDelimiterIndex = 0
@@ -274,7 +358,7 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
								auxDelimiterIndex += len(content) + 1
								cell = rows[-1][auxiliarCellIndex]
								cell.position = auxDelimiterIndex  # Position of cell delimiter +
								cell.calculateAndSetAlignment(headerDelimiterPositions, defaultAlignments)
								cell.calculateAndSetAlignment()
								while auxDelimiterIndex > delimiterPositions[auxiliarCellIndex]:
									auxiliarCellIndex += 1
								auxiliarCellIndex += 1
@@ -318,7 +402,13 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
							raise ValueError("More cells than columns found")
						
					else: # Data row
						cellsContent = re.split(r'\s*\|\s*', line.strip('|'))
						cellsContent = line.strip()
						cellsContent = re.split(r"\|", line.strip('|'))
						
						# Add delimiter alignment check
						if not check_delimiter_alignment(line, delimiterPositions):
							raise ValueError(f"Misaligned delimiters in row: {line}")
							
						columnCellIndex = 0
						if len(cellsContent) < numberOfColumns: # Colspan: Positions of | with respect to + need to be determined
							for columnIndex, content in enumerate(cellsContent):
@@ -347,6 +437,10 @@ def parseGridTableWithSpans(gridTable:str) -> tuple[GridTableRowList, GridTableR
			elif hasHeader and start < headerSeparatorIndex: # table_row and auxiliar_row are part of header_rows
				for row in rows:	# header rows
					headerRows.append(row.cells)
			else:
				#only body
				for row in rows:
					dataRows.append(row.cells)

	# Check if there are any data rows
	if not dataRows and not headerRows:
@@ -432,13 +526,27 @@ def generateHtmlTableWithSpans(gridTable:str) -> str:
		Returns:
			The HTML table in string format.
	"""
	debug_output = []
	def debug_print(msg):
		debug_output.append(str(msg))  # Convert message to string

	try:
		# Redirect print statements to our debug collector
		global print
		original_print = print
		print = debug_print

		gridHeader, gridBody = parseGridTableWithSpans(gridTable)
		
		# Restore original print
		print = original_print

	except Exception as e:
		import traceback
		traceback.print_exc()
		return f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS. {e}'
		debug_print("Grid table could not be generated")
		debug_text = "<br>".join(debug_output)  # Now all items are strings
		return f'HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE.<br><pre>{debug_text}</pre>'
		
	# Generate table HTML...
	html = '<table>\n'
	hasHeader = False

@@ -457,13 +565,13 @@ def generateHtmlTableWithSpans(gridTable:str) -> str:
					continue
				else:
					# Prepare content, in case there's a list
					if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content)):  # Update cell in new row
					if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content)):  # Update cell in new row
						list = "<ul>"
						# Build list the matches
						for match in matches:
							list += "<li>" + match[1] + "</li>"
						list += "</ul>"
						cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content)
						cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+", list, cell.content)
						# Enforce left alignment if cell contains a list
						cell.alignment = "align=\"left\""

@@ -482,13 +590,13 @@ def generateHtmlTableWithSpans(gridTable:str) -> str:
				continue
			else:
				#Prepare content, in case there's a list
				if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content)):  # Update cell in new row
				if cell.content is not None and (matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content)):  # Update cell in new row
					list = "<ul>"
					# Build list the matches
					for match in matches:
						list += "<li>" + match[1] + "</li>"
					list += "</ul>"
					cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content)
					cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+",list, cell.content)
					# Enforce left alignment if cell contains a list
					cell.alignment = "align=\"left\""