Commit 944ea98e authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

Improvements for grid tables conversion

parent ecd971f0
Loading
Loading
Loading
Loading
+141 −31
Original line number Diff line number Diff line
@@ -486,6 +486,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
			self.list_flag = False

		def set_alignment(self):
			if has_header:
				header_delimiter_index = 0
				while header_delimiter_index in range(len(default_alignments)) and self.position > header_delimiter_positions[header_delimiter_index]:
					header_delimiter_index += 1
@@ -497,7 +498,19 @@ def parse_pandoc_table_with_spans(pandoc_table):
						header_delimiter_index += 1
				else:
					raise ValueError("Invalid table formatting")

			else:
				body_delimiter_index = 0
				while body_delimiter_index in range(len(default_alignments)) and self.position > \
						delimiter_positions[body_delimiter_index]:
					body_delimiter_index += 1
				if body_delimiter_index in range(len(default_alignments)):
					if self.position < delimiter_positions[body_delimiter_index]:
						self.alignment = default_alignments[body_delimiter_index]
					elif self.position == delimiter_positions[body_delimiter_index]:
						self.alignment = default_alignments[body_delimiter_index]
						body_delimiter_index += 1
				else:
					raise ValueError("Invalid table formatting")
	class Row():
		"""	Represents a row in the markdown file. """
		cells:list[Cell] = []
@@ -534,12 +547,15 @@ def parse_pandoc_table_with_spans(pandoc_table):
			if content.strip().startswith("- "):  # List
				cell.list_flag = True
				#print(content)
				cell.content = content.strip() + "\n"  # Add newline to know when the list element ends
				content = re.sub(r'\\\s*$', "\n", content.strip())
				cell.content = content + "@"  # Add list element end mark to know when the list element ends
			elif cell.list_flag and content.strip() != "":  # any other content when handling list is concatenated to the last list element
				cell.content += content.strip() + "\n"
				content = re.sub(r'\\\s*$', "\n", content.strip())
				cell.content += content + "@" #add the list element end mark
			elif content.strip == "":  # separation between list and other paragraph
				cell.list_flag = False
				cell.content += "\n" #if not cell['content'].endswith("\n") else ""
				#if cell.list_flag:
				#	cell.list_flag = False
				cell.content += "\n" if not cell['content'].endswith("\n") else ""
			else:
				cell.content = re.sub(r'\\\s*$', "\n", content.strip())
		else:
@@ -548,12 +564,16 @@ def parse_pandoc_table_with_spans(pandoc_table):
					cell.content += "\n"
					#cell['content'] = cell['content'].strip("\n")
				cell.list_flag = True
				cell.content += content.strip() + "\n"  # Add newline to know when the list element ends
				content = re.sub(r'\\\s*$', "\n", content.strip())
				cell.content += content + "@"  # Add list element end mark to know when the list element ends
			elif cell.list_flag and content.strip() != "":  # any other content when handling list is concatenated to the last list element
				cell.content = cell.content.strip("\n")
				cell.content += " " + content.strip() + "\n"
				cell.content = cell.content.strip("@") #remove list element end mark
				content = re.sub(r'\\\s*$', "\n", content.strip())
				cell.content += " " + content + "@" #add list element end mark
			elif content.strip() == "":  # separation between list and other paragraph
				if cell.list_flag:
					cell.list_flag = False
					cell.content += "\n\n" #end list by \n
				#content = re.sub(r'\\\s*$', "\n", content.strip())
				cell.content += "\n" if not cell.content.endswith("\n") else ""
			else:
@@ -604,6 +624,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
				delimiter_positions.append(min(del_positions) if del_positions else -1)
	has_header = False
	header_delimiter_positions = []
	header_rows = []
	for index in separator_indices:
		if _matchGridTableHeaderSeparator.match(lines[index]):
			has_header = True
@@ -624,6 +645,18 @@ def parse_pandoc_table_with_spans(pandoc_table):
				del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]]
				header_delimiter_positions.append(min(del_positions) if del_positions else -1)

	if not has_header:
		#Set default alignments from the first separator
		parts = re.split(r"\+", lines[0].strip("+"))
		default_alignments = []
		# Calculate default alignments and positions of delimiters
		for part_index in range(len(parts)):
			if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
				default_alignments.append("align=\"left\"")
			elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
				default_alignments.append("align=\"right\"")
			else:
				default_alignments.append("align=\"center\"")
	data_rows = []
	for row in range(len(separator_indices) - 1):
		rows = []
@@ -636,6 +669,10 @@ def parse_pandoc_table_with_spans(pandoc_table):
			for line in row_lines:
				if is_separator(line) and not in_data_row:
					in_data_row = True
					# Add delimiter alignment check for separator lines
					if not check_delimiter_alignment(line, delimiter_positions):
						raise ValueError(f"Misaligned delimiters in separator row: {line}")
					
					parts = re.split(r"\s*\+\s*", line.strip("+"))
					delimiter_index = 0
					# Determine the alignment of the cell. To replicate Pandoc's behaviour, alignment colons are not supported on regular separator lines (only on the header separator)
@@ -667,7 +704,11 @@ def parse_pandoc_table_with_spans(pandoc_table):
				elif in_data_row:
					# Regular data row or partial separator
					if _matchGridTableBodySeparator.match(line): # Partial separator
						cells_content = re.split(r"[\|\+]", line.strip("|").strip("+"))  # (?<!\\)[\|\+]
						# Add delimiter alignment check for partial separators
						if not check_delimiter_alignment(line, delimiter_positions):
							raise ValueError(f"Misaligned delimiters in partial separator: {line}")
							
						cells_content = re.split(r"[\|\+]", line.strip("|").strip("+"))
						#Add another row, set delimiters for each cell
						rows.append(Row(number_of_columns))
						aux_delimiter_index = 0
@@ -717,7 +758,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
						else:
							raise ValueError("More cells than columns found")
					else: # Data row
						cells_content = re.split(r"\s*\|\s*", line.strip("|"))
						cells_content = line.strip()
						cells_content = re.split(r"\|", line.strip("|"))
						
						# Add delimiter alignment check
						if not check_delimiter_alignment(line, delimiter_positions):
							raise ValueError(f"Misaligned delimiters in row: {line}")
							
						column_index = 0
						if len(cells_content) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							for i in range(len(cells_content)):
@@ -744,6 +791,10 @@ def parse_pandoc_table_with_spans(pandoc_table):
			elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows
				for header_row in rows:
					header_rows.append(header_row.cells)
			else:
				#only body
				for body_row in rows:
					data_rows.append(body_row.cells)

	#print(header_rows)
	#print(data_rows)
@@ -821,19 +872,32 @@ def parse_pandoc_table_with_spans(pandoc_table):

	return header_rows, data_rows

def generate_html_table_with_spans(pandoc_table):
def generate_html_table_with_spans(pandoc_table: str) -> str:
	"""
	Generate an HTML table from a Pandoc-style grid table with row and column spans.

	:param pandoc_table: String of the Pandoc-style grid table.
	:return: HTML string.
	Args:
		pandoc_table (str): String of the Pandoc-style grid table.

	Returns:
		str: Generated HTML table markup, or error message if generation fails.
	"""
	debug_output = []
	def debug_print(msg):
		debug_output.append(str(msg))  # Convert message to string

	try:
		# Redirect print statements to our debug collector
		global print
		original_print = print
		print = debug_print
		
		grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table)
	except:
		logging.ERROR("Grid table could not be generated")
		return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS"
	else:
		
		# Restore original print
		print = original_print
		
		# Generate table HTML...
		html = "<table>\n"
		has_header = False

@@ -851,7 +915,7 @@ def generate_html_table_with_spans(pandoc_table):
					else:
						# Prepare content, in case there's a list
						#print(cell.content)
						if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
						if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@",
												 cell.content):  # Update cell in new row
							#print("MATCHING")
							list = "<ul>"
@@ -859,7 +923,7 @@ def generate_html_table_with_spans(pandoc_table):
							for match in matches:
								list += "<li>" + match[1] + "</li>"
							list += "</ul>"
							cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content)
							cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+(?:(?!@).)+@)+", list, cell.content)
							# Enforce left alignment if cell contains a list
							cell.alignment = "align=\"left\""
						#else:
@@ -880,7 +944,7 @@ def generate_html_table_with_spans(pandoc_table):
				else:
					#Prepare content, in case there's a list
					#print(cell.content)
					if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content):  # Update cell in new row
					if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content):  # Update cell in new row
						#print("MATCHING")
						#print(cell.content)
						list = "<ul>"
@@ -888,7 +952,7 @@ def generate_html_table_with_spans(pandoc_table):
						for match in matches:
							list += "<li>" + match[1] + "</li>"
						list += "</ul>"
						cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content)
						cell.content = re.sub(r"\s*([-*+]|\s*\d+\.)\s+((?:(?!@).)+@)+",list, cell.content)
						# Enforce left alignment if cell contains a list
						cell.alignment = "align=\"left\""
					#else:
@@ -901,6 +965,52 @@ def generate_html_table_with_spans(pandoc_table):
		html += "    </tbody>\n"
		html += "</table>"
		return html
	except Exception as e:
		logging.error("Grid table could not be generated")
		debug_text = "<br>".join(debug_output)  # Now all items are strings
		return f"HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE.<br><pre>{debug_text}</pre>"

def check_delimiter_alignment(line: str, delimiter_positions: list[int], delimiters: str = "|+") -> bool:
    """
    Check if delimiters in a row align with expected positions.

    The character at index 0 is never compared against
    ``delimiter_positions``; it only has to be a valid opening delimiter
    for the kind of line being checked. Every other delimiter found must sit
    at an index listed in ``delimiter_positions``, and the last expected
    position must itself be occupied by a delimiter.

    Three kinds of line are distinguished:

    * full separator (contains ``+`` but no ``|``): only ``+`` is searched
      and the line must start with ``+``;
    * data line (contains ``|`` but no ``+``): only ``|`` is searched and
      the line must start with ``|``;
    * anything else (partial separator): every character in ``delimiters``
      is searched and the line must start with ``+`` or ``|``.

    Debug information is emitted through ``print`` for each call.

    Args:
        line: The line of text to check
        delimiter_positions: List of expected positions (based on + characters)
        delimiters: String containing valid delimiter characters (default: "|+").
            Only used for lines that are neither pure ``+`` nor pure ``|`` lines.

    Returns:
        bool: True if delimiters align correctly, False otherwise. False is
        also returned for an empty ``line``, an empty ``delimiter_positions``
        or a line with no delimiter after index 0.
    """
    if not line or not delimiter_positions:
        return False

    print(f"\nChecking line: '{line}'")
    print(f"Expected delimiter positions: {delimiter_positions}")

    has_plus = '+' in line
    has_pipe = '|' in line

    if has_plus and not has_pipe:
        # For full separator lines (only +)
        searched, openers = '+', ('+',)
    elif has_pipe and not has_plus:
        # For data lines (only |)
        searched, openers = '|', ('|',)
    else:
        # For partial separators (mix of + and |)
        searched, openers = delimiters, ('+', '|')

    # Index 0 is skipped on purpose: the opening delimiter is validated by
    # the startswith() check below, not by its position.
    current_positions = [i for i, char in enumerate(line) if i != 0 and char in searched]

    if has_plus and not has_pipe:
        print(f"Full separator line - Found + at positions: {current_positions}")
    elif has_pipe and not has_plus:
        print(f"Data line - Found | at positions: {current_positions}")
    else:
        print(f"Partial separator - Found delimiters at positions: {current_positions}")
        print(f"Characters at those positions: {[line[pos] for pos in current_positions]}")

    # These checks do not depend on the individual position, so they are
    # evaluated once up front. Keeping them inside all(...) made them
    # vacuously true when no delimiter was found at all.
    if not current_positions or not line.startswith(openers):
        return False
    if delimiter_positions[-1] not in current_positions:
        return False

    expected = set(delimiter_positions)
    return all(pos in expected for pos in current_positions)

def analyseMarkdown(filename:str) -> Document:
	"""	Analyse the markdown file and split it into clauses.