Improvements for grid tables conversion (944ea98e) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

toMkdocs/toMkdocs.py

+141 −31

Original line number	Diff line number	Diff line
		@@ -486,6 +486,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
		self.list_flag = False

		def set_alignment(self):
		if has_header:
		header_delimiter_index = 0
		while header_delimiter_index in range(len(default_alignments)) and self.position > header_delimiter_positions[header_delimiter_index]:
		header_delimiter_index += 1
		@@ -497,7 +498,19 @@ def parse_pandoc_table_with_spans(pandoc_table):
		header_delimiter_index += 1
		else:
		raise ValueError("Invalid table formatting")

		else:
		body_delimiter_index = 0
		while body_delimiter_index in range(len(default_alignments)) and self.position > \
		delimiter_positions[body_delimiter_index]:
		body_delimiter_index += 1
		if body_delimiter_index in range(len(default_alignments)):
		if self.position < delimiter_positions[body_delimiter_index]:
		self.alignment = default_alignments[body_delimiter_index]
		elif self.position == delimiter_positions[body_delimiter_index]:
		self.alignment = default_alignments[body_delimiter_index]
		body_delimiter_index += 1
		else:
		raise ValueError("Invalid table formatting")
		class Row():
		""" Represents a row in the markdown file. """
		cells:list[Cell] = []
		@@ -534,12 +547,15 @@ def parse_pandoc_table_with_spans(pandoc_table):
		if content.strip().startswith("- "): # List
		cell.list_flag = True
		#print(content)
		cell.content = content.strip() + "\n" # Add newline to know when the list element ends
		content = re.sub(r'\\\s*$', "\n", content.strip())
		cell.content = content + "@" # Add list element end mark to know when the list element ends
		elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element
		cell.content += content.strip() + "\n"
		content = re.sub(r'\\\s*$', "\n", content.strip())
		cell.content += content + "@" #add the list element end mark
		elif content.strip == "": # separation between list and other paragraph
		cell.list_flag = False
		cell.content += "\n" #if not cell['content'].endswith("\n") else ""
		#if cell.list_flag:
		# cell.list_flag = False
		cell.content += "\n" if not cell['content'].endswith("\n") else ""
		else:
		cell.content = re.sub(r'\\\s*$', "\n", content.strip())
		else:
		@@ -548,12 +564,16 @@ def parse_pandoc_table_with_spans(pandoc_table):
		cell.content += "\n"
		#cell['content'] = cell['content'].strip("\n")
		cell.list_flag = True
		cell.content += content.strip() + "\n" # Add newline to know when the list element ends
		content = re.sub(r'\\\s*$', "\n", content.strip())
		cell.content += content + "@" # Add list element end mark to know when the list element ends
		elif cell.list_flag and content.strip() != "": # any other content when handling list is concatenated to the last list element
		cell.content = cell.content.strip("\n")
		cell.content += " " + content.strip() + "\n"
		cell.content = cell.content.strip("@") #remove list element end mark
		content = re.sub(r'\\\s*$', "\n", content.strip())
		cell.content += " " + content + "@" #add list element end mark
		elif content.strip() == "": # separation between list and other paragraph
		if cell.list_flag:
		cell.list_flag = False
		cell.content += "\n\n" #end list by \n
		#content = re.sub(r'\\\s*$', "\n", content.strip())
		cell.content += "\n" if not cell.content.endswith("\n") else ""
		else:
		@@ -604,6 +624,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
		delimiter_positions.append(min(del_positions) if del_positions else -1)
		has_header = False
		header_delimiter_positions = []
		header_rows = []
		for index in separator_indices:
		if _matchGridTableHeaderSeparator.match(lines[index]):
		has_header = True
		@@ -624,6 +645,18 @@ def parse_pandoc_table_with_spans(pandoc_table):
		del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]]
		header_delimiter_positions.append(min(del_positions) if del_positions else -1)

		if not has_header:
		#Set default alignments from the first separator
		parts = re.split(r"\+", lines[0].strip("+"))
		default_alignments = []
		# Calculate default alignments and positions of delimiters
		for part_index in range(len(parts)):
		if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
		default_alignments.append("align=\"left\"")
		elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
		default_alignments.append("align=\"right\"")
		else:
		default_alignments.append("align=\"center\"")
		data_rows = []
		for row in range(len(separator_indices) - 1):
		rows = []
		@@ -636,6 +669,10 @@ def parse_pandoc_table_with_spans(pandoc_table):
		for line in row_lines:
		if is_separator(line) and not in_data_row:
		in_data_row = True
		# Add delimiter alignment check for separator lines
		if not check_delimiter_alignment(line, delimiter_positions):
		raise ValueError(f"Misaligned delimiters in separator row: {line}")

		parts = re.split(r"\s\+\s", line.strip("+"))
		delimiter_index = 0
		# Determine the alignment of the cell. To replicate Pandoc's behaviour, alignment colons are not supported on separator lines (only on the header separator)
		@@ -667,7 +704,11 @@ def parse_pandoc_table_with_spans(pandoc_table):
		elif in_data_row:
		# Regular data row or partial separator
		if _matchGridTableBodySeparator.match(line): # Partial separator
		cells_content = re.split(r"[\\|\+]", line.strip("\|").strip("+")) # (?<!\\)[\\|\+]
		# Add delimiter alignment check for partial separators
		if not check_delimiter_alignment(line, delimiter_positions):
		raise ValueError(f"Misaligned delimiters in partial separator: {line}")

		cells_content = re.split(r"[\\|\+]", line.strip("\|").strip("+"))
		#Add another row, set delimiters for each cell
		rows.append(Row(number_of_columns))
		aux_delimiter_index = 0
		@@ -717,7 +758,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
		else:
		raise ValueError("More cells than columns found")
		else: # Data row
		cells_content = re.split(r"\s\\|\s", line.strip("\|"))
		cells_content = line.strip()
		cells_content = re.split(r"\\|", line.strip("\|"))

		# Add delimiter alignment check
		if not check_delimiter_alignment(line, delimiter_positions):
		raise ValueError(f"Misaligned delimiters in row: {line}")

		column_index = 0
		if len(cells_content) < number_of_columns: # Colspan: Positions of \| with respect to + need to be determined
		for i in range(len(cells_content)):
		@@ -744,6 +791,10 @@ def parse_pandoc_table_with_spans(pandoc_table):
		elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows
		for header_row in rows:
		header_rows.append(header_row.cells)
		else:
		#only body
		for body_row in rows:
		data_rows.append(body_row.cells)

		#print(header_rows)
		#print(data_rows)
		@@ -821,19 +872,32 @@ def parse_pandoc_table_with_spans(pandoc_table):

		return header_rows, data_rows

		def generate_html_table_with_spans(pandoc_table):
		def generate_html_table_with_spans(pandoc_table: str) -> str:
		"""
		Generate an HTML table from a Pandoc-style grid table with row and column spans.

		:param pandoc_table: String of the Pandoc-style grid table.
		:return: HTML string.
		Args:
		pandoc_table (str): String of the Pandoc-style grid table.

		Returns:
		str: Generated HTML table markup, or error message if generation fails.
		"""
		debug_output = []
		def debug_print(msg):
		debug_output.append(str(msg)) # Convert message to string

		try:
		# Redirect print statements to our debug collector
		global print
		original_print = print
		print = debug_print

		grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table)
		except:
		logging.ERROR("Grid table could not be generated")
		return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS"
		else:

		# Restore original print
		print = original_print

		# Generate table HTML...
		html = "<table>\n"
		has_header = False

		@@ -851,7 +915,7 @@ def generate_html_table_with_spans(pandoc_table):
		else:
		# Prepare content, in case there's a list
		#print(cell.content)
		if matches := re.findall(r"\s([-+]\|\s*\d+\.)\s+([^<]+)<br \/>",
		if matches := re.findall(r"\s([-+]\|\s*\d+\.)\s+((?:(?!@).)+)@",
		cell.content): # Update cell in new row
		#print("MATCHING")
		list = "<ul>"
		@@ -859,7 +923,7 @@ def generate_html_table_with_spans(pandoc_table):
		for match in matches:
		list += "<li>" + match[1] + "</li>"
		list += "</ul>"
		cell.content = re.sub(r"(\s([-+]\|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content)
		cell.content = re.sub(r"(\s([-+]\|\s*\d+\.)\s+(?:(?!@).)+@)+", list, cell.content)
		# Enforce left alignment if cell contains a list
		cell.alignment = "align=\"left\""
		#else:
		@@ -880,7 +944,7 @@ def generate_html_table_with_spans(pandoc_table):
		else:
		#Prepare content, in case there's a list
		#print(cell.content)
		if matches := re.findall(r"\s([-+]\|\s*\d+\.)\s+([^<]+)<br \/>", cell.content): # Update cell in new row
		if matches := re.findall(r"\s([-+]\|\s*\d+\.)\s+((?:(?!@).)+)@", cell.content): # Update cell in new row
		#print("MATCHING")
		#print(cell.content)
		list = "<ul>"
		@@ -888,7 +952,7 @@ def generate_html_table_with_spans(pandoc_table):
		for match in matches:
		list += "<li>" + match[1] + "</li>"
		list += "</ul>"
		cell.content = re.sub(r"(\s([-+]\|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content)
		cell.content = re.sub(r"\s([-+]\|\s*\d+\.)\s+((?:(?!@).)+@)+",list, cell.content)
		# Enforce left alignment if cell contains a list
		cell.alignment = "align=\"left\""
		#else:
		@@ -901,6 +965,52 @@ def generate_html_table_with_spans(pandoc_table):
		html += " </tbody>\n"
		html += "</table>"
		return html
		except Exception as e:
		logging.error("Grid table could not be generated")
		debug_text = "<br>".join(debug_output) # Now all items are strings
		return f"HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE.<br><pre>{debug_text}</pre>"

		def check_delimiter_alignment(line: str, delimiter_positions: list[int], delimiters: str = "|+") -> bool:
		    """
		    Check if delimiters in a row align with expected positions.

		    The line is classified by the delimiter characters it contains:

		    * only ``+``  -> full separator line, ``+`` characters are checked
		    * only ``|``  -> data line, ``|`` characters are checked
		    * otherwise   -> partial separator, every character in ``delimiters``
		      is checked

		    Column 0 is never counted as a delimiter position; it is only required
		    to hold a leading delimiter character.

		    Args:
		        line: The line of text to check
		        delimiter_positions: List of expected positions (based on + characters)
		        delimiters: Characters counted as delimiters on partial separator
		            lines (default: "|+")

		    Returns:
		        bool: True if the line starts with a delimiter, the last expected
		        position holds a delimiter, and every delimiter found after column 0
		        sits on an expected position. False otherwise, including when
		        ``line`` or ``delimiter_positions`` is empty or when no delimiter is
		        found after column 0.
		    """
		    if not line or not delimiter_positions:
		        return False

		    # NOTE: these prints are kept on purpose; the caller temporarily rebinds
		    # ``print`` to collect them as debug output.
		    print(f"\nChecking line: '{line}'")
		    print(f"Expected delimiter positions: {delimiter_positions}")

		    has_plus = "+" in line
		    has_pipe = "|" in line

		    if has_plus and not has_pipe:
		        # Full separator line (only +)
		        leading = ("+",)
		        current_positions = [i for i, char in enumerate(line) if char == "+" and i != 0]
		        print(f"Full separator line - Found + at positions: {current_positions}")
		    elif has_pipe and not has_plus:
		        # Data line (only |)
		        leading = ("|",)
		        current_positions = [i for i, char in enumerate(line) if char == "|" and i != 0]
		        print(f"Data line - Found | at positions: {current_positions}")
		    else:
		        # Partial separator (mix of + and |)
		        leading = ("+", "|")
		        current_positions = [i for i, char in enumerate(line) if char in delimiters and i != 0]
		        print(f"Partial separator - Found delimiters at positions: {current_positions}")
		        print(f"Characters at those positions: {[line[pos] for pos in current_positions]}")

		    # These two conditions do not depend on the individual position, so they
		    # are checked once up front. This also rejects lines with no delimiter
		    # after column 0, which all() over an empty sequence would accept.
		    if not line.startswith(leading):
		        return False
		    if delimiter_positions[-1] not in current_positions:
		        return False

		    expected = set(delimiter_positions)
		    return all(pos in expected for pos in current_positions)

		def analyseMarkdown(filename:str) -> Document:
		""" Analyse the markdown file and split it into clauses.