Using class Cell and Row to handle grid tables conversion to html (708d9fb8) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

toMkdocs/toMkdocs.py

+164 −117

Original line number	Diff line number	Diff line
		@@ -420,6 +420,7 @@ _matchStandAloneImage = re.compile(r'^\s!\[[^\]]\]$([^)])$\s', re.IGNORECA
		# Precompiled matchers for Markdown table lines.
		# NOTE(review): none of these patterns contains a literal letter, so
		# re.IGNORECASE has no effect on what they match.
		# Presumably a pipe-table row and a pipe-table separator row (inferred from
		# the names only — confirm against the callers).
		_matchTable = re.compile(r'^\s\\|.\\|\s*$', re.IGNORECASE)
		_matchTableSeparator = re.compile(r'^\s\\|([-: ]+\\|)+\s$', re.IGNORECASE)
		# Presumably the first line of a grid table (inferred from the name — confirm).
		_matchGridTable = re.compile(r'^\s\+-.\+\s$', re.IGNORECASE)
		# A line built only from '+' joined runs of '-', ':' or '='.
		_matchGridTableSeparator = re.compile(r'\s\+([-:=]+\+)+\s$', re.IGNORECASE)
		# A line containing '+' joined runs of '-' / ':'; the grid-table parser uses
		# it to recognise a partial separator inside a data row.
		_matchGridTableBodySeparator = re.compile(r'.\+([:-]+\+)+.$', re.IGNORECASE)
		# Same shape as the body separator, but built from '=' / ':' runs.
		_matchGridTableHeaderSeparator = re.compile(r'.\+([=:]+\+)+.$', re.IGNORECASE)
		# Applied to the text of a single cell: matches when the text starts with a
		# run of '-' / ':' that reaches the end of the cell.
		_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
		@@ -463,58 +464,85 @@ def parse_pandoc_table_with_spans(pandoc_table):
		# Split the input into lines
		lines = [line.strip() for line in pandoc_table.strip().split("\n")]

		class Cell:
			""" Represents one cell of a grid table while it is converted to HTML.

				Attributes:
					content: Text collected for the cell so far; `None` until the first
						line of the cell has been handled.
					rowspan: Number of rows the cell covers. A cell whose rowspan or
						colspan is 0 is skipped when the HTML is generated.
					colspan: Number of columns the cell covers.
					colspan_adjusted: Whether the colspan adjustment has already been
						attempted for this cell.
					alignment: Ready-to-use HTML attribute, e.g. `align="center"`.
					position: Position of the cell's delimiter within the table line.
					list_flag: Whether the text currently being collected belongs to a
						list item.
					auxiliar_index: Index of the auxiliary row that receives this
						column's content after a partial separator, or `None`.
			"""

			def __init__(self) -> None:
				# Annotations live here (not in the class body) so that the
				# optional types are never evaluated at runtime.
				self.content: str | None = None
				self.rowspan: int = 0
				self.colspan: int = 0
				self.colspan_adjusted: bool = False
				self.alignment: str = "align=\"center\""
				self.position: int = 0
				self.list_flag: bool = False
				self.auxiliar_index: int | None = None

		class Row:
			""" Represents one row of a grid table as an ordered list of cells.

				Attributes:
					cells: The cells of the row, one `Cell` per column slot.
			"""
			# Declared without a class-level default: a shared `[]` here would be
			# one list object visible through every instance.
			cells: list[Cell]

			def __init__(self, length: int = 1) -> None:
				""" Create a row holding `length` fresh, independent cells. """
				self.cells = [Cell() for _ in range(length)]

		# Detect separator lines by pattern (it does not take into account partial separators)
		def is_separator(line):
			""" Check whether `line` is a full grid-table separator line.

				Args:
					line: One line of the table.

				Returns:
					The `re.Match` object when the line matches, otherwise `None`
					(so the result can be used directly as a truth value).
			"""
			# Reuse the module-level pattern instead of recompiling an identical
			# one (and shadowing its name) on every call.
			return _matchGridTableSeparator.match(line)

		def handling_content(cell, content):
			""" Merge one physical line of cell text into `cell`.

				The first line handled for a cell also sets its rowspan and colspan
				to at least 1. Lines starting with "- " open a list item; while a
				list item is open, following non-blank lines are appended to it and
				a blank line closes it. A trailing backslash on ordinary text is
				turned into a newline.

				Args:
					cell: The cell to update; it is modified in place. Only its
						`content`, `rowspan`, `colspan` and `list_flag` attributes
						are used.
					content: Raw text of the cell taken from the current table line.

				Returns:
					The same `cell` object, for assignment convenience.
			"""
			text = content.strip()
			starts_list_item = text.startswith("- ")

			if cell.content is None:
				# First line of the cell: it now occupies one row and one column.
				cell.rowspan += 1
				cell.colspan += 1
				if starts_list_item:
					cell.list_flag = True
					cell.content = text + "\n"	# Newline marks the end of the list element
				elif text == "":
					# Nothing collected yet and nothing to add: start from an
					# empty string so that later lines can be appended safely.
					cell.list_flag = False
					cell.content = ""
				else:
					cell.content = re.sub(r'\\\s*$', "\n", text)
				return cell

			if starts_list_item:
				if not cell.list_flag:
					# Separate the new list from the preceding paragraph
					cell.content += "\n"
				cell.list_flag = True
				cell.content += text + "\n"	# Newline marks the end of the list element
			elif cell.list_flag and text != "":
				# Continuation line: glue it onto the last list element
				cell.content = cell.content.strip("\n")
				cell.content += " " + text + "\n"
			elif text == "":
				# Blank line: separation between a list and the next paragraph
				cell.list_flag = False
				cell.content += "\n" if not cell.content.endswith("\n") else ""
			else:
				cell.content += " " + re.sub(r'\\\s*$', "\n", text)
			return cell

		def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions):
		for j in range(column_index, number_of_parts):
		delimiter_start = row[j - 1]['position'] if j != 0 else 0
		delimiter_start = row[j - 1].position if j != 0 else 0
		positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "\|+" if delimiter in line[delimiter_start + 1:]]
		position = min(positions) if positions else -1
		if position > delimiter_positions[j]: # Colspan to be increased
		row[i]['colspan'] += 1
		row[i].colspan += 1
		if position == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns
		colspan_allocated = 0
		for cell_index in range(number_of_parts):
		colspan_allocated += row[cell_index]['colspan']
		row[column_index]['colspan'] += number_of_columns - colspan_allocated
		colspan_allocated = row[i].colspan
		#for cell_index in range(number_of_parts):
		# colspan_allocated += row[cell_index].colspan
		row[column_index].colspan += number_of_columns - colspan_allocated - column_index
		elif position < delimiter_positions[j]:
		raise ValueError("Wrong cell formatting")
		else:
		@@ -563,6 +591,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
		data_rows = []
		for row in range(len(separator_indices) - 1):
		table_row = []
		auxiliar_rows = []
		auxiliar_row = []
		use_auxiliar_row = []
		list_flags = []
		@@ -591,65 +620,70 @@ def parse_pandoc_table_with_spans(pandoc_table):
		# else:
		# alignments.append("align=\"center\"")
		header_delimiter_index = 0
		table_row = Row(number_of_columns_row)
		for i in range(number_of_columns_row):
		delimiter_index += len(parts[i]) + 1
		table_row.append({
		"content": None,
		"rowspan": 0,
		"colspan": 0,
		"colspan_adjusted": False,
		"alignment": default_alignments[i] if i == 0 else "align=\"center\"",
		"position": delimiter_index # Position of cell delimiter +
		})
		table_row.cells[i].alignment = default_alignments[i] if i == 0 else "align=\"center\""
		table_row.cells[i].position = delimiter_index # Position of cell delimiter +

		#Set alignment as defined by header separator line
		while header_delimiter_index in range(len(default_alignments)) and table_row[i]['position'] > header_delimiter_positions[header_delimiter_index]:
		while header_delimiter_index in range(len(default_alignments)) and table_row.cells[i].position > header_delimiter_positions[header_delimiter_index]:
		header_delimiter_index += 1
		if header_delimiter_index in range(len(default_alignments)):
		if table_row[i]['position'] < header_delimiter_positions[header_delimiter_index]:
		table_row[i]['alignment'] = default_alignments[header_delimiter_index]
		elif table_row[i]['position'] == header_delimiter_positions[header_delimiter_index]:
		table_row[i]['alignment'] = default_alignments[i]
		if table_row.cells[i].position < header_delimiter_positions[header_delimiter_index]:
		table_row.cells[i].alignment = default_alignments[header_delimiter_index]
		elif table_row.cells[i].position == header_delimiter_positions[header_delimiter_index]:
		table_row.cells[i].alignment = default_alignments[i]
		header_delimiter_index += 1
		else:
		raise ValueError("Invalid table formatting")

		for i in range(number_of_columns):
		auxiliar_row.append({
		"content": None,
		"rowspan": 0,
		"colspan": 0,
		"colspan_adjusted": False,
		"alignment": "align=\"center\"",
		"position": 0
		})
		use_auxiliar_row.append(False)
		list_flags.append(False)
		#auxiliar_row = Row(number_of_columns)
		#for i in range(number_of_columns):
		#auxiliar_row.append(default_cell)
		#use_auxiliar_row.append(False)
		#auxiliar_rows.append({'auxiliar_row':auxiliar_row, 'use_auxiliar':use_auxiliar_row, 'list_flags':list_flags})

		elif in_data_row:
		# Regular data row or partial separator
		if _matchGridTableBodySeparator.match(line): # Partial separator
		has_merged_cells = True
		#Add auxiliar line, set delimiters for each cell
		auxiliar_rows.append(Row(number_of_columns))
		aux_delimiter_index = 0
		for i in range(number_of_columns_row):
		aux_delimiter_index += len(parts[i]) + 1
		auxiliar_rows[-1].cells[i].position = aux_delimiter_index # Position of cell delimiter +

		cells = re.split(r"\s[\\|\+]\s", line.strip("\|").strip("+")) # (?<!\\)[\\|\+]
		if len(cells) <= number_of_columns: # Colspan: Positions of \| with respect to + need to be determined
		for i in range(len(cells)):
		if _matchGridTableBodySeparatorLine.match(cells[i]): # A new row is to be added
		use_auxiliar_row[i] = True
		list_flags[i] = False
		if cells[i].startswith(":") and not cells[i].endswith(":"):
		auxiliar_row[i]['alignment'] = "align=\"left\""
		elif not cells[i].startswith(":") and cells[i].endswith(":"):
		auxiliar_row[i]['alignment'] = "align=\"right\""
		else:
		auxiliar_row[i]['alignment'] = "align=\"center\""
		#auxiliar_rows[-1]['use_auxiliar_row'][i] = True
		auxiliar_rows[-1].cells[i].list_flag = False
		table_row.cells[i].auxiliar_index = len(auxiliar_rows)-1
		#if cells[i].startswith(":") and not cells[i].endswith(":"):
		# auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\""
		#elif not cells[i].startswith(":") and cells[i].endswith(":"):
		# auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"right\""
		#else:
		# auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\""
		else:
		# Handle content of the cell
		list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
		if table_row.cells[i].auxiliar_index is not None: # and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
		auxiliar_rows[table_row.cells[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index][i], cells[i])
		if not auxiliar_rows[table_row.cells[i].auxiliar_index][i].colspan_adjusted:
		auxiliar_rows[table_row.cells[i].auxiliar_index][i].colspan_adjusted = True
		# TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
		auxiliar_rows[table_row.cells[i].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row.cells[i].auxiliar_index], i, len(cells), line, number_of_columns, delimiter_positions)
		else:
		table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
		# Cell which is not separator
		table_row[i]['rowspan'] += 1
		if not table_row[i]['colspan_adjusted']:
		table_row[i]['colspan_adjusted'] = True
		table_row.cells[i].rowspan += 1
		if not table_row.cells[i].colspan_adjusted:
		table_row.cells[i].colspan_adjusted = True
		#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
		table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
		table_row.cells[i] = adjust_colspan(table_row.cells, i, len(cells), line, number_of_columns, delimiter_positions)
		#elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
		# for i in range(len(cells)):
		# if _matchGridTableBodySeparatorLine.match(cells[i]): # Update cell in new row
		@@ -674,30 +708,42 @@ def parse_pandoc_table_with_spans(pandoc_table):
		if len(cells) < number_of_columns: # Colspan: Positions of \| with respect to + need to be determined
		for i in range(len(cells)):
		# Handle content of the cell
		list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
		if not table_row[i]['colspan_adjusted']:
		table_row[i]['colspan_adjusted'] = True
		table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
		if table_row.cells[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
		auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i], cells[i])
		if not auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i].colspan_adjusted:
		auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i].colspan_adjusted = True
		#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
		auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = adjust_colspan(auxiliar_rows[table_row.cells[i].auxiliar_index].cells, i, len(cells), line, number_of_columns, delimiter_positions)
		else:
		table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
		if not table_row.cells[i].colspan_adjusted:
		table_row.cells[i].colspan_adjusted = True
		table_row.cells[i] = adjust_colspan(table_row.cells, i, len(cells), line, number_of_columns, delimiter_positions)
		elif len(cells) == number_of_columns: # Simple row
		for i in range(len(cells)):
		if use_auxiliar_row[i]:
		list_flags[i], auxiliar_row[i] = handling_content(auxiliar_row[i], cells[i],list_flags[i])
		if table_row.cells[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
		auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i], cells[i])
		else:
		# Handle content of the cell
		list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
		table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
		else:
		raise ValueError("More cells than columns found")
		else:
		raise ValueError("No separator line found for row starting")


		if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows
		data_rows.append(table_row)
		data_rows.append(table_row.cells)
		if has_merged_cells:
		data_rows.append(auxiliar_row)
		for row in auxiliar_rows:
		#for i in range(len(row.cells)):
		# print(row.cells[i].content)
		data_rows.append(row.cells)
		elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows
		header_rows.append(table_row)
		header_rows.append(table_row.cells)
		if has_merged_cells:
		header_rows.append(auxiliar_row)
		for row in auxiliar_rows:
		header_rows.append(row.cells)

		#print(header_rows)
		#print(data_rows)
		@@ -711,35 +757,35 @@ def parse_pandoc_table_with_spans(pandoc_table):
		italic = "<i>"
		for row in rows:
		for cell in row:
		if cell['content'] is not None:
		if cell.content is not None:
		# Replacing "<" by <
		cell['content'] = cell['content'].replace("<", "<")
		cell.content = cell.content.replace("<", "<")

		#Bold
		for bold_characters in ["**", "__"]:
		while cell['content'].find(bold_characters) != -1:
		cell['content'] = cell['content'].replace(bold_characters, bold, 1)
		while cell.content.find(bold_characters) != -1:
		cell.content = cell.content.replace(bold_characters, bold, 1)
		if bold == "<strong>":
		bold = "</strong>"
		else:
		bold = "<strong>"
		#Italic
		while cell['content'].find("_") != -1 and cell['content'].find("\_") == -1:
		cell['content'] = cell['content'].rstrip() .replace("_", italic, 1)
		while cell.content.find("_") != -1 and cell.content.find("\_") == -1:
		cell.content = cell.content.rstrip() .replace("_", italic, 1)
		if italic == "<i>":
		italic = "</i>"
		else:
		italic = "<i>"
		while cell['content'].find("\_") != -1:
		cell['content'] = cell['content'].rstrip().replace("\_", "_", 1)
		while cell.content.find("\_") != -1:
		cell.content = cell.content.rstrip().replace("\_", "_", 1)

		# Correct newlines characters
		for row in header_rows:
		for cell in row:
		cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
		cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None
		for row in data_rows:
		for cell in row:
		cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
		cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None

		# Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows
		forward_rowspan = []
		@@ -748,13 +794,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
		forward_rowspan = [0 for _ in range(len(header_rows[row_index]))]
		sum = 0
		for cell_index in range(len(header_rows[row_index])):
		sum += header_rows[row_index][cell_index]['colspan']
		if row_index > 0 and header_rows[row_index][cell_index]['colspan'] == 0:
		sum += header_rows[row_index][cell_index].colspan
		if row_index > 0 and header_rows[row_index][cell_index].colspan == 0:
		if forward_rowspan[cell_index] > 0:
		sum += 1
		forward_rowspan[cell_index] -= 1
		if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index]['rowspan'] > 1:
		forward_rowspan[cell_index] = header_rows[row_index][cell_index]['rowspan'] -1
		if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index].rowspan > 1:
		forward_rowspan[cell_index] = header_rows[row_index][cell_index].rowspan -1
		if not sum == number_of_columns:
		raise ValueError("Grid table not converted properly")
		forward_rowspan = []
		@@ -763,13 +809,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
		forward_rowspan = [0 for _ in range(len(data_rows[row_index]))]
		sum = 0
		for cell_index in range(len(data_rows[row_index])):
		sum += data_rows[row_index][cell_index]['colspan']
		if row_index > 0 and data_rows[row_index][cell_index]['colspan'] == 0:
		sum += data_rows[row_index][cell_index].colspan
		if row_index > 0 and data_rows[row_index][cell_index].colspan == 0:
		if forward_rowspan[cell_index] > 0:
		sum += 1
		forward_rowspan[cell_index] -= 1
		if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index]['rowspan'] > 1:
		forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1
		if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index].rowspan > 1:
		forward_rowspan[cell_index] = data_rows[row_index][cell_index].rowspan - 1
		if not sum == number_of_columns:
		raise ValueError("Grid table not converted properly")

		@@ -789,35 +835,35 @@ def generate_html_table_with_spans(pandoc_table):

		for row in grid_header:
		for cell in row:
		if cell['rowspan'] != 0 and cell['colspan'] != 0:
		if cell.rowspan != 0 and cell.colspan != 0:
		has_header = True
		if has_header:
		html += " <thead>\n"
		for row in grid_header:
		html += " <tr>\n"
		for cell in row:
		if cell['rowspan'] == 0 or cell['colspan'] == 0:
		if cell.rowspan == 0 or cell.colspan == 0:
		continue
		else:
		# Prepare content, in case there's a list
		#print(cell['content'])
		#print(cell.content)
		if matches := re.findall(r"\s([-+]\|\s*\d+\.)\s+([^<]+)<br \/>",
		cell['content']): # Update cell in new row
		cell.content): # Update cell in new row
		#print("MATCHING")
		list = "<ul>"
		# Build list the matches
		for match in matches:
		list += "<li>" + match[1] + "</li>"
		list += "</ul>"
		cell['content'] = re.sub(r"(\s([-+]\|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
		cell.content = re.sub(r"(\s([-+]\|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content)
		# Enforce left alignment if cell contains a list
		cell['alignment'] = "align=\"left\""
		cell.alignment = "align=\"left\""
		#else:
		# print("NOT MATCHING")

		rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
		colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
		html += f" <th{rowspan}{colspan} {cell['alignment']}>{cell['content']}</th>\n"
		rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else ""
		colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else ""
		html += f" <th{rowspan}{colspan} {cell.alignment}>{cell.content}</th>\n"
		html += " </tr>\n"
		html += " </thead>\n"

		@@ -825,26 +871,27 @@ def generate_html_table_with_spans(pandoc_table):
		for row in grid_body:
		html += " <tr>\n"
		for cell in row:
		if cell['rowspan'] == 0 or cell['colspan'] == 0:
		if cell.rowspan == 0 or cell.colspan == 0:
		continue
		else:
		#Prepare content, in case there's a list
		#print(cell['content'])
		if matches := re.findall(r"\s([-+]\|\s*\d+\.)\s+([^<]+)<br \/>", cell['content']): # Update cell in new row
		#print(cell.content)
		if matches := re.findall(r"\s([-+]\|\s*\d+\.)\s+([^<]+)<br \/>", cell.content): # Update cell in new row
		#print("MATCHING")
		#print(cell.content)
		list = "<ul>"
		# Build list the matches
		for match in matches:
		list += "<li>" + match[1] + "</li>"
		list += "</ul>"
		cell['content'] = re.sub(r"(\s([-+]\|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
		cell.content = re.sub(r"(\s([-+]\|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content)
		# Enforce left alignment if cell contains a list
		cell['alignment'] = "align=\"left\""
		cell.alignment = "align=\"left\""
		#else:
		#print("NOT MATCHING")
		rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
		colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
		html += f" <td{rowspan}{colspan} {cell['alignment']}>{cell['content']}</td>\n"
		rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else ""
		colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else ""
		html += f" <td{rowspan}{colspan} {cell.alignment}>{cell.content}</td>\n"
		html += " </tr>\n"

		html += " </tbody>\n"

Admin message