Commit 708d9fb8 authored by Miguel Angel Reina Ortega
Browse files

Using class Cell and Row to handle grid tables conversion to html

parent 2451610e
Loading
Loading
Loading
Loading
+164 −117
Original line number Diff line number Diff line
@@ -420,6 +420,7 @@ _matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECA
_matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE)
_matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)
_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
@@ -463,58 +464,85 @@ def parse_pandoc_table_with_spans(pandoc_table):
	# Split the input into lines
	lines = [line.strip() for line in pandoc_table.strip().split("\n")]

	class Cell:
		"""One cell of a grid table while the table is being parsed.

		A freshly created cell is empty: it has no content, spans zero rows
		and zero columns, and is centre-aligned.
		"""
		# Accumulated text of the cell; None until the first content line is handled.
		content: "str | None"
		# Number of rows / columns the cell spans; both start at 0.
		rowspan: int
		colspan: int
		# Set to True by the parser once the colspan adjustment has been attempted.
		colspan_adjusted: bool
		# Ready-to-emit HTML attribute, e.g. 'align="center"'.
		alignment: str
		# Position of the cell's closing delimiter within the source line.
		position: int
		# True while the lines being accumulated belong to a "- " list item.
		list_flag: bool
		# Index of the auxiliary row associated with this cell, or None if there is none.
		auxiliar_index: "int | None"

		def __init__(self):
			self.content = None
			self.rowspan = 0
			self.colspan = 0
			self.colspan_adjusted = False
			self.alignment = "align=\"center\""
			self.position = 0
			self.list_flag = False
			self.auxiliar_index = None

	class Row:
		"""A row of the grid table: an ordered list of Cell objects."""
		# Declared without a value on purpose: a class-level ``[]`` would be a
		# single list shared by the class, while every instance gets its own
		# list in __init__ anyway.
		cells: list[Cell]

		def __init__(self, length: int = 1) -> None:
			"""Create a row made of ``length`` fresh, independent cells."""
			self.cells = [Cell() for _ in range(length)]

	# Detect separator lines by pattern (partial separators are not taken into account)
	def is_separator(line):
		"""Return a match object if ``line`` is a full grid-table separator, else None.

		A full separator consists only of ``+`` delimiters joined by runs of
		``-``, ``:`` or ``=`` (optionally surrounded by whitespace).

		Uses the module-level ``_matchGridTableSeparator`` pattern instead of
		compiling an identical copy into a local that shadows it on every call.
		"""
		return _matchGridTableSeparator.match(line)

	def handling_content(cell, content):
		"""Merge one physical line of cell text into ``cell`` and return the cell.

		:param cell: cell object exposing ``content``, ``rowspan``, ``colspan``
			and ``list_flag``; it is updated in place.
		:param content: raw text found between two column delimiters on the
			current line.
		:return: the same ``cell`` object.

		Rules applied to the stripped text:

		* a line starting with ``"- "`` starts a list item and is stored with a
		  trailing newline that marks the end of the item;
		* while a list item is open, a non-empty line is appended to that item;
		* an empty line closes the open list item / paragraph;
		* any other line is appended after a space, with a trailing backslash
		  turned into a newline.

		The first line handled for a cell also sets its row and column span to 1.
		"""
		# Only the ``content`` argument is inspected; the function does not
		# rely on loop variables of the enclosing scope.
		text = content.strip()
		is_list_item = text.startswith("- ")

		if cell.content is None:
			# First line seen for this cell: it now occupies one row and one column.
			cell.rowspan += 1
			cell.colspan += 1
			if is_list_item:
				cell.list_flag = True
				cell.content = text + "\n"  # Newline marks the end of the list element
			elif text == "":
				# Blank first line: nothing to append to yet, so start from an
				# empty string (appending to None would raise TypeError).
				cell.list_flag = False
				cell.content = ""
			else:
				cell.content = re.sub(r'\\\s*$', "\n", text)
			return cell

		if is_list_item:
			if not cell.list_flag:
				# Separate the new list from the preceding paragraph.
				cell.content += "\n"
			cell.list_flag = True
			cell.content += text + "\n"  # Newline marks the end of the list element
		elif text == "":
			# Blank line: separation between a list and the next paragraph.
			cell.list_flag = False
			if not cell.content.endswith("\n"):
				cell.content += "\n"
		elif cell.list_flag:
			# Continuation line of the list item that is still open.
			cell.content = cell.content.strip("\n") + " " + text + "\n"
		else:
			cell.content += " " + re.sub(r'\\\s*$', "\n", text)
		return cell

	def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions):
		for j in range(column_index, number_of_parts):
			delimiter_start = row[j - 1]['position'] if j != 0 else 0
			delimiter_start = row[j - 1].position if j != 0 else 0
			positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]]
			position = min(positions) if positions else -1
			if position > delimiter_positions[j]:  # Colspan to be increased
				row[i]['colspan'] += 1
				row[i].colspan += 1
				if position == delimiter_positions[len(delimiter_positions) - 1]:  # last cell in row, adjust colspan to get max number columns
					colspan_allocated = 0
					for cell_index in range(number_of_parts):
						colspan_allocated += row[cell_index]['colspan']
					row[column_index]['colspan'] += number_of_columns - colspan_allocated
					colspan_allocated = row[i].colspan
					#for cell_index in range(number_of_parts):
					#	colspan_allocated += row[cell_index].colspan
					row[column_index].colspan += number_of_columns - colspan_allocated - column_index
			elif position < delimiter_positions[j]:
				raise ValueError("Wrong cell formatting")
			else:
@@ -563,6 +591,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
	data_rows = []
	for row in range(len(separator_indices) - 1):
		table_row = []
		auxiliar_rows = []
		auxiliar_row = []
		use_auxiliar_row = []
		list_flags = []
@@ -591,65 +620,70 @@ def parse_pandoc_table_with_spans(pandoc_table):
					#	else:
					#		alignments.append("align=\"center\"")
					header_delimiter_index = 0
					table_row = Row(number_of_columns_row)
					for i in range(number_of_columns_row):
						delimiter_index += len(parts[i]) + 1
						table_row.append({
							"content": None,
							"rowspan": 0,
							"colspan": 0,
							"colspan_adjusted": False,
							"alignment": default_alignments[i] if i == 0 else "align=\"center\"",
							"position": delimiter_index # Position of cell delimiter +
						})
						table_row.cells[i].alignment = default_alignments[i] if i == 0 else "align=\"center\""
						table_row.cells[i].position = delimiter_index # Position of cell delimiter +

						#Set alignment as defined by header separator line
						while header_delimiter_index in range(len(default_alignments)) and table_row[i]['position'] > header_delimiter_positions[header_delimiter_index]:
						while header_delimiter_index in range(len(default_alignments)) and table_row.cells[i].position > header_delimiter_positions[header_delimiter_index]:
							header_delimiter_index += 1
						if header_delimiter_index in range(len(default_alignments)):
							if table_row[i]['position'] < header_delimiter_positions[header_delimiter_index]:
								table_row[i]['alignment'] = default_alignments[header_delimiter_index]
							elif table_row[i]['position'] == header_delimiter_positions[header_delimiter_index]:
								table_row[i]['alignment'] = default_alignments[i]
							if table_row.cells[i].position < header_delimiter_positions[header_delimiter_index]:
								table_row.cells[i].alignment = default_alignments[header_delimiter_index]
							elif table_row.cells[i].position == header_delimiter_positions[header_delimiter_index]:
								table_row.cells[i].alignment = default_alignments[i]
								header_delimiter_index += 1
						else:
							raise ValueError("Invalid table formatting")

					for i in range(number_of_columns):
						auxiliar_row.append({
							"content": None,
							"rowspan": 0,
							"colspan": 0,
							"colspan_adjusted": False,
							"alignment": "align=\"center\"",
							"position": 0
						})
						use_auxiliar_row.append(False)
						list_flags.append(False)
					#auxiliar_row = Row(number_of_columns)
					#for i in range(number_of_columns):
						#auxiliar_row.append(default_cell)
						#use_auxiliar_row.append(False)
						#auxiliar_rows.append({'auxiliar_row':auxiliar_row, 'use_auxiliar':use_auxiliar_row, 'list_flags':list_flags})

				elif in_data_row:
					# Regular data row or partial separator
					if _matchGridTableBodySeparator.match(line): # Partial separator
						has_merged_cells = True
						#Add auxiliar line, set delimiters for each cell
						auxiliar_rows.append(Row(number_of_columns))
						aux_delimiter_index = 0
						for i in range(number_of_columns_row):
							aux_delimiter_index += len(parts[i]) + 1
							auxiliar_rows[-1].cells[i].position = aux_delimiter_index  # Position of cell delimiter +

						cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+")) # (?<!\\)[\|\+]
						if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							for i in range(len(cells)):
								if _matchGridTableBodySeparatorLine.match(cells[i]):  # A new row is to be added
									use_auxiliar_row[i] = True
									list_flags[i] = False
									if cells[i].startswith(":") and not cells[i].endswith(":"):
										auxiliar_row[i]['alignment'] = "align=\"left\""
									elif not cells[i].startswith(":") and  cells[i].endswith(":"):
										auxiliar_row[i]['alignment'] = "align=\"right\""
									else:
										auxiliar_row[i]['alignment'] = "align=\"center\""
									#auxiliar_rows[-1]['use_auxiliar_row'][i] = True
									auxiliar_rows[-1].cells[i].list_flag = False
									table_row.cells[i].auxiliar_index = len(auxiliar_rows)-1
									#if cells[i].startswith(":") and not cells[i].endswith(":"):
									#	auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\""
									#elif not cells[i].startswith(":") and  cells[i].endswith(":"):
									#	auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"right\""
									#else:
									#	auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\""
								else:
									# Handle content of the cell
									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
									if table_row.cells[i].auxiliar_index is not None: # and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
										auxiliar_rows[table_row.cells[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index][i], cells[i])
										if not auxiliar_rows[table_row.cells[i].auxiliar_index][i].colspan_adjusted:
											auxiliar_rows[table_row.cells[i].auxiliar_index][i].colspan_adjusted = True
											# TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
											auxiliar_rows[table_row.cells[i].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row.cells[i].auxiliar_index], i, len(cells), line, number_of_columns, delimiter_positions)
									else:
										table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
										# Cell which is not separator
									table_row[i]['rowspan'] += 1
									if not table_row[i]['colspan_adjusted']:
										table_row[i]['colspan_adjusted'] = True
										table_row.cells[i].rowspan += 1
										if not table_row.cells[i].colspan_adjusted:
											table_row.cells[i].colspan_adjusted = True
											#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
										table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
											table_row.cells[i] = adjust_colspan(table_row.cells, i, len(cells), line, number_of_columns, delimiter_positions)
						#elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
						#	for i in range(len(cells)):
						#		if _matchGridTableBodySeparatorLine.match(cells[i]):  # Update cell in new row
@@ -674,30 +708,42 @@ def parse_pandoc_table_with_spans(pandoc_table):
						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							for i in range(len(cells)):
								# Handle content of the cell
								list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
								if not table_row[i]['colspan_adjusted']:
									table_row[i]['colspan_adjusted'] = True
									table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
								if table_row.cells[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
									auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i], cells[i])
									if not auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i].colspan_adjusted:
										auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i].colspan_adjusted = True
										#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
										auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = adjust_colspan(auxiliar_rows[table_row.cells[i].auxiliar_index].cells, i, len(cells), line, number_of_columns, delimiter_positions)
								else:
									table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
									if not table_row.cells[i].colspan_adjusted:
										table_row.cells[i].colspan_adjusted = True
										table_row.cells[i] = adjust_colspan(table_row.cells, i, len(cells), line, number_of_columns, delimiter_positions)
						elif len(cells) == number_of_columns: # Simple row
							for i in range(len(cells)):
								if use_auxiliar_row[i]:
									list_flags[i], auxiliar_row[i] = handling_content(auxiliar_row[i], cells[i],list_flags[i])
								if table_row.cells[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
									auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i], cells[i])
								else:
									# Handle content of the cell
									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
									table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
						else:
							raise ValueError("More cells than columns found")
				else:
					raise ValueError("No separator line found for row starting")


			if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows
				data_rows.append(table_row)
				data_rows.append(table_row.cells)
				if has_merged_cells:
					data_rows.append(auxiliar_row)
					for row in auxiliar_rows:
						#for i in range(len(row.cells)):
						#	print(row.cells[i].content)
						data_rows.append(row.cells)
			elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows
				header_rows.append(table_row)
				header_rows.append(table_row.cells)
				if has_merged_cells:
					header_rows.append(auxiliar_row)
					for row in auxiliar_rows:
						header_rows.append(row.cells)

	#print(header_rows)
	#print(data_rows)
@@ -711,35 +757,35 @@ def parse_pandoc_table_with_spans(pandoc_table):
		italic = "<i>"
		for row in rows:
			for cell in row:
				if cell['content'] is not None:
				if cell.content is not None:
					# Replacing "<" by &lt;
					cell['content'] = cell['content'].replace("<", "&lt;")
					cell.content = cell.content.replace("<", "&lt;")

					#Bold
					for bold_characters in ["**", "__"]:
						while cell['content'].find(bold_characters) != -1:
							cell['content'] = cell['content'].replace(bold_characters, bold, 1)
						while cell.content.find(bold_characters) != -1:
							cell.content = cell.content.replace(bold_characters, bold, 1)
							if bold == "<strong>":
								bold = "</strong>"
							else:
								bold = "<strong>"
					#Italic
					while cell['content'].find("_") != -1 and cell['content'].find("\_") == -1:
						cell['content'] = cell['content'].rstrip() .replace("_", italic, 1)
					while cell.content.find("_") != -1 and cell.content.find("\_") == -1:
						cell.content = cell.content.rstrip() .replace("_", italic, 1)
						if italic == "<i>":
							italic = "</i>"
						else:
							italic = "<i>"
					while cell['content'].find("\_") != -1:
						cell['content'] = cell['content'].rstrip().replace("\_", "_", 1)
					while cell.content.find("\_") != -1:
						cell.content = cell.content.rstrip().replace("\_", "_", 1)

	# Correct newlines characters
	for row in header_rows:
		for cell in row:
			cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
			cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None
	for row in data_rows:
		for cell in row:
			cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
			cell.content = cell.content.replace("\n", "<br />") if cell.content is not None else None

	# Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows
	forward_rowspan = []
@@ -748,13 +794,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
			forward_rowspan = [0 for _ in range(len(header_rows[row_index]))]
		sum = 0
		for cell_index in range(len(header_rows[row_index])):
			sum += header_rows[row_index][cell_index]['colspan']
			if row_index > 0 and header_rows[row_index][cell_index]['colspan'] == 0:
			sum += header_rows[row_index][cell_index].colspan
			if row_index > 0 and header_rows[row_index][cell_index].colspan == 0:
				if forward_rowspan[cell_index] > 0:
					sum += 1
				forward_rowspan[cell_index] -= 1
			if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index]['rowspan'] > 1:
				forward_rowspan[cell_index] = header_rows[row_index][cell_index]['rowspan'] -1
			if forward_rowspan[cell_index] == 0 and header_rows[row_index][cell_index].rowspan > 1:
				forward_rowspan[cell_index] = header_rows[row_index][cell_index].rowspan -1
		if not sum == number_of_columns:
			raise ValueError("Grid table not converted properly")
	forward_rowspan = []
@@ -763,13 +809,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
			forward_rowspan = [0 for _ in range(len(data_rows[row_index]))]
		sum = 0
		for cell_index in range(len(data_rows[row_index])):
			sum += data_rows[row_index][cell_index]['colspan']
			if row_index > 0 and data_rows[row_index][cell_index]['colspan'] == 0:
			sum += data_rows[row_index][cell_index].colspan
			if row_index > 0 and data_rows[row_index][cell_index].colspan == 0:
				if forward_rowspan[cell_index] > 0:
					sum += 1
				forward_rowspan[cell_index] -= 1
			if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index]['rowspan'] > 1:
				forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1
			if forward_rowspan[cell_index] == 0 and data_rows[row_index][cell_index].rowspan > 1:
				forward_rowspan[cell_index] = data_rows[row_index][cell_index].rowspan - 1
		if not sum == number_of_columns:
			raise ValueError("Grid table not converted properly")

@@ -789,35 +835,35 @@ def generate_html_table_with_spans(pandoc_table):

	for row in grid_header:
		for cell in row:
			if cell['rowspan'] != 0 and cell['colspan'] != 0:
			if cell.rowspan != 0 and cell.colspan != 0:
				has_header = True
	if has_header:
		html += "    <thead>\n"
		for row in grid_header:
			html += "        <tr>\n"
			for cell in row:
				if cell['rowspan'] == 0 or cell['colspan'] == 0:
				if cell.rowspan == 0 or cell.colspan == 0:
					continue
				else:
					# Prepare content, in case there's a list
					#print(cell['content'])
					#print(cell.content)
					if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
											 cell['content']):  # Update cell in new row
											 cell.content):  # Update cell in new row
						#print("MATCHING")
						list = "<ul>"
						# Build list the matches
						for match in matches:
							list += "<li>" + match[1] + "</li>"
						list += "</ul>"
						cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
						cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell.content)
						# Enforce left alignment if cell contains a list
						cell['alignment'] = "align=\"left\""
						cell.alignment = "align=\"left\""
					#else:
					#	print("NOT MATCHING")

					rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
					colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
					html += f"            <th{rowspan}{colspan} {cell['alignment']}>{cell['content']}</th>\n"
					rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else ""
					colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else ""
					html += f"            <th{rowspan}{colspan} {cell.alignment}>{cell.content}</th>\n"
			html += "        </tr>\n"
		html += "    </thead>\n"

@@ -825,26 +871,27 @@ def generate_html_table_with_spans(pandoc_table):
	for row in grid_body:
		html += "        <tr>\n"
		for cell in row:
			if cell['rowspan'] == 0 or cell['colspan'] == 0:
			if cell.rowspan == 0 or cell.colspan == 0:
				continue
			else:
				#Prepare content, in case there's a list
				#print(cell['content'])
				if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell['content']):  # Update cell in new row
				#print(cell.content)
				if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell.content):  # Update cell in new row
					#print("MATCHING")
					#print(cell.content)
					list = "<ul>"
					# Build list the matches
					for match in matches:
						list += "<li>" + match[1] + "</li>"
					list += "</ul>"
					cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
					cell.content = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell.content)
					# Enforce left alignment if cell contains a list
					cell['alignment'] = "align=\"left\""
					cell.alignment = "align=\"left\""
				#else:
					#print("NOT MATCHING")
				rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
				colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
				html += f"            <td{rowspan}{colspan} {cell['alignment']}>{cell['content']}</td>\n"
				rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else ""
				colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else ""
				html += f"            <td{rowspan}{colspan} {cell.alignment}>{cell.content}</td>\n"
		html += "        </tr>\n"

	html += "    </tbody>\n"