Commit bb284002 authored by Miguel Angel Reina Ortega
Browse files

Some cleanup for handling of grid tables

parent 708d9fb8
Loading
Loading
Loading
Loading
+103 −113
Original line number Diff line number Diff line
@@ -7,6 +7,8 @@
#	directory structure.
#
from __future__ import annotations

import logging
from enum import Enum, auto
import argparse, re, os, shutil, hashlib, base64
from dataclasses import dataclass
@@ -485,6 +487,19 @@ def parse_pandoc_table_with_spans(pandoc_table):
			self.list_flag = False
			self.auxiliar_index = None

		def set_alignment(self):
			"""	Set this cell's alignment from the header separator line.

				Scans the header delimiter positions in column order and takes the
				alignment of the first header column whose delimiter position is at
				or after this cell's own delimiter position (`self.position`).

				`default_alignments` and `header_delimiter_positions` are not
				parameters; they are read from the enclosing scope.

				Raises ValueError when every header delimiter lies before the cell
				position (or when there are no header columns at all).
			"""
			# The scan is bounded by the number of alignments, as the delimiter
			# positions are expected to have one entry per alignment.
			for column, column_alignment in enumerate(default_alignments):
				if self.position <= header_delimiter_positions[column]:
					self.alignment = column_alignment
					return
			raise ValueError("Invalid table formatting")

	class Row():
		"""	Represents a row in the markdown file. """
		cells:list[Cell] = []
@@ -492,6 +507,12 @@ def parse_pandoc_table_with_spans(pandoc_table):
		def __init__(self, length: int = 1) -> None:
			"""	Build a row made of `length` newly created cells (one cell by default). """
			fresh_cells = []
			for _ in range(length):
				fresh_cells.append(Cell())
			self.cells = fresh_cells

		def __getitem__(self, item):
			return self.cells[item]

		def __setitem__(self, key, value):
			self.cells[key] = value

	# Detect separator lines by pattern (partial separators are not taken into account)
	def is_separator(line):
		"""	Match `line` against the grid table separator pattern.

			Returns whatever `_matchGridTableSeparator.match(line)` yields
			(presumably a match object or None; the pattern is defined elsewhere).
		"""
		separator_match = _matchGridTableSeparator.match(line)
		return separator_match
@@ -573,7 +594,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
			has_header = True
			header_separator_index = index
			header_rows = []
			parts = re.split(r"\s*\+\s*", lines[index].strip("+"))
			parts = re.split(r"\+", lines[index].strip("+"))
			default_alignments = []
			#Calculate default alignments and positions of delimiters
			for part_index in range(len(parts)):
@@ -592,9 +613,6 @@ def parse_pandoc_table_with_spans(pandoc_table):
	for row in range(len(separator_indices) - 1):
		table_row = []
		auxiliar_rows = []
		auxiliar_row = []
		use_auxiliar_row = []
		list_flags = []
		has_merged_cells = False
		in_data_row = False
		start, end = separator_indices[row], separator_indices[row + 1]
@@ -623,45 +641,31 @@ def parse_pandoc_table_with_spans(pandoc_table):
					table_row = Row(number_of_columns_row)
					for i in range(number_of_columns_row):
						delimiter_index += len(parts[i]) + 1
						table_row.cells[i].alignment = default_alignments[i] if i == 0 else "align=\"center\""
						table_row.cells[i].position = delimiter_index # Position of cell delimiter +
						table_row[i].alignment = default_alignments[i] if i == 0 else "align=\"center\""
						table_row[i].position = delimiter_index # Position of cell delimiter +

						#Set alignment as defined by header separator line
						while header_delimiter_index in range(len(default_alignments)) and table_row.cells[i].position > header_delimiter_positions[header_delimiter_index]:
							header_delimiter_index += 1
						if header_delimiter_index in range(len(default_alignments)):
							if table_row.cells[i].position < header_delimiter_positions[header_delimiter_index]:
								table_row.cells[i].alignment = default_alignments[header_delimiter_index]
							elif table_row.cells[i].position == header_delimiter_positions[header_delimiter_index]:
								table_row.cells[i].alignment = default_alignments[i]
								header_delimiter_index += 1
						else:
							raise ValueError("Invalid table formatting")

					#auxiliar_row = Row(number_of_columns)
					#for i in range(number_of_columns):
						#auxiliar_row.append(default_cell)
						#use_auxiliar_row.append(False)
						#auxiliar_rows.append({'auxiliar_row':auxiliar_row, 'use_auxiliar':use_auxiliar_row, 'list_flags':list_flags})
						table_row[i].set_alignment()

				elif in_data_row:
					# Regular data row or partial separator
					if _matchGridTableBodySeparator.match(line): # Partial separator
						has_merged_cells = True
						cells = re.split(r"[\|\+]", line.strip("|").strip("+"))  # (?<!\\)[\|\+]
						#Add auxiliar line, set delimiters for each cell
						auxiliar_rows.append(Row(number_of_columns))
						aux_delimiter_index = 0
						for i in range(number_of_columns_row):
							aux_delimiter_index += len(parts[i]) + 1
							auxiliar_rows[-1].cells[i].position = aux_delimiter_index  # Position of cell delimiter +
						for auxiliar_cell_index in range(number_of_columns):
							aux_delimiter_index += len(cells[auxiliar_cell_index]) + 1
							auxiliar_rows[-1][auxiliar_cell_index].position = aux_delimiter_index  # Position of cell delimiter +
						auxiliar_rows[-1][i].set_alignment()

						cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+")) # (?<!\\)[\|\+]
						if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							for i in range(len(cells)):
								if _matchGridTableBodySeparatorLine.match(cells[i]):  # A new row is to be added
									#auxiliar_rows[-1]['use_auxiliar_row'][i] = True
									auxiliar_rows[-1].cells[i].list_flag = False
									table_row.cells[i].auxiliar_index = len(auxiliar_rows)-1
									auxiliar_rows[-1][i].list_flag = False
									table_row[i].auxiliar_index = len(auxiliar_rows)-1
									#if cells[i].startswith(":") and not cells[i].endswith(":"):
									#	auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\""
									#elif not cells[i].startswith(":") and  cells[i].endswith(":"):
@@ -670,37 +674,20 @@ def parse_pandoc_table_with_spans(pandoc_table):
									#	auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\""
								else:
									# Handle content of the cell
									if table_row.cells[i].auxiliar_index is not None: # and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
										auxiliar_rows[table_row.cells[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index][i], cells[i])
										if not auxiliar_rows[table_row.cells[i].auxiliar_index][i].colspan_adjusted:
											auxiliar_rows[table_row.cells[i].auxiliar_index][i].colspan_adjusted = True
									if table_row[i].auxiliar_index is not None: # and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
										auxiliar_rows[table_row[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[i].auxiliar_index][i], cells[i])
										if not auxiliar_rows[table_row[i].auxiliar_index][i].colspan_adjusted:
											auxiliar_rows[table_row[i].auxiliar_index][i].colspan_adjusted = True
											# TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
											auxiliar_rows[table_row.cells[i].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row.cells[i].auxiliar_index], i, len(cells), line, number_of_columns, delimiter_positions)
											auxiliar_rows[table_row[i].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row[i].auxiliar_index], i, len(cells), line, number_of_columns, delimiter_positions)
									else:
										table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
										table_row[i] = handling_content(table_row[i], cells[i])
										# Cell which is not separator
										table_row.cells[i].rowspan += 1
										table_row[i].rowspan += 1
										if not table_row.cells[i].colspan_adjusted:
											table_row.cells[i].colspan_adjusted = True
											table_row[i].colspan_adjusted = True
											#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
											table_row.cells[i] = adjust_colspan(table_row.cells, i, len(cells), line, number_of_columns, delimiter_positions)
						#elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
						#	for i in range(len(cells)):
						#		if _matchGridTableBodySeparatorLine.match(cells[i]):  # Update cell in new row
						#			use_auxiliar_row[i] = True
						#			list_flags[i] = False
						#			if cells[i].startswith(":") and not cells[i].endswith(":"):
						#				auxiliar_row[i]['alignment'] = "align=\"left\""
						#			elif not cells[i].startswith(":") and  cells[i].endswith(":"):
						#				auxiliar_row[i]['alignment'] = "align=\"right\""
						#			else:
						#				auxiliar_row[i]['alignment'] = "align=\"center\""
						#		else:
						#			#Handle content of the cell
						#			list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
						#			# Cell which is not separator
						#			table_row[i]['rowspan'] += 1
						#			# Adjusting of colspan not needed, no colspan as number of cells is equal to number of columns
											table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
						else:
							raise ValueError("More cells than columns found")
					else: # Data row
@@ -708,30 +695,29 @@ def parse_pandoc_table_with_spans(pandoc_table):
						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							for i in range(len(cells)):
								# Handle content of the cell
								if table_row.cells[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
									auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i], cells[i])
								if table_row[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
									auxiliar_rows[table_row.cells[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[i].auxiliar_index][i], cells[i])
									if not auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i].colspan_adjusted:
										auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i].colspan_adjusted = True
										#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
										auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = adjust_colspan(auxiliar_rows[table_row.cells[i].auxiliar_index].cells, i, len(cells), line, number_of_columns, delimiter_positions)
										auxiliar_rows[table_row[i].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row[i].auxiliar_index].cells, i, len(cells), line, number_of_columns, delimiter_positions)
								else:
									table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
									table_row[i] = handling_content(table_row[i], cells[i])
									if not table_row.cells[i].colspan_adjusted:
										table_row.cells[i].colspan_adjusted = True
										table_row.cells[i] = adjust_colspan(table_row.cells, i, len(cells), line, number_of_columns, delimiter_positions)
										table_row[i].colspan_adjusted = True
										table_row[i] = adjust_colspan(table_row.cells, i, len(cells), line, number_of_columns, delimiter_positions)
						elif len(cells) == number_of_columns: # Simple row
							for i in range(len(cells)):
								if table_row.cells[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
									auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i] = handling_content(auxiliar_rows[table_row.cells[i].auxiliar_index].cells[i], cells[i])
								if table_row[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
									auxiliar_rows[table_row[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[i].auxiliar_index][i], cells[i])
								else:
									# Handle content of the cell
									table_row.cells[i] = handling_content(table_row.cells[i], cells[i])
									table_row[i] = handling_content(table_row[i], cells[i])
						else:
							raise ValueError("More cells than columns found")
				else:
					raise ValueError("No separator line found for row starting")


			if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows
				data_rows.append(table_row.cells)
				if has_merged_cells:
@@ -759,7 +745,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
			for cell in row:
				if cell.content is not None:
					# Replacing "<" by &lt;
					cell.content = cell.content.replace("<", "&lt;")
					#cell.content = cell.content.replace("<", "&lt;")

					#Bold
					for bold_characters in ["**", "__"]:
@@ -828,8 +814,12 @@ def generate_html_table_with_spans(pandoc_table):
	:param pandoc_table: String of the Pandoc-style grid table.
	:return: HTML string.
	"""
	try:
		grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table)

	except:
		logging.ERROR("Grid table could not be generated")
		return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS"
	else:
		html = "<table>\n"
		has_header = False