Commit d15b1ca0 authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

Change algorithm to convert grid tables + compacting code

parent a1dbd013
Loading
Loading
Loading
Loading
+88 −91
Original line number Diff line number Diff line
@@ -484,7 +484,6 @@ def parse_pandoc_table_with_spans(pandoc_table):
			self.alignment = "align=\"center\""
			self.position = None
			self.list_flag = False
			self.auxiliar_index = None

		def set_alignment(self):
			header_delimiter_index = 0
@@ -512,10 +511,22 @@ def parse_pandoc_table_with_spans(pandoc_table):
		def __setitem__(self, key, value):
			self.cells[key] = value

	class RowTracker:
		"""Fixed-size list of integer counters, every slot starting at 0.

		Subscript reads and writes are forwarded to the underlying
		``rowTracker`` list, so ``tracker[i] += 1`` updates slot ``i`` in place.
		"""

		def __init__(self, items):
			# One zero-initialised counter for each of the `items` slots
			self.rowTracker = [0] * items

		def __getitem__(self, item):
			counters = self.rowTracker
			return counters[item]

		def __setitem__(self, key, value):
			counters = self.rowTracker
			counters[key] = value

	# Detect separator lines by pattern (partial separators are not taken into account)
	def is_separator(line):
		"""Return the result of matching ``line`` against ``_matchGridTableSeparator``."""
		separator_match = _matchGridTableSeparator.match(line)
		return separator_match

	# Set content on the cell - concatenating multilines, flagging lists
	def handling_content(cell, content):
		if cell.content is None:
			cell.rowspan += 1
@@ -524,9 +535,9 @@ def parse_pandoc_table_with_spans(pandoc_table):
				cell.list_flag = True
				#print(content)
				cell.content = content.strip() + "\n"  # Add newline to know when the list element ends
			elif cell.list_flag and cells[i].strip() != "":  # any other content when handling list is concatenated to the last list element
			elif cell.list_flag and content.strip() != "":  # any other content when handling list is concatenated to the last list element
				cell.content += content.strip() + "\n"
			elif cells[i].strip == "":  # separation between list and other paragraph
			elif content.strip == "":  # separation between list and other paragraph
				cell.list_flag = False
				cell.content += "\n" #if not cell['content'].endswith("\n") else ""
			else:
@@ -538,10 +549,10 @@ def parse_pandoc_table_with_spans(pandoc_table):
					#cell['content'] = cell['content'].strip("\n")
				cell.list_flag = True
				cell.content += content.strip() + "\n"  # Add newline to know when the list element ends
			elif cell.list_flag and cells[i].strip() != "":  # any other content when handling list is concatenated to the last list element
			elif cell.list_flag and content.strip() != "":  # any other content when handling list is concatenated to the last list element
				cell.content = cell.content.strip("\n")
				cell.content += " " + content.strip() + "\n"
			elif cells[i].strip() == "":  # separation between list and other paragraph
			elif content.strip() == "":  # separation between list and other paragraph
				cell.list_flag = False
				#content = re.sub(r'\\\s*$', "\n", content.strip())
				cell.content += "\n" if not cell.content.endswith("\n") else ""
@@ -551,6 +562,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
		#print(cell['content'])
		return cell

	# Adjust colspan of a cell
	def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions):
		for j in range(column_index, number_of_parts):
			delimiter_start = None
@@ -614,9 +626,8 @@ def parse_pandoc_table_with_spans(pandoc_table):

	data_rows = []
	for row in range(len(separator_indices) - 1):
		table_row = []
		auxiliar_rows = []
		has_merged_cells = False
		rows = []
		rows_tracker = []
		in_data_row = False
		start, end = separator_indices[row], separator_indices[row + 1]
		row_lines = lines[start:end]  # Lines between separators including separator line start as it gives information about the number of columns of the row
@@ -624,10 +635,8 @@ def parse_pandoc_table_with_spans(pandoc_table):
			# Combine multiline content into single strings for each cell
			for line in row_lines:
				if is_separator(line) and not in_data_row:
					number_of_columns_row = line.count("+") - 1
					in_data_row = True
					parts = re.split(r"\s*\+\s*", line.strip("+"))
					# Add as many cells as columns with span attributes
					delimiter_index = 0
					# Determine the alignment of the cell. To replicate Pandoc's behaviour (alignment colons are not supported on separator lines, only on the header separator),
					# we need to assign the default alignment as defined in the header separator line
@@ -640,113 +649,101 @@ def parse_pandoc_table_with_spans(pandoc_table):
					#		alignments.append("align=\"right\"")
					#	else:
					#		alignments.append("align=\"center\"")
					header_delimiter_index = 0
					table_row = Row(number_of_columns)
					rows.append(Row(number_of_columns))
					#rows_tracker = [RowTracker() for _ in range(number_of_columns)]
					rows_tracker = RowTracker(number_of_columns)
					i = 0
					j = 0
					while i in range(number_of_columns) and j in range(len(parts)):
					for j in range(len(parts)):
						if i in range(number_of_columns):
							delimiter_index += len(parts[j]) + 1
						#table_row[i].alignment = default_alignments[i] if i == 0 else "align=\"center\""
						table_row[i].position = delimiter_index # Position of cell delimiter +
							# Set position
							rows[-1][i].position = delimiter_index # Position of cell delimiter +
							# Set alignment as defined by header separator line
						table_row[i].set_alignment()
							rows[-1][i].set_alignment()
							while delimiter_index > delimiter_positions[i]:
								i += 1
							i += 1
						j += 1

				elif in_data_row:
					# Regular data row or partial separator
					if _matchGridTableBodySeparator.match(line): # Partial separator
						has_merged_cells = True
						cells = re.split(r"[\|\+]", line.strip("|").strip("+"))  # (?<!\\)[\|\+]
						#Add auxiliar line, set delimiters for each cell
						auxiliar_rows.append(Row(number_of_columns))
						cells_content = re.split(r"[\|\+]", line.strip("|").strip("+"))  # (?<!\\)[\|\+]
						#Add another row, set delimiters for each cell
						rows.append(Row(number_of_columns))
						aux_delimiter_index = 0
						for auxiliar_cell_index in range(number_of_columns):
							aux_delimiter_index += len(cells[auxiliar_cell_index]) + 1
							auxiliar_rows[-1][auxiliar_cell_index].position = aux_delimiter_index  # Position of cell delimiter +
							auxiliar_rows[-1][auxiliar_cell_index].set_alignment()

						if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							table_row_index = 0
							for i in range(len(cells)):
								if _matchGridTableBodySeparatorLine.match(cells[i]):  # A new row is to be added
						auxiliar_cell_index = 0
						for i in range(len(cells_content)):
							if auxiliar_cell_index in range(number_of_columns):
								aux_delimiter_index += len(cells_content[i]) + 1
								rows[-1][auxiliar_cell_index].position = aux_delimiter_index  # Position of cell delimiter +
								rows[-1][auxiliar_cell_index].set_alignment()
								while aux_delimiter_index > delimiter_positions[auxiliar_cell_index]:
									auxiliar_cell_index += 1
								auxiliar_cell_index += 1

						if len(cells_content) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							column_index = 0
							for i in range(len(cells_content)):
								if _matchGridTableBodySeparatorLine.match(cells_content[i]):  # A new row is to be added
									rows_tracker[column_index] += 1
									rows[rows_tracker[column_index]][column_index].list_flag = False
									#auxiliar_rows[-1]['use_auxiliar_row'][i] = True
									auxiliar_rows[-1][i].list_flag = False
									table_row[i].auxiliar_index = len(auxiliar_rows)-1
									#if cells[i].startswith(":") and not cells[i].endswith(":"):
									#	auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"left\""
									#elif not cells[i].startswith(":") and  cells[i].endswith(":"):
									#	auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"right\""
									#else:
									#	auxiliar_rows[-1]['auxiliar_row'][i]['alignment'] = "align=\"center\""
									column_forward = 0
									for del_index in range(column_index, len(delimiter_positions)):
										if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[del_index]:
											column_forward += 1
											rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1 else 0
									column_index += column_forward
									continue
								else:
									# Handle content of the cell
									if table_row[i].auxiliar_index is not None: # and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
										auxiliar_rows[table_row[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[i].auxiliar_index][i], cells[i])
										if not auxiliar_rows[table_row[i].auxiliar_index][i].colspan_adjusted:
											auxiliar_rows[table_row[i].auxiliar_index][i].colspan_adjusted = True
									rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i])
									rows[rows_tracker[column_index]][column_index].rowspan += 1
									if not rows[rows_tracker[column_index]][column_index].colspan_adjusted:
										rows[rows_tracker[column_index]][column_index].colspan_adjusted = True
										# TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
											auxiliar_rows[table_row[i].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row[i].auxiliar_index], i, number_of_columns, line, number_of_columns, delimiter_positions)
											table_row_index += auxiliar_rows[table_row[table_row_index].auxiliar_index][i].colspan - 1
									else:
										table_row[table_row_index] = handling_content(table_row[table_row_index], cells[i])
										# Cell which is not separator
										table_row[table_row_index].rowspan += 1
										if not table_row.cells[table_row_index].colspan_adjusted:
											table_row[table_row_index].colspan_adjusted = True
											#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
											table_row[table_row_index] = adjust_colspan(table_row, table_row_index, number_of_columns, line, number_of_columns, delimiter_positions)
											#table_row_index += table_row[i].colspan - 1 #Move forward index i
								if table_row[table_row_index].position == delimiter_positions[i]:
									table_row_index += table_row[table_row_index].colspan if table_row[table_row_index].colspan != 0 else 1
										rows[rows_tracker[column_index]][column_index] = adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions)

									if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]:
										column_index += rows[rows_tracker[column_index]][column_index].colspan if rows[rows_tracker[column_index]][column_index].colspan != 0 else 1
									continue

						else:
							raise ValueError("More cells than columns found")
					else: # Data row
						cells = re.split(r"\s*\|\s*", line.strip("|"))
						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							table_row_index = 0
							for i in range(len(cells)):
						cells_content = re.split(r"\s*\|\s*", line.strip("|"))
						column_index = 0
						if len(cells_content) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							for i in range(len(cells_content)):
								# Handle content of the cell
								if table_row[table_row_index].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
									auxiliar_rows[table_row.cells[table_row_index].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[table_row_index].auxiliar_index][i], cells[i])
									if not auxiliar_rows[table_row[table_row_index].auxiliar_index].cells[i].colspan_adjusted:
										auxiliar_rows[table_row[table_row_index].auxiliar_index].cells[i].colspan_adjusted = True
								rows[rows_tracker[column_index]][column_index] = handling_content(rows[rows_tracker[column_index]][column_index], cells_content[i])
								if not rows[rows_tracker[column_index]][column_index].colspan_adjusted:
									rows[rows_tracker[column_index]][column_index].colspan_adjusted = True
									#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
										auxiliar_rows[table_row[table_row_index].auxiliar_index][i] = adjust_colspan(auxiliar_rows[table_row[table_row_index].auxiliar_index].cells, i, number_of_columns, line, number_of_columns, delimiter_positions)
										table_row_index += auxiliar_rows[table_row[table_row_index].auxiliar_index][i].colspan - 1  # Move forward index i
								else:
									table_row[table_row_index] = handling_content(table_row[table_row_index], cells[i])
									if not table_row.cells[table_row_index].colspan_adjusted:
										table_row[table_row_index].colspan_adjusted = True
										table_row[table_row_index] = adjust_colspan(table_row.cells, table_row_index, number_of_columns, line, number_of_columns, delimiter_positions)
										table_row_index += table_row[table_row_index].colspan - 1  # Move forward index i

								table_row_index += 1
						elif len(cells) == number_of_columns: # Simple row
							for i in range(len(cells)):
								if table_row[i].auxiliar_index is not None:# and auxiliar_rows[table_row[i]['auxiliar_index']]['use_auxiliar_row'][i]:
									auxiliar_rows[table_row[i].auxiliar_index][i] = handling_content(auxiliar_rows[table_row[i].auxiliar_index][i], cells[i])
								else:
									# Handle content of the cell
									table_row[i] = handling_content(table_row[i], cells[i])
									rows[rows_tracker[column_index]][column_index] = adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, line, number_of_columns, delimiter_positions)
								if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]:
									column_index += rows[rows_tracker[column_index]][column_index].colspan  # Move forward index i

						elif len(cells_content) == number_of_columns: # Simple row
							for i in range(len(cells_content)):
								rows[rows_tracker[i]][i] = handling_content(rows[rows_tracker[i]][i], cells_content[i])
						else:
							raise ValueError("More cells than columns found")
				else:
					raise ValueError("No separator line found for row starting")

			if has_header and start >= header_separator_index: # table_row and auxiliar_row are part of data_rows
				data_rows.append(table_row.cells)
				if has_merged_cells:
					for row in auxiliar_rows:
						#for i in range(len(row.cells)):
						#	print(row.cells[i].content)
						data_rows.append(row.cells)
				for body_row in rows:
					data_rows.append(body_row.cells)
			elif has_header and start < header_separator_index: # table_row and auxiliar_row are part of header_rows
				header_rows.append(table_row.cells)
				if has_merged_cells:
					for row in auxiliar_rows:
						header_rows.append(row.cells)
				for header_row in rows:
					header_rows.append(header_row.cells)

	#print(header_rows)
	#print(data_rows)