Commit 2451610e authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

More cleanup + cell alignment as defined in header separator line (Pandoc's behaviour)

parent d59cfbc2
Loading
Loading
Loading
Loading
+88 −134
Original line number Diff line number Diff line
@@ -422,6 +422,7 @@ _matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
_matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE)
_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
@@ -502,8 +503,24 @@ def parse_pandoc_table_with_spans(pandoc_table):
		#print(cell['content'])
		return list_flag, cell

	def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions):
		"""Increase the colspan of ``row[column_index]`` based on delimiter positions in ``line``.

		For each cell index ``j`` from ``column_index`` up to ``number_of_parts - 1`` the
		first ``|`` or ``+`` found in ``line`` after the previous cell's recorded
		``'position'`` is compared with ``delimiter_positions[j]``:

		* found delimiter is further right -> the cell's colspan is incremented;
		* found delimiter is further left (or none is found) -> ``ValueError``;
		* both coincide -> the scan stops.

		:param row: list of cell dicts; the ``'colspan'`` and ``'position'`` keys are used.
		:param column_index: index in ``row`` of the cell whose colspan is adjusted.
		:param number_of_parts: number of cells in the current line (upper bound of the scan).
		:param line: raw text line in which the ``|`` / ``+`` delimiters are searched.
		:param number_of_columns: total number of columns of the table.
		:param delimiter_positions: expected delimiter offsets, indexed by column.
		:return: the (mutated) cell ``row[column_index]``.
		:raises ValueError: if a delimiter is found before its expected position.
		"""
		for j in range(column_index, number_of_parts):
			# Start searching right after the delimiter that closes the previous cell
			delimiter_start = row[j - 1]['position'] if j != 0 else 0
			positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+"]
			positions = [found for found in positions if found != -1]
			position = min(positions) if positions else -1
			if position > delimiter_positions[j]:  # Colspan to be increased
				# Use the explicit parameter rather than a loop variable leaked from the enclosing scope
				row[column_index]['colspan'] += 1
				if position == delimiter_positions[-1]:  # last cell in row, adjust colspan to get max number columns
					colspan_allocated = sum(row[cell_index]['colspan'] for cell_index in range(number_of_parts))
					row[column_index]['colspan'] += number_of_columns - colspan_allocated
			elif position < delimiter_positions[j]:
				raise ValueError("Wrong cell formatting")
			else:
				break
		return row[column_index]

	_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
	separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]

	print(separator_indices)
@@ -522,11 +539,26 @@ def parse_pandoc_table_with_spans(pandoc_table):
				del_positions = [lines[separator_index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[separator_index][delimiter_positions_start + 1:]]
				delimiter_positions.append(min(del_positions) if del_positions else -1)
	has_header = False
	header_delimiter_positions = []
	for index in separator_indices:
		if _matchGridTableHeaderSeparator.match(lines[index]):
			has_header = True
			header_separator_index = index
			header_rows = []
			parts = re.split(r"\s*\+\s*", lines[index].strip("+"))
			default_alignments = []
			#Calculate default alignments and positions of delimiters
			for part_index in range(len(parts)):
				if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
					default_alignments.append("align=\"left\"")
				elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
					default_alignments.append("align=\"right\"")
				else:
					default_alignments.append("align=\"center\"")
				# Delimiter position
				delimiter_positions_start = delimiter_positions[part_index - 1] if part_index != 0 else 0
				del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]]
				header_delimiter_positions.append(min(del_positions) if del_positions else -1)

	data_rows = []
	for row in range(len(separator_indices) - 1):
@@ -550,14 +582,15 @@ def parse_pandoc_table_with_spans(pandoc_table):
					# Determine the alignment of the cell. To replicate Pandoc's behaviour (alignment colons are supported only on the header separator line, not on other separator lines),
					# we need to assign the default alignment as defined in the header separator line.
					# We may not need the code below, as it supports alignment per cell and row.
					alignments = []
					for part_index in range(len(parts)):
						if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
							alignments.append("align=\"left\"")
						elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
							alignments.append("align=\"right\"")
						else:
							alignments.append("align=\"center\"")
					#alignments = []
					#for part_index in range(len(parts)):
					#	if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
					#		alignments.append("align=\"left\"")
					#	elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
					#		alignments.append("align=\"right\"")
					#	else:
					#		alignments.append("align=\"center\"")
					header_delimiter_index = 0
					for i in range(number_of_columns_row):
						delimiter_index += len(parts[i]) + 1
						table_row.append({
@@ -565,9 +598,21 @@ def parse_pandoc_table_with_spans(pandoc_table):
							"rowspan": 0,
							"colspan": 0,
							"colspan_adjusted": False,
							"alignment": alignments[i] if alignments[i] else "align=\"center\"",
							"alignment": default_alignments[i] if i == 0 else "align=\"center\"",
							"position": delimiter_index # Position of cell delimiter +
						})
						#Set alignment as defined by header separator line
						while header_delimiter_index in range(len(default_alignments)) and table_row[i]['position'] > header_delimiter_positions[header_delimiter_index]:
							header_delimiter_index += 1
						if header_delimiter_index in range(len(default_alignments)):
							if table_row[i]['position'] < header_delimiter_positions[header_delimiter_index]:
								table_row[i]['alignment'] = default_alignments[header_delimiter_index]
							elif table_row[i]['position'] == header_delimiter_positions[header_delimiter_index]:
								table_row[i]['alignment'] = default_alignments[i]
								header_delimiter_index += 1
						else:
							raise ValueError("Invalid table formatting")

					for i in range(number_of_columns):
						auxiliar_row.append({
							"content": None,
@@ -585,10 +630,11 @@ def parse_pandoc_table_with_spans(pandoc_table):
					if _matchGridTableBodySeparator.match(line): # Partial separator
						has_merged_cells = True
						cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+")) # (?<!\\)[\|\+]
						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
						if len(cells) <= number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							for i in range(len(cells)):
								if _matchGridTableSeparatorLine.match(cells[i]):  # A new row is to be added
								if _matchGridTableBodySeparatorLine.match(cells[i]):  # A new row is to be added
									use_auxiliar_row[i] = True
									list_flags[i] = False
									if cells[i].startswith(":") and not cells[i].endswith(":"):
										auxiliar_row[i]['alignment'] = "align=\"left\""
									elif not cells[i].startswith(":") and  cells[i].endswith(":"):
@@ -596,145 +642,49 @@ def parse_pandoc_table_with_spans(pandoc_table):
									else:
										auxiliar_row[i]['alignment'] = "align=\"center\""
								else:
									#Handle content of the cell
									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
									#if table_row[i]['content'] is None:
									#	table_row[i]['rowspan'] += 1
									#	table_row[i]['colspan'] += 1
										#if cells[i].strip().startswith("- "):  # List
										#	handling_list = True
										#	print(cells[i])
										#	table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
										#elif handling_list: # any other content when handling list is concatenated to the last list element
										#	table_row[i]['content'].strip("\n")
										#	table_row[i]['content'] += cells[i] + "\n"
										#elif cells[i].strip(): #separation between list and other paragraph
										#	handling_list = False
										#	table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
										#else:
										#	table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
									#else:
									#	if cells[i].strip().startswith("- "): # List
									#		print(cells[i])
									#		table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
									#	else:
									#		table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])									# Cell which is not separator
									# Cell which is not separator
									table_row[i]['rowspan'] += 1
									if not table_row[i]['colspan_adjusted']:
										table_row[i]['colspan_adjusted'] = True
										for j in range(i, len(cells)):
											delimiter_start = table_row[j-1]['position'] if j != 0 else 0
											positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]]
											position = min(positions) if positions else -1
											if position > delimiter_positions_start[j]: # Colspan to add
												table_row[i]['colspan'] += 1
											elif position < delimiter_positions_start[j]:
												raise ValueError("Wrong cell formatting")
											else:
												break
						elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
							for i in range(len(cells)):
								if _matchGridTableSeparatorLine.match(cells[i]):  # Update cell in new row
									use_auxiliar_row[i] = True
									if cells[i].startswith(":") and not cells[i].endswith(":"):
										auxiliar_row[i]['alignment'] = "align=\"left\""
									elif not cells[i].startswith(":") and  cells[i].endswith(":"):
										auxiliar_row[i]['alignment'] = "align=\"right\""
									else:
										auxiliar_row[i]['alignment'] = "align=\"center\""
								else:
									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])

#									if table_row[i]['content'] is None:
#										table_row[i]['rowspan'] += 1
#										table_row[i]['colspan'] += 1
#										if cells[i].strip().startswith("- "): # List
#											print(cells[i])
#											table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
#										else:
#											table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
										#TO BE CHECKED Most probably the code below is never executed, colspan should be already adjusted when dealing with a partial separator
										table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
						#elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
						#	for i in range(len(cells)):
						#		if _matchGridTableBodySeparatorLine.match(cells[i]):  # Update cell in new row
						#			use_auxiliar_row[i] = True
						#			list_flags[i] = False
						#			if cells[i].startswith(":") and not cells[i].endswith(":"):
						#				auxiliar_row[i]['alignment'] = "align=\"left\""
						#			elif not cells[i].startswith(":") and  cells[i].endswith(":"):
						#				auxiliar_row[i]['alignment'] = "align=\"right\""
						#			else:
#										if cells[i].strip().startswith("- "): # List
#											print(cells[i])
#											table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
						#				auxiliar_row[i]['alignment'] = "align=\"center\""
						#		else:
#											table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])

									# Cell which is not separator
									table_row[i]['rowspan'] += 1
									# Not needed, no colspan as number of cells is equal to number of columns
									#for j in range(i, len(cells)):
									#	delimiter_start = table_row[j-1]['position'] if j != 0 else 0
									#	positions = [line.find(delimiter,delimiter_start+1) for delimiter in "|+" if delimiter in line[delimiter_start+1:]]
									#	position = min(positions) if positions else -1
									#	if position > table_row[i]['position']:  # Only colspan to be increased
									#		table_row[i]['colspan'] += 1
									#	elif position + 1  < table_row[i]['position']:
									#		raise ValueError("Wrong cell formatting")
									#	else:
									#		break

						#			#Handle content of the cell
						#			list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
						#			# Cell which is not separator
						#			table_row[i]['rowspan'] += 1
						#			# Adjusting of colspan not needed, no colspan as number of cells is equal to number of columns
						else:
							raise ValueError("More cells than columns found")
					else: # Data row
						cells = re.split(r"\s*\|\s*", line.strip("|"))
						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							for i in range(len(cells)):
								# Handle content of the cell
								list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
#								if table_row[i]['content'] is None:
#									table_row[i]['rowspan'] += 1
#									table_row[i]['colspan'] += 1
#									if cells[i].strip().startswith("- "):  # List
#										print(cells[i])
#										table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
#									else:
#										table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#								else:
#									if cells[i].strip().startswith("- "):  # List
#										print(cells[i])
#										table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
#									else:
#										table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
								if not table_row[i]['colspan_adjusted']:
									table_row[i]['colspan_adjusted'] = True
									for j in range(i, len(cells)):
										delimiter_start = table_row[j-1]['position'] if j != 0 else 0
										if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased
											table_row[i]['colspan'] += 1
											if line.find("|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]:  # last cell in row, adjust colspan to get max number columns
												colspan_remaining = 0
												for cell_index in range(number_of_columns_row):
													colspan_remaining += table_row[cell_index]['colspan']
												table_row[i]['colspan'] += number_of_columns - colspan_remaining
										elif line.find("|", delimiter_start+1) < delimiter_positions[j]:
											raise ValueError("Wrong cell formatting")
										else:
											break

									table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
						elif len(cells) == number_of_columns: # Simple row
							for i in range(len(cells)):
								if use_auxiliar_row[i]:
									if auxiliar_row[i]['content'] is None:
										auxiliar_row[i]['rowspan'] += 1
										auxiliar_row[i]['colspan'] += 1
										auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
									else:
										auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
									list_flags[i], auxiliar_row[i] = handling_content(auxiliar_row[i], cells[i],list_flags[i])
								else:
									# Handle content of the cell
									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
#									if table_row[i]['content'] is None:
#										table_row[i]['rowspan'] += 1
#										table_row[i]['colspan'] += 1
#										if cells[i].strip().startswith("- "):  # List
#											print(cells[i])
#											table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
#										else:
#											table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#									else:
#										if cells[i].strip().startswith("- "):  # List
#											print(cells[i])
#											table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
#										else:
#											table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
						else:
							raise ValueError("More cells than columns found")
				else:
@@ -850,7 +800,7 @@ def generate_html_table_with_spans(pandoc_table):
					continue
				else:
					# Prepare content, in case there's a list
					print(cell['content'])
					#print(cell['content'])
					if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
											 cell['content']):  # Update cell in new row
						#print("MATCHING")
@@ -860,6 +810,8 @@ def generate_html_table_with_spans(pandoc_table):
							list += "<li>" + match[1] + "</li>"
						list += "</ul>"
						cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
						# Enforce left alignment if cell contains a list
						cell['alignment'] = "align=\"left\""
					#else:
					#	print("NOT MATCHING")

@@ -886,6 +838,8 @@ def generate_html_table_with_spans(pandoc_table):
						list += "<li>" + match[1] + "</li>"
					list += "</ul>"
					cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
					# Enforce left alignment if cell contains a list
					cell['alignment'] = "align=\"left\""
				#else:
					#print("NOT MATCHING")
				rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""