Commit d59cfbc2 authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

Some cleanup + parsing converting lists in cells to html lists

parent bc780760
Loading
Loading
Loading
Loading
+204 −62
Original line number Diff line number Diff line
@@ -417,12 +417,13 @@ _matchCodefenceStart = re.compile(r'\s*```\s?.*', re.IGNORECASE)
_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
_matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
_matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE)
_matchTable = re.compile(r'^\s*\|.*\|\s$', re.IGNORECASE)
_matchTable = re.compile(r'^\s*\|.*\|\s*$', re.IGNORECASE)
_matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
_matchGridTableBodySeparator = re.compile(r'.*\+([-:]+\+)+.*$', re.IGNORECASE)
_matchGridTableBodySeparator = re.compile(r'.*\+([:-]+\+)+.*$', re.IGNORECASE)
_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
_matchListInContent = re.compile(r'^(?:\s*(P<marker>[-*+]|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE)
_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
_htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
_htmlAnchorLink = re.compile(r'<a\s+name="([^"]*)">[^<]*</a>', re.IGNORECASE)
@@ -466,6 +467,42 @@ def parse_pandoc_table_with_spans(pandoc_table):
		_matchGridTableSeparator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)
		return _matchGridTableSeparator.match(line)

	def handling_content(cell, content, list_flag):
		if cell['content'] is None:
			cell['rowspan'] += 1
			cell['colspan'] += 1
			if content.strip().startswith("- "):  # List
				list_flag = True
				print(content)
				cell['content'] = content.strip() + "\n"  # Add newline to know when the list element ends
			elif list_flag:  # any other content when handling list is concatenated to the last list element
				cell['content'] += content.strip() + "\n"
			elif cells[i].strip() == "":  # separation between list and other paragraph
				list_flag = False
				cell['content'] = re.sub(r'\\\s*$', "\n", content)
			else:
				cell['content'] = re.sub(r'\\\s*$', "\n", content.strip())
		else:
			if content.strip().startswith("- "):  # List
				if not list_flag:
					cell['content'] += "\n"
					#cell['content'] = cell['content'].strip("\n")
				list_flag = True
				cell['content'] += content.strip() + "\n"  # Add newline to know when the list element ends
			elif list_flag:  # any other content when handling list is concatenated to the last list element
				cell['content'] = cell['content'].strip("\n")
				cell['content'] += " " + content.strip() + "\n"
			elif cells[i].strip() == "":  # separation between list and other paragraph
				list_flag = False
				#content = re.sub(r'\\\s*$', "\n", content.strip())
				cell['content'] += "\n" if not cell['content'].endswith("\n") else ""
			else:
				content = re.sub(r'\\\s*$', "\n", content.strip())
				cell['content'] += " " + content
		#print(cell['content'])
		return list_flag, cell


	_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
	separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]

@@ -490,11 +527,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
			has_header = True
			header_separator_index = index
			header_rows = []

	data_rows = []
	for row in range(len(separator_indices) - 1):
		table_row = []
		auxiliar_row = []
		use_auxiliar_row = []
		list_flags = []
		has_merged_cells = False
		in_data_row = False
		start, end = separator_indices[row], separator_indices[row + 1]
@@ -508,24 +547,38 @@ def parse_pandoc_table_with_spans(pandoc_table):
					parts = re.split(r"\s*\+\s*", line.strip("+"))
					# Add as many cells as columns with span attributes
					delimiter_index = 0
					# Determine the alignment of the cell. To replicate Pandoc's behaviour (alignment colons are not supported on separator lines, only on the header separator),
					# we need to assign the default alignment as defined in the header separator line.
					# We may not need the code below, as it supports alignment per cell and row
					alignments = []
					for part_index in range(len(parts)):
						if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
							alignments.append("align=\"left\"")
						elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
							alignments.append("align=\"right\"")
						else:
							alignments.append("align=\"center\"")
					for i in range(number_of_columns_row):
						delimiter_index += len(parts[i]) + 1
						table_row.append({
							"content": "NOCONTENT",
							"content": None,
							"rowspan": 0,
							"colspan": 0,
							"colspan_adjusted": False,
							"alignment": alignments[i] if alignments[i] else "align=\"center\"",
							"position": delimiter_index # Position of cell delimiter +
						})
					for i in range(number_of_columns):
						auxiliar_row.append({
							"content": "NOCONTENT",
							"content": None,
							"rowspan": 0,
							"colspan": 0,
							"colspan_adjusted": False,
							"alignment": "align=\"center\"",
							"position": 0
						})
						use_auxiliar_row.append(False)
						list_flags.append(False)

				elif in_data_row:
					# Regular data row or partial separator
@@ -536,14 +589,35 @@ def parse_pandoc_table_with_spans(pandoc_table):
							for i in range(len(cells)):
								if _matchGridTableSeparatorLine.match(cells[i]):  # A new row is to be added
									use_auxiliar_row[i] = True
									if cells[i].startswith(":") and not cells[i].endswith(":"):
										auxiliar_row[i]['alignment'] = "align=\"left\""
									elif not cells[i].startswith(":") and  cells[i].endswith(":"):
										auxiliar_row[i]['alignment'] = "align=\"right\""
									else:
									if table_row[i]['content'] == "NOCONTENT":
										table_row[i]['rowspan'] += 1
										table_row[i]['colspan'] += 1
										table_row[i]['content'] = cells[i]
										auxiliar_row[i]['alignment'] = "align=\"center\""
								else:
										table_row[i]['content'] += cells[i]
									# Cell which is not separator
									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
									#if table_row[i]['content'] is None:
									#	table_row[i]['rowspan'] += 1
									#	table_row[i]['colspan'] += 1
										#if cells[i].strip().startswith("- "):  # List
										#	handling_list = True
										#	print(cells[i])
										#	table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
										#elif handling_list: # any other content when handling list is concatenated to the last list element
										#	table_row[i]['content'].strip("\n")
										#	table_row[i]['content'] += cells[i] + "\n"
										#elif cells[i].strip(): #separation between list and other paragraph
										#	handling_list = False
										#	table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
										#else:
										#	table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
									#else:
									#	if cells[i].strip().startswith("- "): # List
									#		print(cells[i])
									#		table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
									#	else:
									#		table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])									# Cell which is not separator
									table_row[i]['rowspan'] += 1
									if not table_row[i]['colspan_adjusted']:
										table_row[i]['colspan_adjusted'] = True
@@ -561,13 +635,30 @@ def parse_pandoc_table_with_spans(pandoc_table):
							for i in range(len(cells)):
								if _matchGridTableSeparatorLine.match(cells[i]):  # Update cell in new row
									use_auxiliar_row[i] = True
									if cells[i].startswith(":") and not cells[i].endswith(":"):
										auxiliar_row[i]['alignment'] = "align=\"left\""
									elif not cells[i].startswith(":") and  cells[i].endswith(":"):
										auxiliar_row[i]['alignment'] = "align=\"right\""
									else:
									if table_row[i]['content'] == "NOCONTENT":
										table_row[i]['rowspan'] += 1
										table_row[i]['colspan'] += 1
										table_row[i]['content'] = cells[i]
										auxiliar_row[i]['alignment'] = "align=\"center\""
								else:
										table_row[i]['content'] += cells[i]
									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])

#									if table_row[i]['content'] is None:
#										table_row[i]['rowspan'] += 1
#										table_row[i]['colspan'] += 1
#										if cells[i].strip().startswith("- "): # List
#											print(cells[i])
#											table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
#										else:
#											table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
#									else:
#										if cells[i].strip().startswith("- "): # List
#											print(cells[i])
#											table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
#										else:
#											table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])

									# Cell which is not separator
									table_row[i]['rowspan'] += 1
									# Not needed, no colspan as number of cells is equal to number of columns
@@ -588,40 +679,62 @@ def parse_pandoc_table_with_spans(pandoc_table):
						cells = re.split(r"\s*\|\s*", line.strip("|"))
						if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
							for i in range(len(cells)):
								if table_row[i]['content'] == "NOCONTENT":
									table_row[i]['rowspan'] += 1
									table_row[i]['colspan'] += 1
									table_row[i]['content'] = cells[i]
								else:
									table_row[i]['content'] += cells[i]
								list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
#								if table_row[i]['content'] is None:
#									table_row[i]['rowspan'] += 1
#									table_row[i]['colspan'] += 1
#									if cells[i].strip().startswith("- "):  # List
#										print(cells[i])
#										table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
#									else:
#										table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#								else:
#									if cells[i].strip().startswith("- "):  # List
#										print(cells[i])
#										table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
#									else:
#										table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
								if not table_row[i]['colspan_adjusted']:
									table_row[i]['colspan_adjusted'] = True
									for j in range(i, len(cells)):
										delimiter_start = table_row[j-1]['position'] if j != 0 else 0
										if line.find("|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased
											table_row[i]['colspan'] += 1
											if line.find("|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]:  # last cell in row, adjust colspan to get max number columns
												colspan_remaining = 0
												for cell_index in range(number_of_columns_row):
													colspan_remaining += table_row[cell_index]['colspan']
												table_row[i]['colspan'] += number_of_columns - colspan_remaining
										elif line.find("|", delimiter_start+1) < delimiter_positions[j]:
											raise ValueError("Wrong cell formatting")
										else:

											break

						elif len(cells) == number_of_columns: # Simple row
							for i in range(len(cells)):
								if use_auxiliar_row[i]:
									if auxiliar_row[i]['content'] == "NOCONTENT":
									if auxiliar_row[i]['content'] is None:
										auxiliar_row[i]['rowspan'] += 1
										auxiliar_row[i]['colspan'] += 1
										auxiliar_row[i]['content'] = cells[i]
									else:
										auxiliar_row[i]['content'] += cells[i]
										auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
									else:
									if table_row[i]['content'] == "NOCONTENT":
										table_row[i]['rowspan'] += 1
										table_row[i]['colspan'] += 1
										table_row[i]['content'] = cells[i]
										auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
								else:
										table_row[i]['content'] += cells[i]
									list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
#									if table_row[i]['content'] is None:
#										table_row[i]['rowspan'] += 1
#										table_row[i]['colspan'] += 1
#										if cells[i].strip().startswith("- "):  # List
#											print(cells[i])
#											table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
#										else:
#											table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
#									else:
#										if cells[i].strip().startswith("- "):  # List
#											print(cells[i])
#											table_row[i]['content'] += cells[i] + "\n"  # Add newline to know when the list element ends
#										else:
#											table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
						else:
							raise ValueError("More cells than columns found")
				else:
@@ -638,39 +751,47 @@ def parse_pandoc_table_with_spans(pandoc_table):

	#print(header_rows)
	#print(data_rows)
	# Correct newlines characters
	for row in header_rows:
		for cell in row:
			cell['content'] = cell['content'].replace("\\", "<br>")
	for row in data_rows:
		for cell in row:
			cell['content'] = cell['content'].replace("\\", "<br>")
	# Check if there are any data rows
	if not data_rows and not header_rows:
		raise ValueError("No valid rows found in the provided Pandoc table.")

	# Format text
	for rows in [header_rows, data_rows]:
		bold = "<strong>"
	for row in header_rows:
		italic = "<i>"
		for row in rows:
			for cell in row:
			while cell['content'].find("**") != -1:
				cell['content'] = cell['content'].replace("**", bold, 1)
				if cell['content'] is not None:
					# Replacing "<" by &lt;
					cell['content'] = cell['content'].replace("<", "&lt;")

					#Bold
					for bold_characters in ["**", "__"]:
						while cell['content'].find(bold_characters) != -1:
							cell['content'] = cell['content'].replace(bold_characters, bold, 1)
							if bold == "<strong>":
								bold = "</strong>"
							else:
								bold = "<strong>"
	bold = "<strong>"
					#Italic
					while cell['content'].find("_") != -1 and cell['content'].find("\_") == -1:
						cell['content'] = cell['content'].rstrip() .replace("_", italic, 1)
						if italic == "<i>":
							italic = "</i>"
						else:
							italic = "<i>"
					while cell['content'].find("\_") != -1:
						cell['content'] = cell['content'].rstrip().replace("\_", "_", 1)

	# Correct newlines characters
	for row in header_rows:
		for cell in row:
			cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
	for row in data_rows:
		for cell in row:
			while cell['content'].find("**") != -1:
				cell['content'] = cell['content'].replace("**", bold, 1)
				if bold == "<strong>":
					bold = "</strong>"
				else:
					bold = "<strong>"
			cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None

	# Checking that the grid is correct Not too much tested - need to take into account rowspan of previous rows

	forward_rowspan = []
	for row_index in range(len(header_rows)):
		if len(forward_rowspan) == 0:
@@ -701,12 +822,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
				forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1
		if not sum == number_of_columns:
			raise ValueError("Grid table not converted properly")
	#if has_header:
	#	table_with_spans = header_rows

	#table_with_spans += data_rows

	#return table_with_spans
	return header_rows, data_rows

def generate_html_table_with_spans(pandoc_table):
@@ -733,9 +849,23 @@ def generate_html_table_with_spans(pandoc_table):
				if cell['rowspan'] == 0 or cell['colspan'] == 0:
					continue
				else:
					# Prepare content, in case there's a list
					print(cell['content'])
					if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>",
											 cell['content']):  # Update cell in new row
						#print("MATCHING")
						list = "<ul>"
						# Build list the matches
						for match in matches:
							list += "<li>" + match[1] + "</li>"
						list += "</ul>"
						cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
					#else:
					#	print("NOT MATCHING")

					rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
					colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
					html += f"            <td{rowspan}{colspan}>{cell['content']}</td>\n"
					html += f"            <th{rowspan}{colspan} {cell['alignment']}>{cell['content']}</th>\n"
			html += "        </tr>\n"
		html += "    </thead>\n"

@@ -746,9 +876,21 @@ def generate_html_table_with_spans(pandoc_table):
			if cell['rowspan'] == 0 or cell['colspan'] == 0:
				continue
			else:
				#Prepare content, in case there's a list
				#print(cell['content'])
				if matches := re.findall(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>", cell['content']):  # Update cell in new row
					#print("MATCHING")
					list = "<ul>"
					# Build list the matches
					for match in matches:
						list += "<li>" + match[1] + "</li>"
					list += "</ul>"
					cell['content'] = re.sub(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
				#else:
					#print("NOT MATCHING")
				rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
				colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
				html += f"            <td{rowspan}{colspan}>{cell['content']}</td>\n"
				html += f"            <td{rowspan}{colspan} {cell['alignment']}>{cell['content']}</td>\n"
		html += "        </tr>\n"

	html += "    </tbody>\n"