Some cleanup + parsing converting lists in cells to html lists (d59cfbc2) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

toMkdocs/toMkdocs.py

+204 −62

Original line number	Diff line number	Diff line
		@@ -417,12 +417,13 @@ _matchCodefenceStart = re.compile(r'\s```\s?.', re.IGNORECASE)
		_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
		_matchNote = re.compile(r'^\s>\s', re.IGNORECASE)
		_matchStandAloneImage = re.compile(r'^\s!\[[^\]]\]$([^)])$\s', re.IGNORECASE)
		_matchTable = re.compile(r'^\s\\|.\\|\s$', re.IGNORECASE)
		_matchTable = re.compile(r'^\s\\|.\\|\s*$', re.IGNORECASE)
		_matchTableSeparator = re.compile(r'^\s\\|([-: ]+\\|)+\s$', re.IGNORECASE)
		_matchGridTable = re.compile(r'^\s\+-.\+\s$', re.IGNORECASE)
		_matchGridTableBodySeparator = re.compile(r'.\+([-:]+\+)+.$', re.IGNORECASE)
		_matchGridTableBodySeparator = re.compile(r'.\+([:-]+\+)+.$', re.IGNORECASE)
		_matchGridTableHeaderSeparator = re.compile(r'.\+([=:]+\+)+.$', re.IGNORECASE)
		_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
		_matchListInContent = re.compile(r'^(?:\s(P<marker>[-+]\|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE)
		_markdownLink = re.compile(r'[^!]\[[^\]]\]$(#[^)])$', re.IGNORECASE)
		_htmlLink = re.compile(r'<a\s+href="([^"\'])">[^<]</a>', re.IGNORECASE)
		_htmlAnchorLink = re.compile(r'<a\s+name="([^"])">[^<]</a>', re.IGNORECASE)
		@@ -466,6 +467,42 @@ def parse_pandoc_table_with_spans(pandoc_table):
		_matchGridTableSeparator = re.compile(r'\s\+([-:=]+\+)+\s$', re.IGNORECASE)
		return _matchGridTableSeparator.match(line)

		def handling_content(cell, content, list_flag):
		    """Append one physical line of a grid-table cell to the cell's content.
		
		    List items (lines starting with "- ") are terminated with a newline so
		    the end of each item can be recognised later. While a list is open,
		    non-blank lines are treated as continuations of the last item. A blank
		    line closes the list. Outside a list, a trailing backslash is turned
		    into a newline.
		
		    Args:
		        cell: Cell dictionary with at least the keys 'content', 'rowspan'
		            and 'colspan'. It is updated in place.
		        content: Raw text of the current line for this cell.
		        list_flag: True when the previous lines of this cell left a list
		            item open.
		
		    Returns:
		        A tuple (list_flag, cell) with the updated list state and the same
		        cell dictionary that was passed in.
		    """
		    text = content.strip()
		    is_list_item = text.startswith("- ")
		    if cell['content'] is None:
		        # First line seen for this cell: the cell now occupies one row/column
		        cell['rowspan'] += 1
		        cell['colspan'] += 1
		        if is_list_item:
		            list_flag = True
		            cell['content'] = text + "\n"  # Newline marks the end of the list element
		        elif text == "":
		            # Blank first line: keep the raw text so the cell is no longer None
		            list_flag = False
		            cell['content'] = content
		        elif list_flag:
		            # Nothing to append to yet, so assign instead of "+=" on None
		            cell['content'] = text + "\n"
		        else:
		            cell['content'] = re.sub(r'\\\s*$', "\n", text)
		    else:
		        if is_list_item:
		            if not list_flag:
		                # Separate the new list from the preceding text
		                cell['content'] += "\n"
		            list_flag = True
		            cell['content'] += text + "\n"  # Newline marks the end of the list element
		        elif text == "":
		            # Blank line: separation between a list and the next paragraph.
		            # Checked before the continuation branch so it can close the list.
		            list_flag = False
		            if not cell['content'].endswith("\n"):
		                cell['content'] += "\n"
		        elif list_flag:
		            # Continuation line: drop the end-of-item newline, append, re-add it
		            cell['content'] = cell['content'].rstrip("\n") + " " + text + "\n"
		        else:
		            cell['content'] += " " + re.sub(r'\\\s*$', "\n", text)
		    return list_flag, cell


		_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
		separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]

		@@ -490,11 +527,13 @@ def parse_pandoc_table_with_spans(pandoc_table):
		has_header = True
		header_separator_index = index
		header_rows = []

		data_rows = []
		for row in range(len(separator_indices) - 1):
		table_row = []
		auxiliar_row = []
		use_auxiliar_row = []
		list_flags = []
		has_merged_cells = False
		in_data_row = False
		start, end = separator_indices[row], separator_indices[row + 1]
		@@ -508,24 +547,38 @@ def parse_pandoc_table_with_spans(pandoc_table):
		parts = re.split(r"\s\+\s", line.strip("+"))
		# Add as many cells as columns with span attributes
		delimiter_index = 0
		# Determine the alignment of each cell. To replicate Pandoc's behaviour (alignment colons are not supported on separator lines, only on the header separator),
		# we need to assign the default alignment as defined in the header separator line.
		# We may not need the code below, as it supports alignment per cell and row.
		alignments = []
		for part_index in range(len(parts)):
		if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
		alignments.append("align=\"left\"")
		elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
		alignments.append("align=\"right\"")
		else:
		alignments.append("align=\"center\"")
		for i in range(number_of_columns_row):
		delimiter_index += len(parts[i]) + 1
		table_row.append({
		"content": "NOCONTENT",
		"content": None,
		"rowspan": 0,
		"colspan": 0,
		"colspan_adjusted": False,
		"alignment": alignments[i] if alignments[i] else "align=\"center\"",
		"position": delimiter_index # Position of cell delimiter +
		})
		for i in range(number_of_columns):
		auxiliar_row.append({
		"content": "NOCONTENT",
		"content": None,
		"rowspan": 0,
		"colspan": 0,
		"colspan_adjusted": False,
		"alignment": "align=\"center\"",
		"position": 0
		})
		use_auxiliar_row.append(False)
		list_flags.append(False)

		elif in_data_row:
		# Regular data row or partial separator
		@@ -536,14 +589,35 @@ def parse_pandoc_table_with_spans(pandoc_table):
		for i in range(len(cells)):
		if _matchGridTableSeparatorLine.match(cells[i]): # A new row is to be added
		use_auxiliar_row[i] = True
		if cells[i].startswith(":") and not cells[i].endswith(":"):
		auxiliar_row[i]['alignment'] = "align=\"left\""
		elif not cells[i].startswith(":") and cells[i].endswith(":"):
		auxiliar_row[i]['alignment'] = "align=\"right\""
		else:
		if table_row[i]['content'] == "NOCONTENT":
		table_row[i]['rowspan'] += 1
		table_row[i]['colspan'] += 1
		table_row[i]['content'] = cells[i]
		auxiliar_row[i]['alignment'] = "align=\"center\""
		else:
		table_row[i]['content'] += cells[i]
		# Cell which is not separator
		list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
		#if table_row[i]['content'] is None:
		# table_row[i]['rowspan'] += 1
		# table_row[i]['colspan'] += 1
		#if cells[i].strip().startswith("- "): # List
		# handling_list = True
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		#elif handling_list: # any other content when handling list is concatenated to the last list element
		# table_row[i]['content'].strip("\n")
		# table_row[i]['content'] += cells[i] + "\n"
		#elif cells[i].strip(): #separation between list and other paragraph
		# handling_list = False
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		#else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		#else:
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) # Cell which is not separator
		table_row[i]['rowspan'] += 1
		if not table_row[i]['colspan_adjusted']:
		table_row[i]['colspan_adjusted'] = True
		@@ -561,13 +635,30 @@ def parse_pandoc_table_with_spans(pandoc_table):
		for i in range(len(cells)):
		if _matchGridTableSeparatorLine.match(cells[i]): # Update cell in new row
		use_auxiliar_row[i] = True
		if cells[i].startswith(":") and not cells[i].endswith(":"):
		auxiliar_row[i]['alignment'] = "align=\"left\""
		elif not cells[i].startswith(":") and cells[i].endswith(":"):
		auxiliar_row[i]['alignment'] = "align=\"right\""
		else:
		if table_row[i]['content'] == "NOCONTENT":
		table_row[i]['rowspan'] += 1
		table_row[i]['colspan'] += 1
		table_row[i]['content'] = cells[i]
		auxiliar_row[i]['alignment'] = "align=\"center\""
		else:
		table_row[i]['content'] += cells[i]
		list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])

		# if table_row[i]['content'] is None:
		# table_row[i]['rowspan'] += 1
		# table_row[i]['colspan'] += 1
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
		# else:
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])

		# Cell which is not separator
		table_row[i]['rowspan'] += 1
		# Not needed, no colspan as number of cells is equal to number of columns
		@@ -588,40 +679,62 @@ def parse_pandoc_table_with_spans(pandoc_table):
		cells = re.split(r"\s\\|\s", line.strip("\|"))
		if len(cells) < number_of_columns: # Colspan: Positions of \| with respect to + need to be determined
		for i in range(len(cells)):
		if table_row[i]['content'] == "NOCONTENT":
		table_row[i]['rowspan'] += 1
		table_row[i]['colspan'] += 1
		table_row[i]['content'] = cells[i]
		else:
		table_row[i]['content'] += cells[i]
		list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
		# if table_row[i]['content'] is None:
		# table_row[i]['rowspan'] += 1
		# table_row[i]['colspan'] += 1
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		# else:
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		if not table_row[i]['colspan_adjusted']:
		table_row[i]['colspan_adjusted'] = True
		for j in range(i, len(cells)):
		delimiter_start = table_row[j-1]['position'] if j != 0 else 0
		if line.find("\|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased
		table_row[i]['colspan'] += 1
		if line.find("\|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns
		colspan_remaining = 0
		for cell_index in range(number_of_columns_row):
		colspan_remaining += table_row[cell_index]['colspan']
		table_row[i]['colspan'] += number_of_columns - colspan_remaining
		elif line.find("\|", delimiter_start+1) < delimiter_positions[j]:
		raise ValueError("Wrong cell formatting")
		else:

		break

		elif len(cells) == number_of_columns: # Simple row
		for i in range(len(cells)):
		if use_auxiliar_row[i]:
		if auxiliar_row[i]['content'] == "NOCONTENT":
		if auxiliar_row[i]['content'] is None:
		auxiliar_row[i]['rowspan'] += 1
		auxiliar_row[i]['colspan'] += 1
		auxiliar_row[i]['content'] = cells[i]
		else:
		auxiliar_row[i]['content'] += cells[i]
		auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		else:
		if table_row[i]['content'] == "NOCONTENT":
		table_row[i]['rowspan'] += 1
		table_row[i]['colspan'] += 1
		table_row[i]['content'] = cells[i]
		auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		else:
		table_row[i]['content'] += cells[i]
		list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
		# if table_row[i]['content'] is None:
		# table_row[i]['rowspan'] += 1
		# table_row[i]['colspan'] += 1
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		# else:
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
		else:
		raise ValueError("More cells than columns found")
		else:
		@@ -638,39 +751,47 @@ def parse_pandoc_table_with_spans(pandoc_table):

		#print(header_rows)
		#print(data_rows)
		# Correct newlines characters
		for row in header_rows:
		for cell in row:
		cell['content'] = cell['content'].replace("\\", "<br>")
		for row in data_rows:
		for cell in row:
		cell['content'] = cell['content'].replace("\\", "<br>")
		# Check if there are any data rows
		if not data_rows and not header_rows:
		raise ValueError("No valid rows found in the provided Pandoc table.")

		# Format text
		for rows in [header_rows, data_rows]:
		bold = "<strong>"
		for row in header_rows:
		italic = "<i>"
		for row in rows:
		for cell in row:
		while cell['content'].find("**") != -1:
		cell['content'] = cell['content'].replace("**", bold, 1)
		if cell['content'] is not None:
		# Replacing "<" by &lt;
		cell['content'] = cell['content'].replace("<", "<")

		#Bold
		for bold_characters in ["**", "__"]:
		while cell['content'].find(bold_characters) != -1:
		cell['content'] = cell['content'].replace(bold_characters, bold, 1)
		if bold == "<strong>":
		bold = "</strong>"
		else:
		bold = "<strong>"
		bold = "<strong>"
		#Italic
		while cell['content'].find("_") != -1 and cell['content'].find("\_") == -1:
		cell['content'] = cell['content'].rstrip() .replace("_", italic, 1)
		if italic == "<i>":
		italic = "</i>"
		else:
		italic = "<i>"
		while cell['content'].find("\_") != -1:
		cell['content'] = cell['content'].rstrip().replace("\_", "_", 1)

		# Correct newlines characters
		for row in header_rows:
		for cell in row:
		cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None
		for row in data_rows:
		for cell in row:
		while cell['content'].find("**") != -1:
		cell['content'] = cell['content'].replace("**", bold, 1)
		if bold == "<strong>":
		bold = "</strong>"
		else:
		bold = "<strong>"
		cell['content'] = cell['content'].replace("\n", "<br />") if cell['content'] is not None else None

		# Check that the grid is correct. Not thoroughly tested - needs to take into account the rowspan of previous rows

		forward_rowspan = []
		for row_index in range(len(header_rows)):
		if len(forward_rowspan) == 0:
		@@ -701,12 +822,7 @@ def parse_pandoc_table_with_spans(pandoc_table):
		forward_rowspan[cell_index] = data_rows[row_index][cell_index]['rowspan'] - 1
		if not sum == number_of_columns:
		raise ValueError("Grid table not converted properly")
		#if has_header:
		# table_with_spans = header_rows

		#table_with_spans += data_rows

		#return table_with_spans
		return header_rows, data_rows

		def generate_html_table_with_spans(pandoc_table):
		@@ -733,9 +849,23 @@ def generate_html_table_with_spans(pandoc_table):
		if cell['rowspan'] == 0 or cell['colspan'] == 0:
		continue
		else:
		# Prepare content, in case there's a list
		print(cell['content'])
		if matches := re.findall(r"\s([-+]\|\s*\d+\.)\s+([^<]+)<br \/>",
		cell['content']): # Update cell in new row
		#print("MATCHING")
		list = "<ul>"
		# Build list the matches
		for match in matches:
		list += "<li>" + match[1] + "</li>"
		list += "</ul>"
		cell['content'] = re.sub(r"(\s([-+]\|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
		#else:
		# print("NOT MATCHING")

		rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
		colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
		html += f" <td{rowspan}{colspan}>{cell['content']}</td>\n"
		html += f" <th{rowspan}{colspan} {cell['alignment']}>{cell['content']}</th>\n"
		html += " </tr>\n"
		html += " </thead>\n"

		@@ -746,9 +876,21 @@ def generate_html_table_with_spans(pandoc_table):
		if cell['rowspan'] == 0 or cell['colspan'] == 0:
		continue
		else:
		#Prepare content, in case there's a list
		#print(cell['content'])
		if matches := re.findall(r"\s([-+]\|\s*\d+\.)\s+([^<]+)<br \/>", cell['content']): # Update cell in new row
		#print("MATCHING")
		list = "<ul>"
		# Build list the matches
		for match in matches:
		list += "<li>" + match[1] + "</li>"
		list += "</ul>"
		cell['content'] = re.sub(r"(\s([-+]\|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
		#else:
		#print("NOT MATCHING")
		rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""
		colspan = f" colspan=\"{cell['colspan']}\"" if cell["colspan"] > 1 else ""
		html += f" <td{rowspan}{colspan}>{cell['content']}</td>\n"
		html += f" <td{rowspan}{colspan} {cell['alignment']}>{cell['content']}</td>\n"
		html += " </tr>\n"

		html += " </tbody>\n"

Admin message