More cleanup + cell alignment as defined in header separator line (Pandoc's behaviour) (2451610e) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

toMkdocs/toMkdocs.py

+88 −134

Original line number	Diff line number	Diff line
		@@ -422,6 +422,7 @@ _matchTableSeparator = re.compile(r'^\s\\|([-: ]+\\|)+\s$', re.IGNORECASE)
		_matchGridTable = re.compile(r'^\s\+-.\+\s$', re.IGNORECASE)
		_matchGridTableBodySeparator = re.compile(r'.\+([:-]+\+)+.$', re.IGNORECASE)
		_matchGridTableHeaderSeparator = re.compile(r'.\+([=:]+\+)+.$', re.IGNORECASE)
		_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
		_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
		_matchListInContent = re.compile(r'^(?:\s(P<marker>[-+]\|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE)
		_markdownLink = re.compile(r'[^!]\[[^\]]\]$(#[^)])$', re.IGNORECASE)
		@@ -502,8 +503,24 @@ def parse_pandoc_table_with_spans(pandoc_table):
		#print(cell['content'])
		return list_flag, cell

		def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions):
		    """Grow the colspan of ``row[column_index]`` to match the delimiters found in ``line``.

		    For each cell index ``j`` from ``column_index`` up to ``number_of_parts - 1``,
		    the first ``|`` or ``+`` after the previous cell's ``'position'`` is located in
		    ``line`` and compared with ``delimiter_positions[j]``.

		    Args:
		        row: List of cell dicts; the ``'position'`` and ``'colspan'`` keys are read,
		            and ``row[column_index]['colspan']`` is updated in place.
		        column_index: Index of the cell whose colspan is adjusted.
		        number_of_parts: Number of cells present on ``line``.
		        line: Text of the table line being processed.
		        number_of_columns: Total number of columns of the table.
		        delimiter_positions: Expected delimiter offsets, one per column
		            (presumably taken from the table's first separator line - confirm
		            against the caller).

		    Returns:
		        The (mutated) ``row[column_index]`` dict.

		    Raises:
		        ValueError: If a delimiter is found before the expected column boundary.
		    """
		    cell = row[column_index]
		    for j in range(column_index, number_of_parts):
		        delimiter_start = row[j - 1]['position'] if j != 0 else 0
		        search_from = delimiter_start + 1
		        # Only '|' and '+' delimit cells. A backslash must not be searched for:
		        # it can legitimately appear inside the cell content.
		        found = [line.find(delimiter, search_from) for delimiter in "|+"]
		        found = [offset for offset in found if offset != -1]
		        position = min(found) if found else -1
		        if position > delimiter_positions[j]:  # Colspan to be increased
		            # Always update the cell named by column_index; do not rely on a
		            # loop variable leaking in from the enclosing scope.
		            cell['colspan'] += 1
		            if position == delimiter_positions[-1]:
		                # Last cell in the row: absorb whatever is left so that the
		                # colspans of the row add up to number_of_columns.
		                colspan_allocated = sum(
		                    row[cell_index]['colspan'] for cell_index in range(number_of_parts)
		                )
		                cell['colspan'] += number_of_columns - colspan_allocated
		        elif position < delimiter_positions[j]:
		            raise ValueError("Wrong cell formatting")
		        else:
		            # Delimiter sits exactly on the column boundary: nothing more to span.
		            break
		    return cell

		_matchGridTableSeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
		separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]

		print(separator_indices)
		@@ -522,11 +539,26 @@ def parse_pandoc_table_with_spans(pandoc_table):
		del_positions = [lines[separator_index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[separator_index][delimiter_positions_start + 1:]]
		delimiter_positions.append(min(del_positions) if del_positions else -1)
		has_header = False
		header_delimiter_positions = []
		for index in separator_indices:
		if _matchGridTableHeaderSeparator.match(lines[index]):
		has_header = True
		header_separator_index = index
		header_rows = []
		parts = re.split(r"\s\+\s", lines[index].strip("+"))
		default_alignments = []
		#Calculate default alignments and positions of delimiters
		for part_index in range(len(parts)):
		if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
		default_alignments.append("align=\"left\"")
		elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
		default_alignments.append("align=\"right\"")
		else:
		default_alignments.append("align=\"center\"")
		# Delimiter position
		delimiter_positions_start = delimiter_positions[part_index - 1] if part_index != 0 else 0
		del_positions = [lines[index].find(delimiter, delimiter_positions_start + 1) for delimiter in "+" if delimiter in lines[index][delimiter_positions_start + 1:]]
		header_delimiter_positions.append(min(del_positions) if del_positions else -1)

		data_rows = []
		for row in range(len(separator_indices) - 1):
		@@ -550,14 +582,15 @@ def parse_pandoc_table_with_spans(pandoc_table):
		# Determine the alignment of the cell. To replicate Pandoc's behaviour (alignment colons are supported only on the header separator line, not on the other separator lines),
		# we need to assign the default alignment as defined in the header separator line.
		# We may not need the code below, as it supports alignment per cell and per row.
		alignments = []
		for part_index in range(len(parts)):
		if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
		alignments.append("align=\"left\"")
		elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
		alignments.append("align=\"right\"")
		else:
		alignments.append("align=\"center\"")
		#alignments = []
		#for part_index in range(len(parts)):
		# if parts[part_index].startswith(":") and not parts[part_index].endswith(":"):
		# alignments.append("align=\"left\"")
		# elif not parts[part_index].startswith(":") and parts[part_index].endswith(":"):
		# alignments.append("align=\"right\"")
		# else:
		# alignments.append("align=\"center\"")
		header_delimiter_index = 0
		for i in range(number_of_columns_row):
		delimiter_index += len(parts[i]) + 1
		table_row.append({
		@@ -565,9 +598,21 @@ def parse_pandoc_table_with_spans(pandoc_table):
		"rowspan": 0,
		"colspan": 0,
		"colspan_adjusted": False,
		"alignment": alignments[i] if alignments[i] else "align=\"center\"",
		"alignment": default_alignments[i] if i == 0 else "align=\"center\"",
		"position": delimiter_index # Position of cell delimiter +
		})
		#Set alignment as defined by header separator line
		while header_delimiter_index in range(len(default_alignments)) and table_row[i]['position'] > header_delimiter_positions[header_delimiter_index]:
		header_delimiter_index += 1
		if header_delimiter_index in range(len(default_alignments)):
		if table_row[i]['position'] < header_delimiter_positions[header_delimiter_index]:
		table_row[i]['alignment'] = default_alignments[header_delimiter_index]
		elif table_row[i]['position'] == header_delimiter_positions[header_delimiter_index]:
		table_row[i]['alignment'] = default_alignments[i]
		header_delimiter_index += 1
		else:
		raise ValueError("Invalid table formatting")

		for i in range(number_of_columns):
		auxiliar_row.append({
		"content": None,
		@@ -585,10 +630,11 @@ def parse_pandoc_table_with_spans(pandoc_table):
		if _matchGridTableBodySeparator.match(line): # Partial separator
		has_merged_cells = True
		cells = re.split(r"\s[\\|\+]\s", line.strip("\|").strip("+")) # (?<!\\)[\\|\+]
		if len(cells) < number_of_columns: # Colspan: Positions of \| with respect to + need to be determined
		if len(cells) <= number_of_columns: # Colspan: Positions of \| with respect to + need to be determined
		for i in range(len(cells)):
		if _matchGridTableSeparatorLine.match(cells[i]): # A new row is to be added
		if _matchGridTableBodySeparatorLine.match(cells[i]): # A new row is to be added
		use_auxiliar_row[i] = True
		list_flags[i] = False
		if cells[i].startswith(":") and not cells[i].endswith(":"):
		auxiliar_row[i]['alignment'] = "align=\"left\""
		elif not cells[i].startswith(":") and cells[i].endswith(":"):
		@@ -596,145 +642,49 @@ def parse_pandoc_table_with_spans(pandoc_table):
		else:
		auxiliar_row[i]['alignment'] = "align=\"center\""
		else:
		#Handle content of the cell
		list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
		#if table_row[i]['content'] is None:
		# table_row[i]['rowspan'] += 1
		# table_row[i]['colspan'] += 1
		#if cells[i].strip().startswith("- "): # List
		# handling_list = True
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		#elif handling_list: # any other content when handling list is concatenated to the last list element
		# table_row[i]['content'].strip("\n")
		# table_row[i]['content'] += cells[i] + "\n"
		#elif cells[i].strip(): #separation between list and other paragraph
		# handling_list = False
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		#else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		#else:
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i]) # Cell which is not separator
		# Cell which is not separator
		table_row[i]['rowspan'] += 1
		if not table_row[i]['colspan_adjusted']:
		table_row[i]['colspan_adjusted'] = True
		for j in range(i, len(cells)):
		delimiter_start = table_row[j-1]['position'] if j != 0 else 0
		positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "\|+" if delimiter in line[delimiter_start + 1:]]
		position = min(positions) if positions else -1
		if position > delimiter_positions_start[j]: # Colspan to add
		table_row[i]['colspan'] += 1
		elif position < delimiter_positions_start[j]:
		raise ValueError("Wrong cell formatting")
		else:
		break
		elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
		for i in range(len(cells)):
		if _matchGridTableSeparatorLine.match(cells[i]): # Update cell in new row
		use_auxiliar_row[i] = True
		if cells[i].startswith(":") and not cells[i].endswith(":"):
		auxiliar_row[i]['alignment'] = "align=\"left\""
		elif not cells[i].startswith(":") and cells[i].endswith(":"):
		auxiliar_row[i]['alignment'] = "align=\"right\""
		else:
		auxiliar_row[i]['alignment'] = "align=\"center\""
		else:
		list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])

		# if table_row[i]['content'] is None:
		# table_row[i]['rowspan'] += 1
		# table_row[i]['colspan'] += 1
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
		# TO BE CHECKED: most probably the code below is never executed, as the colspan should already have been adjusted when dealing with a partial separator
		table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
		#elif len(cells) == number_of_columns: # Simple row with partial separator, # A new row is to be added
		# for i in range(len(cells)):
		# if _matchGridTableBodySeparatorLine.match(cells[i]): # Update cell in new row
		# use_auxiliar_row[i] = True
		# list_flags[i] = False
		# if cells[i].startswith(":") and not cells[i].endswith(":"):
		# auxiliar_row[i]['alignment'] = "align=\"left\""
		# elif not cells[i].startswith(":") and cells[i].endswith(":"):
		# auxiliar_row[i]['alignment'] = "align=\"right\""
		# else:
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# auxiliar_row[i]['alignment'] = "align=\"center\""
		# else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])

		# Cell which is not separator
		table_row[i]['rowspan'] += 1
		# Not needed, no colspan as number of cells is equal to number of columns
		#for j in range(i, len(cells)):
		# delimiter_start = table_row[j-1]['position'] if j != 0 else 0
		# positions = [line.find(delimiter,delimiter_start+1) for delimiter in "\|+" if delimiter in line[delimiter_start+1:]]
		# position = min(positions) if positions else -1
		# if position > table_row[i]['position']: # Only colspan to be increased
		# table_row[i]['colspan'] += 1
		# elif position + 1 < table_row[i]['position']:
		# raise ValueError("Wrong cell formatting")
		# else:
		# break

		# #Handle content of the cell
		# list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
		# # Cell which is not separator
		# table_row[i]['rowspan'] += 1
		# # Adjusting of colspan not needed, no colspan as number of cells is equal to number of columns
		else:
		raise ValueError("More cells than columns found")
		else: # Data row
		cells = re.split(r"\s\\|\s", line.strip("\|"))
		if len(cells) < number_of_columns: # Colspan: Positions of \| with respect to + need to be determined
		for i in range(len(cells)):
		# Handle content of the cell
		list_flags[i], table_row[i] = handling_content(table_row[i], cells[i], list_flags[i])
		# if table_row[i]['content'] is None:
		# table_row[i]['rowspan'] += 1
		# table_row[i]['colspan'] += 1
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		# else:
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		if not table_row[i]['colspan_adjusted']:
		table_row[i]['colspan_adjusted'] = True
		for j in range(i, len(cells)):
		delimiter_start = table_row[j-1]['position'] if j != 0 else 0
		if line.find("\|", delimiter_start+1) > delimiter_positions[j]: # Colspan to be increased
		table_row[i]['colspan'] += 1
		if line.find("\|", delimiter_start + 1) == delimiter_positions[len(delimiter_positions) - 1]: # last cell in row, adjust colspan to get max number columns
		colspan_remaining = 0
		for cell_index in range(number_of_columns_row):
		colspan_remaining += table_row[cell_index]['colspan']
		table_row[i]['colspan'] += number_of_columns - colspan_remaining
		elif line.find("\|", delimiter_start+1) < delimiter_positions[j]:
		raise ValueError("Wrong cell formatting")
		else:
		break

		table_row[i] = adjust_colspan(table_row, i, len(cells), line, number_of_columns, delimiter_positions)
		elif len(cells) == number_of_columns: # Simple row
		for i in range(len(cells)):
		if use_auxiliar_row[i]:
		if auxiliar_row[i]['content'] is None:
		auxiliar_row[i]['rowspan'] += 1
		auxiliar_row[i]['colspan'] += 1
		auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		else:
		auxiliar_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		list_flags[i], auxiliar_row[i] = handling_content(auxiliar_row[i], cells[i],list_flags[i])
		else:
		# Handle content of the cell
		list_flags[i], table_row[i] = handling_content(table_row[i], cells[i],list_flags[i])
		# if table_row[i]['content'] is None:
		# table_row[i]['rowspan'] += 1
		# table_row[i]['colspan'] += 1
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] = re.sub(r'\\\s*$', "\n", cells[i])
		# else:
		# if cells[i].strip().startswith("- "): # List
		# print(cells[i])
		# table_row[i]['content'] += cells[i] + "\n" # Add newline to know when the list element ends
		# else:
		# table_row[i]['content'] += re.sub(r'\\\s*$', "\n", cells[i])
		else:
		raise ValueError("More cells than columns found")
		else:
		@@ -850,7 +800,7 @@ def generate_html_table_with_spans(pandoc_table):
		continue
		else:
		# Prepare content, in case there's a list
		print(cell['content'])
		#print(cell['content'])
		if matches := re.findall(r"\s([-+]\|\s*\d+\.)\s+([^<]+)<br \/>",
		cell['content']): # Update cell in new row
		#print("MATCHING")
		@@ -860,6 +810,8 @@ def generate_html_table_with_spans(pandoc_table):
		list += "<li>" + match[1] + "</li>"
		list += "</ul>"
		cell['content'] = re.sub(r"(\s([-+]\|\s*\d+\.)\s+[^<]+<br \/>)+", list, cell['content'])
		# Enforce left alignment if cell contains a list
		cell['alignment'] = "align=\"left\""
		#else:
		# print("NOT MATCHING")

		@@ -886,6 +838,8 @@ def generate_html_table_with_spans(pandoc_table):
		list += "<li>" + match[1] + "</li>"
		list += "</ul>"
		cell['content'] = re.sub(r"(\s([-+]\|\s*\d+\.)\s+[^<]+<br \/>)+",list, cell['content'])
		# Enforce left alignment if cell contains a list
		cell['alignment'] = "align=\"left\""
		#else:
		#print("NOT MATCHING")
		rowspan = f" rowspan=\"{cell['rowspan']}\"" if cell["rowspan"] > 1 else ""