Support for Pandoc grid tables and equations on mkdocs + fix generation of... (bdefcbf9) · Commits · Centre for Testing and Interoperability / Markdown specifications development / Specification tools

generateChangemarks/dockerfile.pandoc

+2 −3

Original line number	Diff line number	Diff line
		FROM pandoc/core:3.4.0-ubuntu
		FROM pandoc/latex:3.4.0-ubuntu

		RUN apt-get update -y && \
		apt-get install -y npm &&\
		npm install --global mermaid-filter &&\
		apt-get install -y texlive-latex-base texlive-latex-recommended texlive-fonts-recommended
		npm install --global mermaid-filter

		CMD ["/bin/sh"]

toMkdocs/mkdocs.yml

+6 −0

Original line number	Diff line number	Diff line
		@@ -60,6 +60,8 @@ markdown_extensions:
		pygments_lang_class: true
		- pymdownx.inlinehilite
		- pymdownx.snippets
		- pymdownx.arithmatex:
		generic: true
		- pymdownx.superfences:
		custom_fences:
		- name: mermaid
		@@ -69,6 +71,10 @@ markdown_extensions:
		alternate_style: true
		- tables

		extra_javascript:
		- javascripts/mathjax.js
		- https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js

		##############################################################################

		extra:

toMkdocs/toMkdocs.py

+475 −5

Original line number	Diff line number	Diff line
		@@ -7,6 +7,8 @@
		# directory structure.
		#
		from __future__ import annotations

		import logging
		from enum import Enum, auto
		import argparse, re, os, shutil, hashlib, base64
		from dataclasses import dataclass
		@@ -416,9 +418,15 @@ _matchCodefenceStart = re.compile(r'\s```\s?.', re.IGNORECASE)
		# Pre-compiled patterns used to classify markdown lines: code fences, notes,
		# stand-alone images, pipe tables, grid tables, list items and links.
		# NOTE(review): this listing appears to have lost `*` quantifiers and gained
		# extra backslashes in front of `|` (compare the two `_matchTable` lines) —
		# confirm every pattern against the repository before relying on it.
		_matchCodefenceEnd = re.compile(r'\s*```\s?', re.IGNORECASE)
		_matchNote = re.compile(r'^\s>\s', re.IGNORECASE)
		_matchStandAloneImage = re.compile(r'^\s!\[[^\]]\]$([^)])$\s', re.IGNORECASE)
		# NOTE(review): `_matchTable` is assigned twice in this listing; only the
		# second assignment is effective at runtime.
		_matchTable = re.compile(r'^\s\\|.\\|\s$', re.IGNORECASE)
		_matchTable = re.compile(r'^\s\\|.\\|\s*$', re.IGNORECASE)
		_matchTableSeparator = re.compile(r'^\s\\|([-: ]+\\|)+\s$', re.IGNORECASE)
		# Grid-table patterns. `_matchGridTableSeparator`, `_matchGridTableBodySeparator`,
		# `_matchGridTableHeaderSeparator` and `_matchGridTableBodySeparatorLine` are the
		# ones referenced by parse_pandoc_table_with_spans().
		_matchGridTable = re.compile(r'^\s\+-.\+\s$', re.IGNORECASE)
		_matchGridTableSeparator = re.compile(r'\s\+([-:=]+\+)+\s$', re.IGNORECASE)
		_matchGridTableBodySeparator = re.compile(r'.\+([:-]+\+)+.$', re.IGNORECASE)
		_matchGridTableHeaderSeparator = re.compile(r'.\+([=:]+\+)+.$', re.IGNORECASE)
		_matchGridTableBodySeparatorLine = re.compile(r'[-:]+$', re.IGNORECASE)
		# List and link detection.
		# NOTE(review): `(P<marker>` / `(P<content>` look like named groups that lost
		# their leading `?` — verify.
		_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
		_matchListInContent = re.compile(r'^(?:\s(P<marker>[-+]\|\s*\d+\.))\s+(P<content>.+)$', re.IGNORECASE)
		_markdownLink = re.compile(r'[^!]\[[^\]]\]$(#[^)])$', re.IGNORECASE)
		_htmlLink = re.compile(r'<a\s+href="([^"\'])">[^<]</a>', re.IGNORECASE)
		_htmlAnchorLink = re.compile(r'<a\s+name="([^"])">[^<]</a>', re.IGNORECASE)
		@@ -447,6 +455,437 @@ def shortHash(value:str, length:int) -> str:
		).digest()
		).decode()[:length]

		def parse_pandoc_table_with_spans(pandoc_table: str) -> tuple[list, list]:
			"""
			Parse a Pandoc-style grid table into header and body rows for HTML conversion.

			Every row is a list of ``Cell`` objects carrying the cell text (markdown
			emphasis already converted to HTML, newlines converted to ``<br />``), its
			``rowspan``/``colspan`` and its ``align="..."`` attribute. Cells whose
			``rowspan`` or ``colspan`` is 0 never received content of their own; the
			HTML generator skips them.

			:param pandoc_table: String of the Pandoc-style grid table.
			:return: Tuple ``(header_rows, data_rows)``.
			:raises ValueError: If no separator line is found, the table has no header
				separator line, a row has more cells than the table has columns, or the
				resulting grid is inconsistent.
			"""
			# Split the input into lines
			lines = [line.strip() for line in pandoc_table.strip().split("\n")]

			class Cell:
				""" A table cell together with its span and alignment metadata. """
				content: str
				rowspan: int
				colspan: int
				colspan_adjusted: bool
				alignment: str
				position: int
				list_flag: bool
				auxiliar_index: int

				def __init__(self):
					self.content = None				# None until the first content line is seen
					self.rowspan = 0
					self.colspan = 0
					self.colspan_adjusted = False
					self.alignment = 'align="center"'
					self.position = 0				# Position of the '+' delimiter that closes the cell
					self.list_flag = False			# True while the lines being collected belong to a list item
					self.auxiliar_index = None		# Index of the auxiliar row that receives further content

				def set_alignment(self):
					""" Take the alignment of the header column this cell's delimiter falls into. """
					column = 0
					while column < len(default_alignments) and self.position > header_delimiter_positions[column]:
						column += 1
					if column >= len(default_alignments):
						raise ValueError("Invalid table formatting")
					self.alignment = default_alignments[column]

			class Row():
				""" Represents a row of the grid table. """
				cells: list[Cell]

				def __init__(self, length: int = 1) -> None:
					self.cells = [Cell() for _ in range(length)]

				def __getitem__(self, item):
					return self.cells[item]

				def __setitem__(self, key, value):
					self.cells[key] = value

			# Detect full separator lines by pattern (partial separators are not taken into account)
			def is_separator(line):
				return _matchGridTableSeparator.match(line)

			def handling_content(cell, content):
				""" Append one line of text to the cell, keeping track of list items. """
				text = content.strip()
				if cell.content is None:
					# First line of the cell: it now occupies one row and one column.
					# An empty first line yields an empty string.
					cell.rowspan += 1
					cell.colspan += 1
					if text.startswith("- "):	# List
						cell.list_flag = True
						cell.content = text + "\n"	# Add newline to know when the list element ends
					else:
						cell.content = re.sub(r'\\\s*$', "\n", text)
				elif text.startswith("- "):	# List
					if not cell.list_flag:
						cell.content += "\n"
					cell.list_flag = True
					cell.content += text + "\n"	# Add newline to know when the list element ends
				elif cell.list_flag and text != "":	# Any other content while handling a list is concatenated to the last list element
					cell.content = cell.content.strip("\n")
					cell.content += " " + text + "\n"
				elif text == "":	# Separation between a list and another paragraph
					cell.list_flag = False
					cell.content += "\n" if not cell.content.endswith("\n") else ""
				else:
					cell.content += " " + re.sub(r'\\\s*$', "\n", text)
				return cell

			def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions):
				""" Widen the colspan of row[column_index] according to the delimiters found in line. """
				for j in range(column_index, number_of_parts):
					delimiter_start = row[j - 1].position if j != 0 else 0
					positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]]
					position = min(positions) if positions else -1
					if position > delimiter_positions[j]:	# Colspan to be increased
						row[column_index].colspan += 1
						if position == delimiter_positions[-1]:	# Last cell in row, adjust colspan to reach the max number of columns
							colspan_allocated = row[column_index].colspan
							row[column_index].colspan += number_of_columns - colspan_allocated - column_index
					elif position < delimiter_positions[j]:
						raise ValueError("Wrong cell formatting")
					else:
						break
				return row[column_index]

			def place_content(column, text, line, number_of_parts, adjust, partial):
				""" Store text in the current row, or in the auxiliar row opened for that column. """
				aux_index = table_row[column].auxiliar_index
				target = table_row if aux_index is None else auxiliar_rows[aux_index]
				handling_content(target[column], text)
				if partial and aux_index is None:
					# The cell continues next to a partial separator, so it spans one more row
					target[column].rowspan += 1
				if adjust and not target[column].colspan_adjusted:
					target[column].colspan_adjusted = True
					adjust_colspan(target, column, number_of_parts, line, number_of_columns, delimiter_positions)

			separator_indices = [index for index, line in enumerate(lines) if is_separator(line)]
			if not separator_indices:
				raise ValueError("No valid separators found in the provided Pandoc table.")

			# Calculate max number of columns and the '+' positions of the widest separator
			delimiter_positions = []
			number_of_columns = 0
			for separator_index in separator_indices:
				separator = lines[separator_index]
				if separator.count("+") - 1 > number_of_columns:
					number_of_columns = separator.count("+") - 1
					delimiter_positions = []
					for _ in range(number_of_columns):
						search_from = delimiter_positions[-1] if delimiter_positions else 0
						delimiter_positions.append(separator.find("+", search_from + 1))

			# Calculate default alignments and delimiter positions from the header separator
			has_header = False
			header_delimiter_positions = []
			for index in separator_indices:
				if _matchGridTableHeaderSeparator.match(lines[index]):
					has_header = True
					header_separator_index = index
					header_rows = []
					default_alignments = []
					for part_index, part in enumerate(re.split(r"\+", lines[index].strip("+"))):
						if part.startswith(":") and not part.endswith(":"):
							default_alignments.append('align="left"')
						elif not part.startswith(":") and part.endswith(":"):
							default_alignments.append('align="right"')
						else:
							default_alignments.append('align="center"')
						search_from = delimiter_positions[part_index - 1] if part_index != 0 else 0
						header_delimiter_positions.append(lines[index].find("+", search_from + 1))
			if not has_header:
				# Alignments and the header/body split both come from the header separator
				raise ValueError("Grid tables without a header separator line are not supported")

			data_rows = []
			for row_number in range(len(separator_indices) - 1):
				table_row = []
				auxiliar_rows = []
				has_merged_cells = False
				in_data_row = False
				# Lines between two separators, including the opening separator as it
				# gives the number of columns of the row
				start, end = separator_indices[row_number], separator_indices[row_number + 1]
				for line in lines[start:end]:
					if is_separator(line) and not in_data_row:
						number_of_columns_row = line.count("+") - 1
						in_data_row = True
						parts = re.split(r"\s*\+\s*", line.strip("+"))
						# Add as many cells as columns. As Pandoc does, alignment is taken
						# from the header separator line only.
						delimiter_index = 0
						table_row = Row(number_of_columns_row)
						for column in range(number_of_columns_row):
							delimiter_index += len(parts[column]) + 1
							table_row[column].position = delimiter_index	# Position of cell delimiter +
							table_row[column].set_alignment()
					elif in_data_row:
						if _matchGridTableBodySeparator.match(line):	# Partial separator
							has_merged_cells = True
							cells = re.split(r"[\|\+]", line.strip("|").strip("+"))
							# Add auxiliar row and set the delimiter of each of its cells
							auxiliar_row = Row(number_of_columns)
							auxiliar_rows.append(auxiliar_row)
							aux_delimiter_index = 0
							for auxiliar_cell, part in zip(auxiliar_row.cells, cells):
								aux_delimiter_index += len(part) + 1
								auxiliar_cell.position = aux_delimiter_index	# Position of cell delimiter +
								auxiliar_cell.set_alignment()
							if len(cells) > number_of_columns:
								raise ValueError("More cells than columns found")
							for column, part in enumerate(cells):
								if _matchGridTableBodySeparatorLine.match(part):	# A new row starts in this column
									auxiliar_row[column].list_flag = False
									table_row[column].auxiliar_index = len(auxiliar_rows) - 1
								else:
									place_content(column, part, line, len(cells), adjust=True, partial=True)
						else:	# Data row
							cells = re.split(r"\s*\|\s*", line.strip("|"))
							if len(cells) > number_of_columns:
								raise ValueError("More cells than columns found")
							# Fewer cells than columns means some cell spans several columns
							has_colspan = len(cells) < number_of_columns
							for column, part in enumerate(cells):
								place_content(column, part, line, len(cells), adjust=has_colspan, partial=False)
					else:
						raise ValueError("No separator line found for row starting")

				# Rows above the header separator belong to the header, the rest to the body
				row_group = data_rows if start >= header_separator_index else header_rows
				row_group.append(table_row.cells)
				if has_merged_cells:
					row_group.extend(auxiliar_row.cells for auxiliar_row in auxiliar_rows)

			# Check if there are any rows
			if not data_rows and not header_rows:
				raise ValueError("No valid rows found in the provided Pandoc table.")

			# Format text: convert markdown emphasis to HTML tags
			for rows in [header_rows, data_rows]:
				bold = "<strong>"
				italic = "<i>"
				for row in rows:
					for cell in row:
						if cell.content is None:
							continue
						# Bold: alternate between opening and closing tag
						for bold_characters in ["**", "__"]:
							while bold_characters in cell.content:
								cell.content = cell.content.replace(bold_characters, bold, 1)
								bold = "</strong>" if bold == "<strong>" else "<strong>"
						# Italic: only when the cell has no escaped underscores
						while "_" in cell.content and "\\_" not in cell.content:
							cell.content = cell.content.rstrip().replace("_", italic, 1)
							italic = "</i>" if italic == "<i>" else "<i>"
						# Unescape underscores
						while "\\_" in cell.content:
							cell.content = cell.content.rstrip().replace("\\_", "_", 1)

			# Correct newline characters
			for row in header_rows + data_rows:
				for cell in row:
					if cell.content is not None:
						cell.content = cell.content.replace("\n", "<br />")

			# Check that the grid is correct (not too much tested): every row, together
			# with the cells still covered by a rowspan of a previous row, has to fill
			# all the columns
			for rows in (header_rows, data_rows):
				forward_rowspan = []
				for row_index, row in enumerate(rows):
					if not forward_rowspan:
						forward_rowspan = [0 for _ in row]
					covered_columns = 0
					for cell_index, cell in enumerate(row):
						covered_columns += cell.colspan
						if row_index > 0 and cell.colspan == 0 and forward_rowspan[cell_index] > 0:
							covered_columns += 1
							forward_rowspan[cell_index] -= 1
						if forward_rowspan[cell_index] == 0 and cell.rowspan > 1:
							forward_rowspan[cell_index] = cell.rowspan - 1
					if covered_columns != number_of_columns:
						raise ValueError("Grid table not converted properly")

			return header_rows, data_rows

		# A single list item inside a cell (marker, text) and a run of consecutive items
		_gridCellListItem = re.compile(r"\s*([-*+]|\s*\d+\.)\s+([^<]+)<br \/>")
		_gridCellListRun = re.compile(r"(\s*([-*+]|\s*\d+\.)\s+[^<]+<br \/>)+")


		def _gridListToHtml(match):
			""" Build the <ul> element for one run of consecutive list items. """
			items = _gridCellListItem.findall(match.group(0))
			return "<ul>" + "".join(f"<li>{item[1]}</li>" for item in items) + "</ul>"


		def _renderGridRows(rows, tag):
			""" Render rows as <tr> elements whose cells use the given tag (th or td). """
			html = ""
			for row in rows:
				html += " <tr>\n"
				for cell in row:
					# Cells without span are covered by a neighbouring cell
					if cell.rowspan == 0 or cell.colspan == 0:
						continue
					if cell.content is None:
						cell.content = ""
					# Prepare content, in case there's a list
					if _gridCellListItem.search(cell.content):
						# A function is used as replacement so that backslashes in the
						# cell text are not interpreted as escapes
						cell.content = _gridCellListRun.sub(_gridListToHtml, cell.content)
						# Enforce left alignment if cell contains a list
						cell.alignment = "align=\"left\""
					rowspan = f" rowspan=\"{cell.rowspan}\"" if cell.rowspan > 1 else ""
					colspan = f" colspan=\"{cell.colspan}\"" if cell.colspan > 1 else ""
					html += f" <{tag}{rowspan}{colspan} {cell.alignment}>{cell.content}</{tag}>\n"
				html += " </tr>\n"
			return html


		def generate_html_table_with_spans(pandoc_table):
			"""
			Generate an HTML table from a Pandoc-style grid table with row and column spans.

			The conversion is best effort: when the table cannot be parsed the error is
			logged and a placeholder text is returned instead of raising.

			:param pandoc_table: String of the Pandoc-style grid table.
			:return: HTML string, or a placeholder message if the table could not be parsed.
			"""
			try:
				grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table)
			except Exception:
				# logging.ERROR is the level constant; logging.exception() logs the message with the traceback
				logging.exception("Grid table could not be generated")
				return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS"

			html = "<table>\n"
			# A header is only emitted if at least one header cell has content
			has_header = any(cell.rowspan != 0 and cell.colspan != 0 for row in grid_header for cell in row)
			if has_header:
				html += " <thead>\n"
				html += _renderGridRows(grid_header, "th")
				html += " </thead>\n"

			html += " <tbody>\n"
			html += _renderGridRows(grid_body, "td")
			html += " </tbody>\n"
			html += "</table>"
			return html

		def analyseMarkdown(filename:str) -> Document:
		""" Analyse the markdown file and split it into clauses.
		@@ -473,6 +912,9 @@ def analyseMarkdown(filename:str) -> Document:
		inCodefence = False
		inTable = False
		tableHasSeparator = False
		inGridTable = False
		gridTableHasSeparator = False
		gridTable = ""
		for line in inLines:

		# Detect and handle codefences
		@@ -493,7 +935,7 @@ def analyseMarkdown(filename:str) -> Document:
		continue

		# Detect and handle tables
		if _matchTable.match(line) and not inTable:
		if _matchTable.match(line) and not inTable and not inGridTable:
		inTable = True
		outClauses[-1].append(Line(line, LineType.TABLEHEADER))
		continue
		@@ -512,6 +954,34 @@ def analyseMarkdown(filename:str) -> Document:
		outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW
		# continue with other matches

		#Detect grid tables and convert them to html table
		if _matchGridTable.match(line) and not inGridTable:
		inGridTable = True
		#outClauses[-1].append(Line(line, LineType.TABLEHEADER))
		gridTable += line
		continue
		if inGridTable:
		if _matchGridTableHeaderSeparator.match(line) or _matchGridTableBodySeparator.match(line):
		#outClauses[-1].append(Line(line, LineType.TABLESEPARATOR))
		gridTable += line
		continue
		elif _matchTable.match(line):
		#outClauses[-1].append(Line(line, LineType.TABLEROW))
		gridTable += line
		continue
		else:
		inGridTable = False
		# Mark the previous line as the last row in the table
		#outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW
		print(gridTable)
		htmltable = ""
		htmltable = generate_html_table_with_spans(gridTable)
		print(htmltable)
		for row in htmltable:
		outClauses[-1].append(Line(row, LineType.TABLEROW))
		gridTable = ""
		# continue with other matches

		# Detect notes
		# Notes are lines that start with a '>'.
		if _matchNote.match(line):