Commit bc780760 authored by Miguel Angel Reina Ortega
Browse files

Support for grid tables and equations on mkdocs

parent fd0dfa13
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -197,12 +197,12 @@ pages:
     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2Fstylesheets%2Fextra%2Ecss/raw?ref=master" >> extra.css
    - mkdir -p docs/stylesheets && mv extra.css docs/stylesheets/
    - |
     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2Fmkdocs%2Eyml/raw?ref=master" >> mkdocs.yml
     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2Fmkdocs%2Eyml/raw?ref=gridtables" >> mkdocs.yml
    - |
     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2FindexDownload%2Emd/raw?ref=master" >> indexDownload.md
    - mkdir -p docs/download && mv indexDownload.md docs/download/index.md
    - |
     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2FtoMkdocs%2Epy/raw?ref=master" >> toMkdocs.py
     curl "${CI_API_V4_URL}/projects/$TOOLS_SCRIPTS_PROJECT_ID/repository/files/toMkdocs%2FtoMkdocs%2Epy/raw?ref=gridtables" >> toMkdocs.py
    - |
     export SPEC_NAME=$(ls | grep -E "(TS|TR|WI).*\.md" | cut -d'.' -f1)
    - |
+6 −0
Original line number Diff line number Diff line
@@ -60,6 +60,8 @@ markdown_extensions:
      pygments_lang_class: true
  - pymdownx.inlinehilite
  - pymdownx.snippets
  - pymdownx.arithmatex:
      generic: true
  - pymdownx.superfences:
      custom_fences:
        - name: mermaid
@@ -69,6 +71,10 @@ markdown_extensions:
     alternate_style: true
  - tables

# Additional scripts for the generated site, listed in load order.
# They accompany the pymdownx.arithmatex extension (generic: true) enabled
# under markdown_extensions, which leaves the actual math rendering to a
# client-side library.
# NOTE(review): javascripts/mathjax.js is presumably the local MathJax
# configuration and must exist under docs/javascripts/ - confirm that the
# CI job provides it.
extra_javascript:
  - javascripts/mathjax.js
  - https://unpkg.com/mathjax@3/es5/tex-mml-chtml.js

##############################################################################

extra:
+342 −4
Original line number Diff line number Diff line
@@ -11,6 +11,7 @@ from enum import Enum, auto
import argparse, re, os, shutil, hashlib, base64
from dataclasses import dataclass
from rich import print
from html import escape

verbose = False
veryVerbose = False
@@ -418,6 +419,9 @@ _matchNote = re.compile(r'^\s*>\s*', re.IGNORECASE)
_matchStandAloneImage = re.compile(r'^\s*!\[[^\]]*\]\(([^)]*)\)\s*', re.IGNORECASE)
_matchTable = re.compile(r'^\s*\|.*\|\s$', re.IGNORECASE)
_matchTableSeparator = re.compile(r'^\s*\|([-: ]+\|)+\s*$', re.IGNORECASE)
_matchGridTable = re.compile(r'^\s*\+-.*\+\s$', re.IGNORECASE)
_matchGridTableBodySeparator = re.compile(r'.*\+([-:]+\+)+.*$', re.IGNORECASE)
_matchGridTableHeaderSeparator = re.compile(r'.*\+([=:]+\+)+.*$', re.IGNORECASE)
_match2spaceListIndention = re.compile(r'^\s{2}-', re.IGNORECASE)
_markdownLink = re.compile(r'[^!]\[[^\]]*\]\((#[^)]*)\)', re.IGNORECASE)
_htmlLink = re.compile(r'<a\s+href="([^"\']*)">[^<]*</a>', re.IGNORECASE)
@@ -447,6 +451,309 @@ def shortHash(value:str, length:int) -> str:
				).digest()
			 ).decode()[:length]

def parse_pandoc_table_with_spans(pandoc_table):
	"""
	Parse a Pandoc-style grid table into rows of cells with rowspan and colspan metadata.

	Every cell is a dict with the keys ``content``, ``rowspan``, ``colspan``,
	``colspan_adjusted`` and ``position``. A cell whose ``rowspan`` or ``colspan``
	is still 0 after parsing never received content; callers are expected to
	skip such cells.

	:param pandoc_table: String of the Pandoc-style grid table.
	:return: Tuple ``(header_rows, data_rows)``. Each element is a list of rows and
		each row is a list of cell dicts. ``header_rows`` is empty when the table
		has no header separator (a separator line drawn with ``=``).
	:raises ValueError: If the table contains no separator line, if a row has more
		cells than the table has columns, if a cell delimiter is misplaced, if no
		row could be extracted, or if the spans of a row do not add up to the
		number of columns.
	"""
	# Full-width separator lines only. Partial separators (lines mixing "|" and
	# "+---+" that end a row for some columns only) are handled inside a row.
	full_separator = re.compile(r'\s*\+([-:=]+\+)+\s*$', re.IGNORECASE)
	# A single cell of a partial separator line that consists of dashes/colons only
	separator_cell = re.compile(r'[-:]+$', re.IGNORECASE)

	def is_separator(line):
		"""Return a match object if *line* is a full-width separator line."""
		return full_separator.match(line)

	def new_cell(position):
		"""Create an empty cell; *position* is the column of its closing ``+``."""
		return {
			"content": "NOCONTENT",
			"rowspan": 0,
			"colspan": 0,
			"colspan_adjusted": False,
			"position": position # Position of cell delimiter +
		}

	def add_content(cell, text):
		"""Store the first text line of a cell (which makes it span 1x1) or append a further one."""
		if cell['content'] == "NOCONTENT":
			cell['rowspan'] += 1
			cell['colspan'] += 1
			cell['content'] = text
		else:
			cell['content'] += text

	def apply_bold(rows):
		"""Replace ``**`` markers alternately with opening and closing ``<strong>`` tags."""
		# The open/close state deliberately carries over from cell to cell
		tag = "<strong>"
		for row in rows:
			for cell in row:
				while "**" in cell['content']:
					cell['content'] = cell['content'].replace("**", tag, 1)
					tag = "</strong>" if tag == "<strong>" else "<strong>"

	def check_grid(rows):
		"""Verify that the colspans of each row, plus cells covered by a rowspan from above, fill all columns."""
		# Not too much tested - need to take into account rowspan of previous rows
		forward_rowspan = []
		for row_index, row in enumerate(rows):
			if len(forward_rowspan) == 0:
				forward_rowspan = [0 for _ in range(len(row))]
			total = 0
			for cell_index, cell in enumerate(row):
				total += cell['colspan']
				if row_index > 0 and cell['colspan'] == 0:
					if forward_rowspan[cell_index] > 0:
						total += 1
					forward_rowspan[cell_index] -= 1
				if forward_rowspan[cell_index] == 0 and cell['rowspan'] > 1:
					forward_rowspan[cell_index] = cell['rowspan'] - 1
			if not total == number_of_columns:
				raise ValueError("Grid table not converted properly")

	# Split the input into lines
	lines = [line.strip() for line in pandoc_table.strip().split("\n")]

	separator_indices = [i for i, line in enumerate(lines) if is_separator(line)]
	if not separator_indices:
		raise ValueError("No valid separators found in the provided Pandoc table.")

	# The separator with the most "+" defines the number of columns and the
	# reference positions of the column delimiters.
	delimiter_positions = []
	number_of_columns = 0
	for separator_index in separator_indices:
		separator_line = lines[separator_index]
		columns = separator_line.count("+") - 1
		if columns > number_of_columns:
			number_of_columns = columns
			delimiter_positions = []
			search_from = 0
			for _ in range(number_of_columns):
				# str.find() yields -1 when there is no further "+"
				position = separator_line.find("+", search_from + 1)
				delimiter_positions.append(position)
				search_from = position

	# Locate the header separator. When several lines match, the last one wins.
	has_header = False
	header_separator_index = None
	for index in separator_indices:
		if _matchGridTableHeaderSeparator.match(lines[index]):
			has_header = True
			header_separator_index = index

	# Always bound, so that tables without a header can be processed as well
	header_rows = []
	data_rows = []

	for start, end in zip(separator_indices, separator_indices[1:]):
		table_row = []
		auxiliar_row = []		# Extra row that is created when a partial separator splits the row
		use_auxiliar_row = []	# Per column: True once content belongs to auxiliar_row
		has_merged_cells = False
		in_data_row = False
		# Lines between separators, including the starting separator line as it
		# gives information about the number of columns of the row
		row_lines = lines[start:end]

		# Combine multiline content into single strings for each cell
		for line in row_lines:
			if is_separator(line) and not in_data_row:
				number_of_columns_row = line.count("+") - 1
				in_data_row = True
				parts = re.split(r"\s*\+\s*", line.strip("+"))
				# Add as many cells as columns with span attributes
				delimiter_index = 0
				for i in range(number_of_columns_row):
					delimiter_index += len(parts[i]) + 1
					table_row.append(new_cell(delimiter_index))
				for _ in range(number_of_columns):
					auxiliar_row.append(new_cell(0))
					use_auxiliar_row.append(False)

			elif in_data_row:
				# Regular data row or partial separator
				if _matchGridTableBodySeparator.match(line): # Partial separator
					has_merged_cells = True
					cells = re.split(r"\s*[\|\+]\s*", line.strip("|").strip("+"))
					if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
						for i in range(len(cells)):
							if separator_cell.match(cells[i]):  # A new row is to be added
								use_auxiliar_row[i] = True
							else:
								add_content(table_row[i], cells[i])
								# Cell which is not separator
								table_row[i]['rowspan'] += 1
								if not table_row[i]['colspan_adjusted']:
									table_row[i]['colspan_adjusted'] = True
									for j in range(i, len(cells)):
										delimiter_start = table_row[j-1]['position'] if j != 0 else 0
										positions = [line.find(delimiter, delimiter_start + 1) for delimiter in "|+" if delimiter in line[delimiter_start + 1:]]
										position = min(positions) if positions else -1
										# Compare with the reference column positions (the list,
										# not the scalar left over from computing it)
										if position > delimiter_positions[j]: # Colspan to add
											table_row[i]['colspan'] += 1
										elif position < delimiter_positions[j]:
											raise ValueError("Wrong cell formatting")
										else:
											break
					elif len(cells) == number_of_columns: # Simple row with partial separator, a new row is to be added
						for i in range(len(cells)):
							if separator_cell.match(cells[i]):  # Update cell in new row
								use_auxiliar_row[i] = True
							else:
								add_content(table_row[i], cells[i])
								# Cell which is not separator. No colspan handling is needed
								# as the number of cells is equal to the number of columns.
								table_row[i]['rowspan'] += 1
					else:
						raise ValueError("More cells than columns found")
				else: # Data row
					cells = re.split(r"\s*\|\s*", line.strip("|"))
					if len(cells) < number_of_columns: # Colspan: Positions of | with respect to + need to be determined
						for i in range(len(cells)):
							add_content(table_row[i], cells[i])
							if not table_row[i]['colspan_adjusted']:
								table_row[i]['colspan_adjusted'] = True
								for j in range(i, len(cells)):
									delimiter_start = table_row[j-1]['position'] if j != 0 else 0
									position = line.find("|", delimiter_start + 1)
									if position > delimiter_positions[j]: # Colspan to be increased
										table_row[i]['colspan'] += 1
									elif position < delimiter_positions[j]:
										raise ValueError("Wrong cell formatting")
									else:
										break
					elif len(cells) == number_of_columns: # Simple row
						for i in range(len(cells)):
							if use_auxiliar_row[i]:
								add_content(auxiliar_row[i], cells[i])
							else:
								add_content(table_row[i], cells[i])
					else:
						raise ValueError("More cells than columns found")
			else:
				raise ValueError("No separator line found for row starting")

		# Rows above the header separator belong to the header. Everything else,
		# including every row of a table without header, is body data.
		if has_header and start < header_separator_index:
			target_rows = header_rows
		else:
			target_rows = data_rows
		target_rows.append(table_row)
		if has_merged_cells:
			target_rows.append(auxiliar_row)

	# Correct newlines characters: a backslash in a cell becomes a line break
	for row in header_rows + data_rows:
		for cell in row:
			cell['content'] = cell['content'].replace("\\", "<br>")

	# Check if there are any rows at all
	if not data_rows and not header_rows:
		raise ValueError("No valid rows found in the provided Pandoc table.")

	# Format text. Header and body are processed independently of each other.
	apply_bold(header_rows)
	apply_bold(data_rows)

	# Checking that the grid is correct
	check_grid(header_rows)
	check_grid(data_rows)

	return header_rows, data_rows

def generate_html_table_with_spans(pandoc_table):
	"""
	Convert a Pandoc-style grid table into an HTML table string.

	The table is parsed with parse_pandoc_table_with_spans(). Cells whose
	rowspan or colspan is 0 are not emitted. A ``<thead>`` section is only
	written when at least one header cell is emitted; the ``<tbody>`` section
	is always written.

	:param pandoc_table: String of the Pandoc-style grid table.
	:return: HTML string.
	"""
	header_rows, body_rows = parse_pandoc_table_with_spans(pandoc_table)

	def is_rendered(cell):
		"""A cell is rendered only when both of its spans are non-zero."""
		return not (cell['rowspan'] == 0 or cell['colspan'] == 0)

	def render_rows(rows):
		"""Return the HTML fragments of one ``<tr>`` element per row."""
		fragments = []
		for cells in rows:
			fragments.append("        <tr>\n")
			for cell in cells:
				if not is_rendered(cell):
					continue
				# Span attributes are only written when they differ from the default of 1
				attributes = ""
				if cell["rowspan"] > 1:
					attributes += f" rowspan=\"{cell['rowspan']}\""
				if cell["colspan"] > 1:
					attributes += f" colspan=\"{cell['colspan']}\""
				fragments.append(f"            <td{attributes}>{cell['content']}</td>\n")
			fragments.append("        </tr>\n")
		return fragments

	output = ["<table>\n"]

	if any(is_rendered(cell) for cells in header_rows for cell in cells):
		output.append("    <thead>\n")
		output.extend(render_rows(header_rows))
		output.append("    </thead>\n")

	output.append("    <tbody>\n")
	output.extend(render_rows(body_rows))
	output.append("    </tbody>\n")
	output.append("</table>")
	return "".join(output)

def analyseMarkdown(filename:str) -> Document:
	"""	Analyse the markdown file and split it into clauses.
@@ -473,6 +780,9 @@ def analyseMarkdown(filename:str) -> Document:
	inCodefence = False
	inTable = False
	tableHasSeparator = False
	inGridTable = False
	gridTableHasSeparator = False
	gridTable = ""
	for line in inLines:

		# Detect and handle codefences
@@ -493,7 +803,7 @@ def analyseMarkdown(filename:str) -> Document:
			continue

		# Detect and handle tables
		if _matchTable.match(line) and not inTable:
		if _matchTable.match(line) and not inTable and not inGridTable:
			inTable = True
			outClauses[-1].append(Line(line, LineType.TABLEHEADER))
			continue
@@ -512,6 +822,34 @@ def analyseMarkdown(filename:str) -> Document:
				outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW
				# continue with other matches

		#Detect grid tables and convert them to html table
		if _matchGridTable.match(line) and not inGridTable:
			inGridTable = True
			#outClauses[-1].append(Line(line, LineType.TABLEHEADER))
			gridTable += line
			continue
		if inGridTable:
			if _matchGridTableHeaderSeparator.match(line) or _matchGridTableBodySeparator.match(line):
				#outClauses[-1].append(Line(line, LineType.TABLESEPARATOR))
				gridTable += line
				continue
			elif _matchTable.match(line):
				#outClauses[-1].append(Line(line, LineType.TABLEROW))
				gridTable += line
				continue
			else:
				inGridTable = False
				# Mark the previous line as the last row in the table
				#outClauses[-1].lines[-1].lineType = LineType.TABLELASTROW
				print(gridTable)
				htmltable = ""
				htmltable = generate_html_table_with_spans(gridTable)
				print(htmltable)
				for row in htmltable:
					outClauses[-1].append(Line(row, LineType.TABLEROW))
				gridTable = ""
		# continue with other matches

		# Detect notes
		# Notes are lines that start with a '>'.
		if _matchNote.match(line):