diff --git a/gitlabFilter/README.md b/gitlabFilter/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1357e35eb1cd5529eb4fd036f814d198f9108ffd --- /dev/null +++ b/gitlabFilter/README.md @@ -0,0 +1,27 @@ +# Grid Table filter + +The grid_table_filter.rb file is a Ruby script that defines a filter for converting Pandoc-style grid tables into HTML tables with support for rowspan and colspan. Here is a summary of the key components and functionality. + +## Module and Class Definitions + +The script is encapsulated within the Banzai::Filter module. +The main class is GridTableFilter, which inherits from HTML::Pipeline::TextFilter. + +- Regex Constants: Several regex constants are defined to match different parts of the grid table structure, such as separators and body lines. + +- Helper Classes: + - Cell: Represents a cell in the table with attributes like content, rowspan, colspan, alignment, etc. + - Row: Represents a row in the table, containing an array of Cell objects. + - RowTracker: Tracks the number of rows for each column to manage rowspan. + +- Helper Methods: + - separator?: Checks if a line is a separator. + - handling_content: Processes the content of a cell, handling lists and newlines. + - adjust_colspan: Adjusts the colspan of cells based on the delimiter positions. + +- Main Methods: + - parse_pandoc_table_with_spans: Parses the Pandoc table, identifies headers, and processes rows to create a structured representation of the table. + - generate_html_table_with_spans: Converts the parsed table structure into an HTML table. + - call: The main entry point for the filter, which applies the regex to find grid tables and converts them to HTML. + +- Error Handling: The script includes error handling to manage invalid table formats and log errors. 
# frozen_string_literal: true

#
# File: gitlabFilter/grid_table_filter.rb
#
# GridTableFilter.rb
#
# (c) 2025 by Miguel Angel Reina Ortega & Andreas Kraft
# License: BSD 3-Clause License. See the LICENSE file for further details.
#

# TODO: This is now a legacy filter, and is only used with the Ruby parser.
# The current markdown parser now properly handles grid table blocks.
# issue: https://gitlab.com/gitlab-org/gitlab/-/issues/460864
#
# Converts Pandoc-style grid tables to HTML tables with rowspan and colspan support
#

require 'logger'

module Banzai
  module Filter
    # Text filter that finds Pandoc-style grid tables in the incoming markdown
    # text and replaces each one with an HTML table (rowspan/colspan aware).
    class GridTableFilter < HTML::Pipeline::TextFilter
      # Matches one whole grid table block; the block text is exposed as the
      # named group `code`, which is what #call reads.
      # rubocop:disable Lint/MixedRegexpCaptureTypes -- PoC
      MARKDOWN_GRID_TABLE_BLOCK_REGEX = %r{
        (?<code>
          # Grid table blocks:
          # +---+---+---+---+
          # Anything, starting with | blocks which are ignored by this filter
          # +---+---+---+---+

          ^\s*\+(-+\+)+$\n  # First separator line
          (?:^\s*[|+][^\n]*$\n)*
          ^\s*\+(-+\+)+$ # Last separator line

        )
      }mx
      # rubocop:enable Lint/MixedRegexpCaptureTypes

      # Full-width separator line (`+---+:==:+`), header separator line (only
      # `=` / `:` between the `+`), partial separator inside a row block, and a
      # single "part" of a partial separator that is itself a separator segment.
      GRID_TABLE_SEPARATOR = /^\s*\+([-:=]+\+)+\s*$/
      GRID_TABLE_HEADER_SEPARATOR = /^\s*\+([=:]+\+)+\s*$/
      GRID_TABLE_BODY_SEPARATOR = /[^\n]*\+([:-]+\+)+[^\n]*$/
      GRID_TABLE_BODY_SEPARATOR_LINE = /^[-:]+$/

      # Marker appended after every list item collected in a cell so that the
      # HTML generation step can tell where each item ends.
      NEXT_ELEMENT_LIST_MARK = "∆"

      # One marked list item: group 1 is the bullet/number, group 2 the item text.
      LIST_ITEM_REGEX =
        /\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+?)#{NEXT_ELEMENT_LIST_MARK}\n?/
      # A run of consecutive marked list items (replaced as a whole by the HTML list).
      LIST_BLOCK_REGEX =
        /(\s*([-*+]|\d+\.)\s+([^#{NEXT_ELEMENT_LIST_MARK}]+#{NEXT_ELEMENT_LIST_MARK}\n?))+/

      # A table cell. `rowspan`/`colspan` stay 0 until content is assigned;
      # cells with a 0 span are skipped when the HTML is generated.
      class Cell
        attr_accessor :content, :rowspan, :colspan, :colspan_adjusted, :alignment, :position_start, :position,
          :list_flag

        def initialize
          @content = nil
          @rowspan = 0
          @colspan = 0
          @colspan_adjusted = false
          # Bare keyword, like every other value assigned to `alignment`
          # ('left' / 'right' / 'center'); the original default was the
          # inconsistent string 'align="center"'.
          @alignment = 'center'
          @position_start = nil
          @position = nil
          @list_flag = false
        end

        # Picks this cell's alignment from the header column it starts in.
        #
        # @param header_delimiter_positions [Array<Integer>] `+` positions of the header separator
        # @param default_alignments [Array<String>] alignment keyword per header column
        # @raise [RuntimeError] when the cell position is unset or lies beyond the last header column
        def calculate_and_set_alignment(header_delimiter_positions, default_alignments)
          raise "Cell position must be set before calculating alignment" if @position.nil? || @position_start.nil?

          # Advance to the first header column whose closing delimiter is not
          # left of this cell's start.
          header_delimiter_index = 0
          while header_delimiter_index < default_alignments.length &&
              @position_start > header_delimiter_positions[header_delimiter_index]
            header_delimiter_index += 1
          end

          raise "Invalid table formatting" unless header_delimiter_index < default_alignments.length

          @alignment = default_alignments[header_delimiter_index]
        end
      end

      # A table row: a fixed-size array of Cell objects with index access.
      class Row
        attr_accessor :cells

        def initialize(length = 1)
          @cells = Array.new(length) { Cell.new }
        end

        def [](index)
          @cells[index]
        end

        def []=(index, value)
          @cells[index] = value
        end
      end

      # Per-column counter holding the index (within the current block) of the
      # row that each column is currently writing into.
      class RowTracker
        attr_accessor :row_tracker

        def initialize(items)
          @row_tracker = Array.new(items, 0)
        end

        def [](index)
          @row_tracker[index]
        end

        def []=(index, value)
          @row_tracker[index] = value
        end

        def max_value
          @row_tracker.max
        end
      end

      # Helper method to detect separator lines
      def separator?(line)
        GRID_TABLE_SEPARATOR.match?(line)
      end

      # Appends one physical line of text to a cell, tracking list state.
      #
      # A trailing backslash (plus trailing whitespace) is turned into a real
      # newline in every branch. The original list branches used the
      # single-quoted '\n', which inserts a literal backslash followed by "n"
      # instead of the newline the non-list branches insert.
      #
      # @param cell [Cell] cell to update in place
      # @param content [String] raw text of the cell on the current line
      # @return [Cell] the same cell
      # rubocop:disable Metrics/PerceivedComplexity -- PoC
      def handling_content(cell, content)
        modified_content = content.strip.gsub(/\\\s*$/, "\n")

        if cell.content.nil?
          # First line of this cell: it now occupies one row and one column.
          cell.rowspan += 1
          cell.colspan += 1
          if modified_content.start_with?("- ") # List
            cell.list_flag = true
            # Add list element end mark to know when the list element ends
            cell.content = "#{modified_content}#{NEXT_ELEMENT_LIST_MARK}"
          elsif cell.list_flag && !modified_content.empty?
            cell.content = "#{modified_content}#{NEXT_ELEMENT_LIST_MARK}" # add the list element end mark
          elsif modified_content.empty?
            cell.content = "\n"
          else
            cell.content = modified_content
          end
        elsif modified_content.start_with?("- ")
          # New list item; separate it from preceding plain text.
          cell.content += "\n" unless cell.list_flag
          cell.list_flag = true
          cell.content += "#{modified_content}#{NEXT_ELEMENT_LIST_MARK}"
        elsif cell.list_flag && !modified_content.empty?
          # Continuation of the current list item: move the end mark to the new end.
          cell.content = cell.content.strip.chomp(NEXT_ELEMENT_LIST_MARK.to_s)
          cell.content += " #{modified_content}#{NEXT_ELEMENT_LIST_MARK}"
        elsif modified_content.empty?
          # Blank line: ends a list (if any) and guarantees a trailing newline.
          if cell.list_flag
            cell.list_flag = false
            cell.content += "\n\n"
          end

          cell.content += cell.content.end_with?("\n") ? "" : "\n"
        else
          cell.content += " #{modified_content}"
        end

        cell
      end
      # rubocop:enable Metrics/PerceivedComplexity

      # Widens `row[column_index].colspan` according to where the cell's closing
      # delimiter sits on `line` relative to the table's column delimiters.
      #
      # @param row [Row] row containing the cell
      # @param column_index [Integer] index of the cell inside the row
      # @param number_of_parts [Integer] exclusive upper bound of the columns to inspect
      # @param line [String] the physical line being processed
      # @param number_of_columns [Integer] total number of table columns
      # @param delimiter_positions [Array<Integer>] `+` positions of the widest separator
      # @return [Cell] the adjusted cell
      # @raise [RuntimeError] "Wrong cell formatting" when the closing delimiter
      #   falls before the expected column delimiter
      def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions)
        cell = row[column_index]
        return cell if column_index >= number_of_parts

        # Start searching right after the closest preceding cell that has a
        # position (or from the line start when there is none).
        delimiter_start = nil
        col_i = column_index
        until delimiter_start
          delimiter_start = col_i > 0 ? row[col_i - 1].position : 0
          col_i -= 1
        end

        # First `|` or `+` after the start; none of the inputs change inside
        # the loop below, so this is computed once.
        position = line.index(/[|+]/, delimiter_start + 1)

        (column_index...number_of_parts).each do |j|
          break unless position && position != delimiter_positions[j]
          raise "Wrong cell formatting" if position < delimiter_positions[j]

          cell.colspan += 1

          # Cell reaches the last delimiter: it spans all remaining columns.
          if position == delimiter_positions[-1]
            colspan_allocated = cell.colspan
            cell.colspan += number_of_columns - colspan_allocated - column_index
          end
        end

        cell
      end

      # Checks that a line's delimiters line up with the table's column grid.
      #
      # @param line [String] a table line
      # @param delimiter_positions [Array<Integer>] expected delimiter positions
      # @return [Boolean] true when the line starts with `|` or `+`, every later
      #   `|`/`+` sits on an expected position, and the last expected position is present
      def check_delimiter_alignment(line, delimiter_positions)
        return false if line.empty? || delimiter_positions.empty?

        # Positions of every | or + except the one at index 0 (checked separately below).
        current_positions = line.each_char.with_index.filter_map do |char, index|
          index if index > 0 && (char == '|' || char == '+')
        end

        current_positions.include?(delimiter_positions[-1]) &&
          line.match?(/\A[|+]/) &&
          current_positions.all? { |pos| delimiter_positions.include?(pos) }
      end

      # Parses a grid table into header and body rows.
      #
      # @param pandoc_table [String] text of one grid table
      # @return [Array(Array<Array<Cell>>, Array<Array<Cell>>)] `[header_rows, data_rows]`
      # @raise [RuntimeError] on missing separators, misaligned delimiters, too
      #   many cells, or a grid whose spans do not add up to the column count
      # rubocop:disable Metrics/AbcSize -- PoC
      # rubocop:disable Metrics/CyclomaticComplexity -- PoC
      # rubocop:disable Metrics/PerceivedComplexity -- PoC
      def parse_pandoc_table_with_spans(pandoc_table)
        # Split the input into lines
        lines = pandoc_table.rstrip.split("\n").map(&:rstrip)

        # Retrieve separator indices
        separator_indices = lines.each_index.select { |i| separator?(lines[i]) }

        raise "No valid separators found in the provided Pandoc table." if separator_indices.empty?

        # Calculate max number of columns and delimiter positions
        delimiter_positions = []
        number_of_columns = separator_indices.map { |i| lines[i].count("+") - 1 }.max

        # Determine delimiter positions from the first separator with the most columns
        separator_index_max_columns = separator_indices.find { |i| lines[i].count("+") - 1 == number_of_columns }
        number_of_columns.times do |j|
          start_pos = j == 0 ? 0 : delimiter_positions[j - 1]
          pos = lines[separator_index_max_columns][start_pos + 1..]&.index("+")
          delimiter_positions << (pos ? pos + start_pos + 1 : -1)
        end

        # Process header
        header_delimiter_positions = []
        default_alignments = []
        header_rows = []

        # The first header separator (=== line), if any, defines the alignments
        header_separator_index = separator_indices.find { |index| GRID_TABLE_HEADER_SEPARATOR.match?(lines[index]) }
        has_header = !header_separator_index.nil?

        alignment_line =
          if has_header
            lines[header_separator_index]
          else
            # Set default alignments from the first separator which takes the role of header
            lines.find { |l| !l.strip.empty? } # first non-blank line
          end
        collect_column_alignments(alignment_line, default_alignments, header_delimiter_positions)

        # Check end table delimiter alignment (not checked during the lines processing)
        unless check_delimiter_alignment(lines[-1], delimiter_positions)
          raise "Misaligned delimiters in table separators: #{lines[-1]}"
        end

        # Process table body (including rows belonging to header as they are processed in the same way)
        data_rows = []

        # Declared at method level on purpose: the parts of the most recent
        # (partial) separator must survive across the per-line block
        # invocations below, because plain data rows compare against them.
        parts = nil

        (separator_indices.length - 1).times do |row|
          rows = []
          rows_tracker = nil
          in_data_row = false
          start = separator_indices[row]
          end_idx = separator_indices[row + 1]
          row_lines = lines[start...end_idx]

          next if row_lines.empty?

          row_lines.each do |line|
            line = line.rstrip
            # First line (normally a separator) of each block
            if separator?(line) && !in_data_row
              in_data_row = true
              unless check_delimiter_alignment(line, delimiter_positions)
                raise "Misaligned delimiters in separator row: #{line}"
              end

              parts = line.strip.delete_prefix("+").split("+")
              delimiter_index = 0
              rows << Row.new(number_of_columns)
              rows_tracker = RowTracker.new(number_of_columns)

              cell_index = 0
              parts.each do |part|
                next unless cell_index < number_of_columns

                delimiter_index += part.length + 1
                rows[-1][cell_index].position_start = delimiter_index - part.length
                rows[-1][cell_index].position = delimiter_index
                rows[-1][cell_index].calculate_and_set_alignment(header_delimiter_positions, default_alignments)

                # Skip the columns covered by this (possibly merged) part
                cell_index += 1 while delimiter_index > delimiter_positions[cell_index]
                cell_index += 1
              end
            # Lines in a block
            elsif in_data_row
              # Regular data row or partial separator
              if GRID_TABLE_BODY_SEPARATOR.match?(line) # Partial separator
                unless check_delimiter_alignment(line, delimiter_positions)
                  raise "Misaligned delimiters in partial separator: #{line}"
                end

                parts = line.strip.gsub(/^(\+|\|)/, '').split(/[\|\+]/)

                rows << Row.new(number_of_columns)
                aux_delimiter_index = 0
                auxiliar_cell_index = 0

                parts.each do |part|
                  next unless auxiliar_cell_index < number_of_columns

                  aux_delimiter_index += part.length + 1
                  rows[-1][auxiliar_cell_index].position_start = aux_delimiter_index - part.length
                  rows[-1][auxiliar_cell_index].position = aux_delimiter_index
                  rows[-1][auxiliar_cell_index].calculate_and_set_alignment(header_delimiter_positions,
                    default_alignments)

                  auxiliar_cell_index += 1 while aux_delimiter_index > delimiter_positions[auxiliar_cell_index]

                  auxiliar_cell_index += 1
                end

                raise "More cells than columns found" unless parts.length <= number_of_columns

                column_index = 0
                max_row_tracker = rows_tracker.max_value

                parts.each do |content|
                  if GRID_TABLE_BODY_SEPARATOR_LINE.match?(content) # Separator - split row
                    # This column starts writing into the newly added row
                    rows_tracker[column_index] = max_row_tracker + 1
                    cell = rows[rows_tracker[column_index]][column_index]
                    cell.list_flag = false

                    column_forward = (column_index...delimiter_positions.length).count do |del_index|
                      cell.position >= delimiter_positions[del_index]
                    end

                    column_index += column_forward
                  else # Regular cell in Partial separator line
                    # handling_content and adjust_colspan mutate the cell in place
                    cell = rows[rows_tracker[column_index]][column_index]
                    handling_content(cell, content)
                    cell.rowspan += 1

                    unless cell.colspan_adjusted
                      cell.colspan_adjusted = true
                      adjust_colspan(rows[rows_tracker[column_index]],
                        column_index,
                        number_of_columns,
                        line,
                        number_of_columns,
                        delimiter_positions)
                    end

                    if cell.position >= delimiter_positions[column_index]
                      column_index += (cell.colspan == 0 ? 1 : cell.colspan)
                    end
                  end
                end
              else # Data row
                cells_content = line.strip.delete_prefix("|").split("|")
                raise "Misaligned delimiters in row: #{line}" unless check_delimiter_alignment(line,
                  delimiter_positions)

                raise "Missing delimiters in previous separator line" if parts.length < cells_content.length

                column_index = 0

                if cells_content.length < number_of_columns
                  # Fewer cells than columns: some cells span several columns
                  cells_content.each do |content|
                    cell = rows[rows_tracker[column_index]][column_index]
                    handling_content(cell, content)

                    unless cell.colspan_adjusted
                      cell.colspan_adjusted = true
                      adjust_colspan(rows[rows_tracker[column_index]],
                        column_index,
                        number_of_columns,
                        line,
                        number_of_columns,
                        delimiter_positions)
                    end

                    column_index += cell.colspan if cell.position >= delimiter_positions[column_index]
                  end
                elsif cells_content.length == number_of_columns
                  cells_content.each_with_index do |content, i|
                    handling_content(rows[rows_tracker[i]][i], content)
                  end
                else
                  raise "More cells than columns found"
                end
              end
            else
              raise "No separator line found for row starting"
            end
          end

          # Blocks that start above the header separator belong to the header
          if has_header && start < header_separator_index
            rows.each { |header_row| header_rows << header_row.cells }
          else
            rows.each { |body_row| data_rows << body_row.cells }
          end

          raise "No valid rows found in the provided Pandoc table." if data_rows.empty? && header_rows.empty?
        end

        [header_rows, data_rows].each do |section_rows|
          # Convert newlines to HTML breaks.
          # NOTE(review): the break tag literal was lost in the reviewed copy
          # of this file; "<br />" is a reconstruction - confirm the exact form.
          section_rows.each do |row|
            row.each do |cell|
              next if cell.content.nil?

              cell.content = cell.content.gsub("\n", "<br />")
            end
          end

          # Validate grid correctness: on every row, the colspans plus the
          # columns still covered by a rowspan from above must equal the
          # number of columns.
          forward_rowspan = []

          section_rows.each do |row|
            forward_rowspan = Array.new(row.length, 0) if forward_rowspan.empty?
            sum = 0
            row_forward_rowspan = forward_rowspan.dup
            row.each_with_index do |cell, cell_index|
              sum += cell.colspan

              if cell.colspan == 0 && row_forward_rowspan[cell_index] > 0
                sum += 1
                forward_rowspan[cell_index] -= 1
              end

              next unless row_forward_rowspan[cell_index] == 0 && cell.rowspan > 1

              forward_rowspan[cell_index] = cell.rowspan - 1
              colspan = 1
              while cell.colspan > colspan
                forward_rowspan[cell_index + colspan] = cell.rowspan - 1
                colspan += 1
              end
            end

            raise "Grid table not converted properly" unless sum == number_of_columns
          end
        end

        [header_rows, data_rows]
      end

      # Converts one grid table to HTML.
      #
      # NOTE(review): the HTML tag literals were lost in the reviewed copy of
      # this file; the table/thead/tbody/tr/th/td/ul/li markup used here and in
      # the private helpers is a reconstruction - confirm against the deployed
      # filter.
      #
      # @param pandoc_table [String] text of one grid table
      # @return [String] the HTML table, or a plain-text error notice when the
      #   table cannot be parsed (the error is also logged to stdout)
      def generate_html_table_with_spans(pandoc_table)
        begin
          grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table)
        rescue StandardError => e
          Logger.new($stdout).error("Grid table could not be generated: #{e.message}")

          return "\n\nHTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOG FILE\n\n" \
                 "#{e.message}\n\nCommit ID: ce3607dbcafafe03531c1c50b3f749cc2318656c\n\n"
        end

        # A header section is emitted only when at least one header cell is visible
        has_header = grid_header.any? do |row|
          row.any? { |cell| cell.rowspan != 0 && cell.colspan != 0 }
        end

        html = +'<table>'
        html << "<thead>#{render_rows(grid_header, 'th')}</thead>" if has_header
        html << "<tbody>#{render_rows(grid_body, 'td')}</tbody>"
        html << '</table>'
        html
      end
      # rubocop:enable Metrics/PerceivedComplexity
      # rubocop:enable Metrics/CyclomaticComplexity
      # rubocop:enable Metrics/AbcSize

      # Filter entry point: replaces every grid table block in @text with the
      # HTML produced by #generate_html_table_with_spans. The text is returned
      # untouched when MarkdownFilter.glfm_markdown?(context) is false or when
      # no grid table block is present.
      def call
        return @text unless MarkdownFilter.glfm_markdown?(context)

        regex = MARKDOWN_GRID_TABLE_BLOCK_REGEX
        return @text unless regex.match?(@text)

        @text.gsub(regex) do
          match = Regexp.last_match
          # Extract the grid table content from the match
          grid_table = match[:code]
          # Convert the grid table, or keep the original text if none was captured
          grid_table ? generate_html_table_with_spans(grid_table) : match.to_s
        end
      end

      private

      # Reads a separator line and appends, for each of its columns, the
      # alignment keyword (`:--` left, `--:` right, anything else center) and
      # the position of the column's closing `+`.
      def collect_column_alignments(separator_line, default_alignments, header_delimiter_positions)
        parts = separator_line.strip.delete_prefix("+").split("+")

        parts.each_with_index do |part, part_index|
          default_alignments << if part.start_with?(":") && !part.end_with?(":")
                                  'left'
                                elsif !part.start_with?(":") && part.end_with?(":")
                                  'right'
                                else
                                  'center'
                                end

          start_pos = part_index == 0 ? 0 : header_delimiter_positions[part_index - 1]
          pos = separator_line[start_pos + 1..]&.index("+")
          header_delimiter_positions << (pos ? pos + start_pos + 1 : -1)
        end
      end

      # Renders rows of cells as table rows using `cell_tag` ('th' or 'td').
      # Cells whose rowspan or colspan is 0 are covered by another cell and skipped.
      def render_rows(rows, cell_tag)
        rows.map do |row|
          cells = row.filter_map do |cell|
            render_cell(cell, cell_tag) unless cell.rowspan == 0 || cell.colspan == 0
          end
          "<tr>#{cells.join}</tr>"
        end.join
      end

      # Renders one visible cell, including span attributes when greater than 1.
      def render_cell(cell, cell_tag)
        convert_list_items(cell)

        rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : ""
        colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : ""
        %(<#{cell_tag}#{rowspan}#{colspan} align="#{cell.alignment}">\n\n#{cell.content}\n\n</#{cell_tag}>)
      end

      # Replaces the marked list items collected in a cell with an HTML list
      # and forces left alignment. Does nothing for cells without list items:
      # String#scan returns an empty (truthy) array when nothing matches, so the
      # emptiness has to be tested explicitly.
      def convert_list_items(cell)
        items = cell.content&.scan(LIST_ITEM_REGEX)
        return if items.nil? || items.empty?

        list = "<ul>#{items.map { |item| "<li>#{item[1]}</li>" }.join}</ul>"
        # Block form so that backslashes in the item text are not treated as
        # back-references by gsub.
        cell.content = cell.content.gsub(LIST_BLOCK_REGEX) { list }
        # Enforce left alignment if cell contains a list
        cell.alignment = 'left'
      end
    end
  end
end

#
# File: gitlabFilter/plain_markdown_pipeline.rb
#
module Banzai
  module Pipeline
    # Pipeline whose filter list runs GridTableFilter right after IncludeFilter
    # and before the markdown filters.
    class PlainMarkdownPipeline < BasePipeline
      def self.filters
        FilterArray[
          Filter::IncludeFilter,
          Filter::GridTableFilter,
          Filter::MarkdownPreEscapeLegacyFilter,
          Filter::DollarMathPreLegacyFilter,
          Filter::BlockquoteFenceLegacyFilter,
          Filter::MarkdownFilter,
          Filter::ConvertTextToDocFilter,
          Filter::DollarMathPostLegacyFilter,
          Filter::MarkdownPostEscapeLegacyFilter
        ]
      end
    end
  end
end