Commit 3491470e authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

Gitlab grid table filter

parent a1b466b0
Loading
Loading
Loading
Loading

grid_table_filter.rb

0 → 100644
+522 −0
Original line number Diff line number Diff line
# frozen_string_literal: true

#
#	GridTableFilter.rb
#
#	(c) 2025 by Miguel Angel Reina Ortega & Andreas Kraft
#	License: BSD 3-Clause License. See the LICENSE file for further details.
#

# TODO: This is now a legacy filter, and is only used with the Ruby parser.
# The current markdown parser now properly handles grid table blocks.
# issue: https://gitlab.com/gitlab-org/gitlab/-/issues/460864
# GridTableFilter.rb
#
# Converts Pandoc-style grid tables to HTML tables with rowspan and colspan support
#

module Banzai
    module Filter
      class GridTableFilter < HTML::Pipeline::TextFilter
        # Matches one whole grid table block and captures it in the named group `code`.
        # A block runs from a line that starts with `+-` and ends with `+` up to the
        # nearest later line of the same shape (the rows in between are matched lazily).
        # Flags: `x` permits the whitespace and `#` comments inside the pattern,
        # `m` lets `.` match newlines as well.
        MARKDOWN_GRID_TABLE_BLOCK_REGEX = %r{
          (?<code>
            # Grid table blocks:
            # +---+---+---+---+
            # Anything, starting with | blocks which are ignored by this filter
            # +---+---+---+---+
  
            ^\s*\+-.*\+\s$          # First separator line
            (?:.*\n)*?              # Any number of rows (non-greedy)
            \s*\+-.*\+\s$           # Last separator line
          )
        }mx
  
        require 'logger'

        # One cell of a grid table row: its text plus span, alignment and position data.
        #
        # A new cell has no content, zero rowspan/colspan, centered alignment and no
        # recorded position; the parser fills these in while it reads the table.
        class Cell
          attr_accessor :content, :rowspan, :colspan, :colspan_adjusted, :alignment, :position, :list_flag

          def initialize
            @content = nil
            @rowspan = 0
            @colspan = 0
            @colspan_adjusted = false
            @alignment = 'align="center"'
            @position = nil
            @list_flag = false
          end

          # Takes the alignment of the first header column whose delimiter position is
          # at or to the right of this cell's position.
          #
          # @param default_alignments [Array<String>] alignment attribute per header column
          # @param header_delimiter_positions [Array<Integer>] delimiter position per header column
          # @return [String] the alignment that was assigned
          # @raise [RuntimeError] "Invalid table formatting" when the cell lies beyond
          #   every header delimiter (including when no header columns are given)
          def set_alignment(default_alignments, header_delimiter_positions)
            column = (0...default_alignments.length).find do |index|
              @position <= header_delimiter_positions[index]
            end
            raise "Invalid table formatting" if column.nil?

            @alignment = default_alignments[column]
          end
        end

        # A table row: an array of Cell objects addressed by column index.
        class Row
          attr_accessor :cells

          # Creates the row with `length` newly built cells (one when omitted).
          def initialize(length = 1)
            @cells = Array.new(length) { |_slot| Cell.new }
          end

          # Returns the cell stored at `index`.
          def [](index)
            cells[index]
          end

          # Stores `value` as the cell at `index`.
          def []=(index, value)
            cells[index] = value
          end
        end

        # Per-column integer counters, all starting at zero. The parser uses one
        # counter per column to remember which row currently receives that column's content.
        class RowTracker
          attr_accessor :row_tracker

          # Creates `items` counters initialised to 0.
          def initialize(items)
            @row_tracker = [0] * items
          end

          # Returns the counter for column `index`.
          def [](index)
            row_tracker[index]
          end

          # Sets the counter for column `index` to `value`.
          def []=(index, value)
            row_tracker[index] = value
          end
        end

        # Patterns used to classify the lines (and line fragments) of a grid table.
        #
        # GRID_TABLE_SEPARATOR: text ending in a full border made only of `+` joints and
        #   runs of `-`, `:` or `=` (e.g. `+---+:==:+`), optionally padded with whitespace.
        # GRID_TABLE_HEADER_SEPARATOR: a line containing `+` joints with runs of `=` or `:`
        #   between them (e.g. `+===+:==+`).
        # GRID_TABLE_BODY_SEPARATOR: a line containing `+` joints with runs of `-` or `:`
        #   between them; it may sit next to ordinary cell text on the same line.
        # GRID_TABLE_BODY_SEPARATOR_LINE: a fragment that ends in a run of `-` or `:`.
        GRID_TABLE_SEPARATOR = /\s*\+([-:=]+\+)+\s*$/
        GRID_TABLE_HEADER_SEPARATOR = /.*\+([=:]+\+)+.*$/
        GRID_TABLE_BODY_SEPARATOR = /.*\+([:-]+\+)+.*$/
        GRID_TABLE_BODY_SEPARATOR_LINE = /[-:]+$/

        # Parses the text of one Pandoc-style grid table into header and body rows.
        #
        # @param pandoc_table [String] the raw grid table text
        # @return [Array(Array, Array)] `[header_rows, data_rows]`; every row is the
        #   `cells` array of a Row, i.e. an Array of Cell objects with content, rowspan,
        #   colspan and alignment filled in
        # @raise [RuntimeError] when no separator line is found, when a row is malformed
        #   ("Wrong cell formatting", "More cells than columns found", ...), when no rows
        #   were collected, or when the final grid validation fails
        def parse_pandoc_table_with_spans(pandoc_table)
        # Split the input into lines; each line is stripped of surrounding whitespace
        lines = pandoc_table.strip.split("\n").map(&:strip)

        # NOTE(review): the three `def`s below are nested inside this method. In Ruby a
        # nested `def` defines a regular method on the enclosing class each time the
        # outer method runs; they are not local closures.

        # Helper method to detect separator lines
        def is_separator(line)
            GRID_TABLE_SEPARATOR.match?(line)
        end

        # Helper method to handle content in cells.
        # Appends one line of text to `cell` and returns the cell. Lines starting with
        # "- " switch the cell into list mode (`list_flag`); an empty line leaves list
        # mode; a trailing backslash is turned into a newline.
        def handling_content(cell, content)
            if cell.content.nil?
            # First text seen for this cell: it now occupies one row and one column.
            cell.rowspan += 1
            cell.colspan += 1
            if content.strip.start_with?("- ") # List
                cell.list_flag = true
                cell.content = "#{content.strip}\n"
            elsif cell.list_flag && !content.strip.empty?
                cell.content += "#{content.strip}\n"
            elsif content.strip == ""
                cell.list_flag = false
                cell.content = "\n"
            else
                cell.content = content.strip.gsub(/\\\s*$/, "\n")
            end
            else
            # The cell already has text: append to it.
            if content.strip.start_with?("- ")
                # A list starting after plain text is separated from it by a newline.
                unless cell.list_flag
                cell.content += "\n"
                end
                cell.list_flag = true
                cell.content += "#{content.strip}\n"
            elsif cell.list_flag && !content.strip.empty?
                # Continuation of the current list item: join it onto the same line.
                cell.content = cell.content.strip.chomp("\n")
                cell.content += " #{content.strip}\n"
            elsif content.strip.empty?
                cell.list_flag = false
                cell.content += cell.content.end_with?("\n") ? "" : "\n"
            else
                content = content.strip.gsub(/\\\s*$/, "\n")
                cell.content += " #{content}"
            end
            end
            cell
        end

        # Helper method to adjust colspan.
        # Compares where the cell at `column_index` really ends in `line` with the
        # expected column delimiters and widens its colspan accordingly. Returns the cell.
        # Raises "Wrong cell formatting" when the cell ends before an expected delimiter.
        def adjust_colspan(row, column_index, number_of_parts, line, number_of_columns, delimiter_positions)
            (column_index...number_of_parts).each do |j|
            # Start of this cell: the position of the nearest cell to the left that has
            # one recorded, or 0 when the first column is reached.
            delimiter_start = nil
            col_i = column_index
            until delimiter_start
                delimiter_start = col_i > 0 ? row[col_i - 1].position : 0
                col_i -= 1
            end

            # Next "|" or "+" after the start, whichever comes first.
            positions = ["|", "+"].map do |delimiter|
                pos = line[delimiter_start + 1..-1]&.index(delimiter)
                pos ? pos + delimiter_start + 1 : nil
            end.compact
            
            position = positions.min

            if position && position > delimiter_positions[j]
                # The cell extends past column j's delimiter, so it spans one more column.
                row[column_index].colspan += 1
                if position == delimiter_positions[-1]
                # It reaches the last delimiter: make it cover every remaining column.
                colspan_allocated = row[column_index].colspan
                row[column_index].colspan += number_of_columns - colspan_allocated - column_index
                end
            elsif position && position < delimiter_positions[j]
                raise "Wrong cell formatting"
            else
                break
            end
            end
            row[column_index]
        end

        separator_indices = lines.each_index.select { |i| is_separator(lines[i]) }
        
        raise "No valid separators found in the provided Pandoc table." if separator_indices.empty?

        # Calculate max number of columns and delimiter positions.
        # The separator line with the most "+" characters defines the column grid;
        # delimiter_positions[j] is the index of the "+" that closes column j.
        delimiter_positions = []
        number_of_columns = separator_indices.map { |i| lines[i].count("+") - 1 }.max

        separator_index_max_columns = separator_indices.find { |i| lines[i].count("+") - 1 == number_of_columns }
        number_of_columns.times do |j|
            start_pos = j.zero? ? 0 : delimiter_positions[j - 1]
            pos = lines[separator_index_max_columns][start_pos + 1..-1]&.index("+")
            delimiter_positions << (pos ? pos + start_pos + 1 : -1)
        end

        # Process header: only the first separator matching the header pattern is used.
        # Its segments give the default alignment per column (":" on the left only =>
        # left, on the right only => right, otherwise center) and the header delimiter positions.
        has_header = false
        header_delimiter_positions = []
        default_alignments = []
        header_rows = []
        header_separator_index = nil

        separator_indices.each do |index|
            if GRID_TABLE_HEADER_SEPARATOR.match?(lines[index])
            has_header = true
            header_separator_index = index
            parts = lines[index].strip.delete_prefix("+").split("+")
            
            parts.each_with_index do |part, part_index|
                default_alignments << if part.start_with?(":") && !part.end_with?(":")
                                    'align="left"'
                                    elsif !part.start_with?(":") && part.end_with?(":")
                                    'align="right"'
                                    else
                                    'align="center"'
                                    end

                start_pos = part_index.zero? ? 0 : header_delimiter_positions[part_index - 1]
                pos = lines[index][start_pos + 1..-1]&.index("+")
                header_delimiter_positions << (pos ? pos + start_pos + 1 : -1)
            end
            break
            end
        end

        # Process table body: each pair of consecutive separator lines delimits one
        # block of lines (the opening separator included, the closing one excluded).
        data_rows = []
        (separator_indices.length - 1).times do |row|
            rows = []
            rows_tracker = nil
            in_data_row = false
            start, end_idx = separator_indices[row], separator_indices[row + 1]
            row_lines = lines[start...end_idx]

            next if row_lines.empty?

            row_lines.each do |line|
            if is_separator(line) && !in_data_row
                # Opening separator of the block: create the first Row and record, for
                # each "+"-delimited segment, where the corresponding cell ends.
                in_data_row = true
                parts = line.strip.delete_prefix("+").split("+")
                delimiter_index = 0
                rows << Row.new(number_of_columns)
                rows_tracker = RowTracker.new(number_of_columns)
                
                i = 0
                parts.each_with_index do |_, j|
                next unless i < number_of_columns
                
                delimiter_index += parts[j].length + 1
                rows[-1][i].position = delimiter_index
                rows[-1][i].set_alignment(default_alignments, header_delimiter_positions)
                
                # Skip the grid columns covered by a segment wider than one column.
                while delimiter_index > delimiter_positions[i]
                    i += 1
                end
                i += 1
                end

            elsif in_data_row
                if GRID_TABLE_BODY_SEPARATOR.match?(line)
                # The line contains a partial separator: some cells end here while
                # others may continue. A further Row is added for the cells that start below.
                cells_content = line.strip.delete_prefix("|").delete_prefix("+")
                                    .delete_suffix("|").delete_suffix("+").split(/[\|\+]/)
                
                rows << Row.new(number_of_columns)
                aux_delimiter_index = 0
                auxiliar_cell_index = 0
                
                cells_content.each_with_index do |_, i|
                    next unless auxiliar_cell_index < number_of_columns
                    
                    aux_delimiter_index += cells_content[i].length + 1
                    rows[-1][auxiliar_cell_index].position = aux_delimiter_index
                    rows[-1][auxiliar_cell_index].set_alignment(default_alignments, header_delimiter_positions)
                    
                    while aux_delimiter_index > delimiter_positions[auxiliar_cell_index]
                    auxiliar_cell_index += 1
                    end
                    auxiliar_cell_index += 1
                end

                if cells_content.length <= number_of_columns
                    column_index = 0
                    cells_content.each_with_index do |content, i|
                    if GRID_TABLE_BODY_SEPARATOR_LINE.match?(content)
                        # A dash run closes the cell above: later content for this
                        # column (and any columns the run covers) goes to the next Row.
                        rows_tracker[column_index] += 1
                        rows[rows_tracker[column_index]][column_index].list_flag = false
                        
                        column_forward = 0
                        (column_index...delimiter_positions.length).each do |del_index|
                        if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[del_index]
                            column_forward += 1
                            rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1
                        end
                        end
                        column_index += column_forward
                    else
                        # Text fragment: the cell tracked for this column continues
                        # across the partial separator, so its rowspan grows.
                        rows[rows_tracker[column_index]][column_index] = 
                        handling_content(rows[rows_tracker[column_index]][column_index], content)
                        rows[rows_tracker[column_index]][column_index].rowspan += 1
                        
                        unless rows[rows_tracker[column_index]][column_index].colspan_adjusted
                        rows[rows_tracker[column_index]][column_index].colspan_adjusted = true
                        rows[rows_tracker[column_index]][column_index] = 
                            adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns, 
                                        line, number_of_columns, delimiter_positions)
                        end

                        if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]
                        colspan = rows[rows_tracker[column_index]][column_index].colspan
                        column_index += colspan.zero? ? 1 : colspan
                        end
                    end
                    end
                else
                    raise "More cells than columns found"
                end
                else
                # Ordinary content line: split on "|" into one fragment per cell.
                cells_content = line.strip.delete_prefix("|").split(/\s*\|\s*/)
                column_index = 0
                
                if cells_content.length < number_of_columns
                    # Fewer fragments than columns: at least one cell spans several columns.
                    cells_content.each_with_index do |content, i|
                    rows[rows_tracker[column_index]][column_index] = 
                        handling_content(rows[rows_tracker[column_index]][column_index], content)
                    
                    unless rows[rows_tracker[column_index]][column_index].colspan_adjusted
                        rows[rows_tracker[column_index]][column_index].colspan_adjusted = true
                        rows[rows_tracker[column_index]][column_index] = 
                        adjust_colspan(rows[rows_tracker[column_index]], column_index, number_of_columns,
                                    line, number_of_columns, delimiter_positions)
                    end

                    if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[column_index]
                        column_index += rows[rows_tracker[column_index]][column_index].colspan
                    end
                    end
                elsif cells_content.length == number_of_columns
                    # One fragment per column: append each to the cell tracked for that column.
                    cells_content.each_with_index do |content, i|
                    rows[rows_tracker[i]][i] = handling_content(rows[rows_tracker[i]][i], content)
                    end
                else
                    raise "More cells than columns found"
                end
                end
            else
                raise "No separator line found for row starting"
            end
            end

            # Blocks starting before the header separator are header rows, the rest body rows.
            # NOTE(review): nothing is collected when no header separator was found, so a
            # table without one ends in the "No valid rows" error below (Cell#set_alignment
            # also raises earlier when default_alignments is empty) - confirm this is intended.
            if has_header && start >= header_separator_index
            rows.each { |body_row| data_rows << body_row.cells }
            elsif has_header && start < header_separator_index
            rows.each { |header_row| header_rows << header_row.cells }
            end
        end

        raise "No valid rows found in the provided Pandoc table." if data_rows.empty? && header_rows.empty?

        # Format text (bold and italic): pairs of "**" / "__" become <strong>...</strong>,
        # pairs of "_" become <i>...</i>, and an escaped "\_" becomes a literal "_".
        [header_rows, data_rows].each do |rows|
            rows.each do |row|
            row.each do |cell|
                next if cell.content.nil?

                ["**", "__"].each do |bold_chars|
                while cell.content.include?(bold_chars)
                    cell.content = cell.content.sub(bold_chars, "<strong>")
                                            .sub(bold_chars, "</strong>")
                end
                end

                # Italics are only processed while the cell contains no escaped underscore.
                while cell.content.include?("_") && !cell.content.include?("\\_")
                cell.content = cell.content.rstrip.sub("_", "<i>").sub("_", "</i>")
                end
                
                while cell.content.include?("\\_")
                cell.content = cell.content.rstrip.sub("\\_", "_")
                end
            end
            end
        end

        # Convert newlines to HTML breaks
        [header_rows, data_rows].each do |rows|
            rows.each do |row|
            row.each do |cell|
                cell.content = cell.content&.gsub("\n", "<br />")
            end
            end
        end

        # Validate grid correctness: in every row the colspans, plus one for each empty
        # cell still covered by a rowspan from a row above, must add up to the column count.
        [header_rows, data_rows].each do |rows|
            forward_rowspan = []
            
            rows.each_with_index do |row, row_index|
            forward_rowspan = Array.new(row.length, 0) if forward_rowspan.empty?
            sum = 0
            
            row.each_with_index do |cell, cell_index|
                sum += cell.colspan
                if row_index > 0 && cell.colspan.zero?
                if forward_rowspan[cell_index].positive?
                    sum += 1
                end
                forward_rowspan[cell_index] -= 1
                end
                
                # Remember how many following rows this cell still covers.
                if forward_rowspan[cell_index].zero? && cell.rowspan > 1
                forward_rowspan[cell_index] = cell.rowspan - 1
                end
            end
            
            raise "Grid table not converted properly" unless sum == number_of_columns
            end
        end

        [header_rows, data_rows]
        end

        # Renders a Pandoc-style grid table as an HTML <table> string.
        #
        # The text is parsed with parse_pandoc_table_with_spans. When parsing raises,
        # the error is logged to STDOUT and a fixed notice string is returned instead.
        # Cells whose rowspan or colspan is 0 are skipped (they are covered by another
        # cell's span). A <thead> is emitted only when the header has a renderable cell.
        #
        # Side effect: cells containing list items have their `content` rewritten to a
        # <ul> and their `alignment` set to left.
        #
        # @param pandoc_table [String] raw grid table text
        # @return [String] the HTML table, or the error notice
        def generate_html_table_with_spans(pandoc_table)
          begin
            grid_header, grid_body = parse_pandoc_table_with_spans(pandoc_table)
          rescue StandardError => e
            logger = Logger.new(STDOUT)
            logger.error("Grid table could not be generated: #{e.message}")
            return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS"
          end

          has_header = grid_header.any? do |row|
            row.any? { |cell| cell.rowspan != 0 && cell.colspan != 0 }
          end

          # Unary plus yields an unfrozen string so `<<` works under frozen_string_literal.
          html = +"<table>\n"
          if has_header
            html << "    <thead>\n"
            html << render_grid_table_rows(grid_header, "th")
            html << "    </thead>\n"
          end
          html << "    <tbody>\n"
          html << render_grid_table_rows(grid_body, "td")
          html << "    </tbody>\n"
          html << "</table>"
        end

        # Renders each row as a <tr> whose cells use the given tag ("th" or "td").
        private def render_grid_table_rows(rows, tag)
          rows.map do |row|
            visible_cells = row.reject { |cell| cell.rowspan == 0 || cell.colspan == 0 }
            rendered = visible_cells.map do |cell|
              convert_grid_table_cell_lists(cell)
              rowspan = cell.rowspan > 1 ? %( rowspan="#{cell.rowspan}") : ""
              colspan = cell.colspan > 1 ? %( colspan="#{cell.colspan}") : ""
              %(            <#{tag}#{rowspan}#{colspan} #{cell.alignment}>#{cell.content}</#{tag}>\n)
            end
            "        <tr>\n#{rendered.join}        </tr>\n"
          end.join
        end

        # Turns list items ("- x", "* x", "+ x", "1. x") in the cell content into a <ul>
        # and left-aligns the cell. Cells without list items are left untouched.
        private def convert_grid_table_cell_lists(cell)
          items = cell.content&.scan(/\s*([-*+]|\d+\.)\s+([^<]+?)(?=<br \/>|$)/)
          # scan returns [] (which is truthy) when nothing matches, so test for emptiness.
          return if items.nil? || items.empty?

          list = "<ul>#{items.map { |_marker, text| "<li>#{text}</li>" }.join}</ul>"
          # Block form so backslashes in the list text are not read as backreferences.
          cell.content = cell.content.gsub(/(\s*([-*+]|\d+\.)\s+[^<]+?<br \/>)+/) { list }
          # Enforce left alignment if cell contains a list
          cell.alignment = 'align="left"'
        end
      
      # Replaces every grid table block found in @text with its HTML rendering.
      #
      # @text is returned unchanged when MarkdownFilter.glfm_markdown?(context) is true
      # (per the file header, the current markdown parser handles grid tables itself)
      # or when the text contains no grid table block.
      def call
        return @text if MarkdownFilter.glfm_markdown?(context)

        table_block = Gitlab::UntrustedRegexp.new(MARKDOWN_GRID_TABLE_BLOCK_REGEX, multiline: true)
        return @text unless table_block.match?(@text)

        table_block.replace_gsub(@text) do |match|
          captured_table = match[:code]
          # Convert the captured grid table; keep the matched text when nothing was captured.
          captured_table ? generate_html_table_with_spans(captured_table) : match.to_s
        end
      end
    end # end of class GridTableFilter
  end # end of module Filter
end # end of module Banzai
 No newline at end of file