Commit 2afc6b8a authored by Miguel Angel Reina Ortega's avatar Miguel Angel Reina Ortega
Browse files

Regex corrections + fix for merged rows

parent 3491470e
Loading
Loading
Loading
Loading
+426 −414
Original line number Diff line number Diff line
@@ -25,9 +25,10 @@ module Banzai
            # Anything, starting with | blocks which are ignored by this filter
            # +---+---+---+---+
  
            ^\s*\+-.*\+\s$          # First separator line
            (?:.*\n)*?              # Any number of rows (non-greedy)
            \s*\+-.*\+\s$           # Last separator line
            ^\s*\+(-+\+)+$\n         # First separator line
            (?:^\s*[|+][^\n]*$\n)*
            ^\s*\+(-+\+)+$           # Last separator line
            
          )
        }mx
  
@@ -63,7 +64,7 @@ module Banzai
                raise "Invalid table formatting"
                end
            end
        end
        end # end of class Cell

        class Row
            attr_accessor :cells
@@ -79,7 +80,7 @@ module Banzai
            # Index-assignment operator: stores +value+ at position +index+ of
            # this row's @cells collection, so callers can write
            # `row[index] = value`.
            def []=(index, value)
                @cells[index] = value
            end
        end
        end # end of class Row

        class RowTracker
            attr_accessor :row_tracker
@@ -95,13 +96,17 @@ module Banzai
            # Index-assignment operator: stores +value+ at position +index+ of
            # the underlying @row_tracker collection, so callers can write
            # `tracker[index] = value`.
            def []=(index, value)
                @row_tracker[index] = value
            end

            # Returns the largest entry currently held in @row_tracker, as
            # computed by its #max method.
            # NOTE(review): presumably @row_tracker is an Array of Integer row
            # indices (one per column), in which case an empty tracker yields
            # nil — confirm against the initializer.
            def maxValue
                @row_tracker.max
            end
        end # end of class RowTracker

        # Regex constants used to recognise grid table separator lines.
        # TODO: move these constants to the top of the file, after the require statement.
        GRID_TABLE_SEPARATOR = /\s*\+([-:=]+\+)+\s*$/
        GRID_TABLE_HEADER_SEPARATOR = /.*\+([=:]+\+)+.*$/
        GRID_TABLE_BODY_SEPARATOR = /.*\+([:-]+\+)+.*$/
        GRID_TABLE_BODY_SEPARATOR_LINE = /[-:]+$/
        # Full separator line: optional indentation, then "+" followed by one or
        # more runs of "-", ":" or "=" each closed by "+" (e.g. +---+===+), with
        # only optional whitespace after it.
        # NOTE: in Ruby, ^ and $ anchor at line boundaries, not at the start/end
        # of the whole string.
        GRID_TABLE_SEPARATOR = /^\s*\+([-:=]+\+)+\s*$/
        # Header separator line: same shape as above, but the runs between the
        # "+" signs may only contain "=" or ":".
        GRID_TABLE_HEADER_SEPARATOR = /^\s*\+([=:]+\+)+\s*$/
        # Body separator, possibly partial: a "+"-delimited run of ":" / "-"
        # found anywhere on the line, so other text may precede or follow it on
        # the same line.
        GRID_TABLE_BODY_SEPARATOR = /[^\n]*\+([:-]+\+)+[^\n]*$/
        # A line made up solely of "-" and ":" characters; matched against the
        # individual cell fragments of a partial separator line.
        GRID_TABLE_BODY_SEPARATOR_LINE = /^[-:]+$/

        def parse_pandoc_table_with_spans(pandoc_table)
            # Split the input into lines
@@ -181,6 +186,7 @@ module Banzai
                row[column_index]
            end

            # Retrieve separator indices
            separator_indices = lines.each_index.select { |i| is_separator(lines[i]) }
            
            raise "No valid separators found in the provided Pandoc table." if separator_indices.empty?
@@ -189,6 +195,7 @@ module Banzai
            delimiter_positions = []
            number_of_columns = separator_indices.map { |i| lines[i].count("+") - 1 }.max

            # Determine delimiter positions
            separator_index_max_columns = separator_indices.find { |i| lines[i].count("+") - 1 == number_of_columns }
            number_of_columns.times do |j|
                start_pos = j.zero? ? 0 : delimiter_positions[j - 1]
@@ -203,6 +210,7 @@ module Banzai
            header_rows = []
            header_separator_index = nil

            # Determine header delimiter positions
            separator_indices.each do |index|
                if GRID_TABLE_HEADER_SEPARATOR.match?(lines[index])
                    has_header = true
@@ -226,18 +234,19 @@ module Banzai
                end
            end

        # Process table body
            # Process table body (including rows belonging to header as they are processed in the same way)
            data_rows = []
            (separator_indices.length - 1).times do |row|
                rows = []
                rows_tracker = nil
                in_data_row = false
            start, end_idx = separator_indices[row], separator_indices[row + 1]
                start, end_idx = separator_indices[row], separator_indices[row + 1] # Lines between separators including separator line start as it gives information about the number of columns of the row
                row_lines = lines[start...end_idx]

                next if row_lines.empty?

                row_lines.each do |line|
                    # First line (normally a separator) of each block
                    if is_separator(line) && !in_data_row
                        in_data_row = true
                        parts = line.strip.delete_prefix("+").split("+")
@@ -258,9 +267,10 @@ module Banzai
                        end
                        i += 1
                        end

                    # Lines in a block
                    elsif in_data_row
                if GRID_TABLE_BODY_SEPARATOR.match?(line)
                        # Regular data row or partial separator
                        if GRID_TABLE_BODY_SEPARATOR.match?(line) # Partial separator
                            cells_content = line.strip.delete_prefix("|").delete_prefix("+")
                                            .delete_suffix("|").delete_suffix("+").split(/[\|\+]/)
                        
@@ -283,20 +293,21 @@ module Banzai

                            if cells_content.length <= number_of_columns
                                column_index = 0
                                maxRowTracker = rows_tracker.maxValue
                                cells_content.each_with_index do |content, i|
                    if GRID_TABLE_BODY_SEPARATOR_LINE.match?(content)
                        rows_tracker[column_index] += 1
                                    if GRID_TABLE_BODY_SEPARATOR_LINE.match?(content) # Separator - split row
                                        rows_tracker[column_index] = maxRowTracker + 1
                                        rows[rows_tracker[column_index]][column_index].list_flag = false
                                        
                                        column_forward = 0
                                        (column_index...delimiter_positions.length).each do |del_index|
                                            if rows[rows_tracker[column_index]][column_index].position >= delimiter_positions[del_index]
                                                column_forward += 1
                            rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1
                                                #rows_tracker[column_index + column_forward - 1] += 1 if column_forward > 1
                                            end
                                        end
                                        column_index += column_forward
                    else
                                    else # Regular cell in Partial separator line
                                        rows[rows_tracker[column_index]][column_index] = 
                                        handling_content(rows[rows_tracker[column_index]][column_index], content)
                                        rows[rows_tracker[column_index]][column_index].rowspan += 1
@@ -317,8 +328,8 @@ module Banzai
                            else
                                raise "More cells than columns found"
                            end
                else
                cells_content = line.strip.delete_prefix("|").split(/\s*\|\s*/)
                        else # Data row
                            cells_content = line.strip.delete_prefix("|").delete_suffix("|").split(/\|/)
                            column_index = 0
                            
                            if cells_content.length < number_of_columns
@@ -419,7 +430,7 @@ module Banzai
            end

            [header_rows, data_rows]
        end
        end # end of parse_pandoc_table_with_spans

        def generate_html_table_with_spans(pandoc_table)
            begin
@@ -427,7 +438,7 @@ module Banzai
            rescue StandardError => e
                logger = Logger.new(STDOUT)
                logger.error("Grid table could not be generated: #{e.message}")
            return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS"
                return "HTML TABLE COULD NOT BE GENERATED FROM MARKDOWN GRID TABLE. CHECK LOGS FILE"
            else
                html = "<table>\n"
                has_header = false
@@ -497,15 +508,16 @@ module Banzai
                html += "</table>"
                html
            end
      end
        end # end of def generate_html_table_with_spans
      
        def call
        return @text if MarkdownFilter.glfm_markdown?(context)
            return @text unless MarkdownFilter.glfm_markdown?(context)

        regex = Gitlab::UntrustedRegexp.new(MARKDOWN_GRID_TABLE_BLOCK_REGEX, multiline: true)
            regex = MARKDOWN_GRID_TABLE_BLOCK_REGEX
            return @text unless regex.match?(@text)

        regex.replace_gsub(@text) do |match|
            @text.gsub(regex) do
            match = Regexp.last_match
            # Extract the grid table content from the match
            grid_table = match[:code]
            if grid_table
@@ -516,7 +528,7 @@ module Banzai
                match.to_s
            end
            end
      end
        end # end of def call
    end # end of class GridTableFilter
  end # end of module Filter
end # end of module Banzai