lib/cucumber/cucumber_expressions/cucumber_expression_tokenizer.rb



require 'cucumber/cucumber_expressions/ast'
require 'cucumber/cucumber_expressions/errors'

module Cucumber
  module CucumberExpressions
    # Splits a Cucumber Expression string into a flat array of Token objects.
    #
    # The expression is scanned one codepoint at a time. Codepoints are
    # collected in a buffer and flushed into a Token whenever the token type
    # changes. Scanner state lives in instance variables that are reset at the
    # start of every #tokenize call.
    class CucumberExpressionTokenizer
      # Tokenizes +expression+.
      #
      # The result ends with an END_OF_LINE token positioned at the number of
      # codepoints in the expression. An empty expression yields exactly a
      # START_OF_LINE token followed by an END_OF_LINE token, both at 0..0.
      # Token positions count codepoints of the original expression, so escape
      # characters take up a position even though they are left out of the
      # token text.
      #
      # @param expression [String] the expression to scan
      # @return [Array<Token>] the tokens in source order
      # @raise [CantEscape] when an escape character precedes a codepoint for
      #   which Token.can_escape is false
      # @raise [TheEndOfLineCannotBeEscaped] when the expression ends right
      #   after an escape character
      def tokenize(expression)
        @expression = expression
        @buffer = []
        @escaped = 0
        @buffer_start_index = 0

        all_codepoints = expression.codepoints
        found = []
        found << Token.new(TokenType::START_OF_LINE, '', 0, 0) if all_codepoints.empty?

        last_type = TokenType::START_OF_LINE
        escape_pending = false

        all_codepoints.each do |current|
          # An escape character is counted but never buffered; it only forces
          # the next codepoint to be classified as text.
          if !escape_pending && Token.is_escape_character(current)
            @escaped += 1
            escape_pending = true
            next
          end

          current_type = token_type_of(current, escape_pending)
          escape_pending = false

          # Flush what has been collected so far before starting a new token.
          found << convert_buffer_to_token(last_type) if should_create_new_token?(last_type, current_type)
          last_type = current_type
          @buffer << current
        end

        found << convert_buffer_to_token(last_type) unless @buffer.empty?

        raise TheEndOfLineCannotBeEscaped.new(expression) if escape_pending

        line_end = all_codepoints.length
        found << Token.new(TokenType::END_OF_LINE, '', line_end, line_end)
        found
      end

      private

      # TODO: Make these lambdas

      # Builds a token of +token_type+ from the buffered codepoints, then
      # empties the buffer and moves the buffer start to the end of that token.
      #
      # Pending escape characters are charged to TEXT tokens only: they widen
      # the token's end position and the escape counter is reset. For any other
      # token type the counter is left untouched.
      def convert_buffer_to_token(token_type)
        text_token = token_type == TokenType::TEXT
        escapes_spanned = text_token ? @escaped : 0
        @escaped = 0 if text_token

        first_index = @buffer_start_index
        last_index = first_index + @buffer.length + escapes_spanned
        text = @buffer.map { |buffered| buffered.chr(Encoding::UTF_8) }.join('')

        Token.new(token_type, text, first_index, last_index).tap do
          @buffer = []
          @buffer_start_index = last_index
        end
      end

      # Classifies +codepoint+. Without a preceding escape the decision is
      # delegated to Token.type_of; after an escape the codepoint is TEXT if
      # Token.can_escape accepts it, otherwise CantEscape is raised with the
      # current position (buffer start + buffered codepoints + escapes seen).
      def token_type_of(codepoint, treat_as_text)
        return Token.type_of(codepoint) unless treat_as_text
        return TokenType::TEXT if Token.can_escape(codepoint)

        raise CantEscape.new(@expression, @buffer_start_index + @buffer.length + @escaped)
      end

      # True when the buffer must be flushed before the current codepoint is
      # added: either the token type changed, or the type is one that never
      # merges. Only WHITE_SPACE and TEXT runs are merged into one token.
      def should_create_new_token?(previous_token_type, current_token_type)
        return true unless current_token_type == previous_token_type

        !(current_token_type == TokenType::WHITE_SPACE || current_token_type == TokenType::TEXT)
      end
    end
  end
end