class ElasticAPM::Sql::Tokenizer

@api private

def initialize(input)

# Sets up a tokenizer over +input+.
#
# @param input [String] the text to tokenize; it is wrapped in a
#   StringScanner and also kept so #text can byte-slice it.
def initialize(input)
  @input = input
  @scanner = StringScanner.new(input)
  # Both offsets start at zero so the current token span is well defined
  # (and empty) before the first character has been consumed; without this
  # #text would do arithmetic on nil.
  @byte_start = 0
  @byte_end = 0
end

def next_char

# Consumes one character from the scanner and records the scanner position
# reached afterwards in @byte_end.
#
# @return [String, nil] whatever StringScanner#getch returned (nil once the
#   scanner has nothing left).
def next_char
  @scanner.getch.tap { @byte_end = @scanner.pos }
end

def next_token(char)

rubocop:disable Metrics/CyclomaticComplexity

# Chooses the token for a token's first character, either directly (for
# single-character tokens) or by delegating to the matching scan_* helper.
#
# @param char [String] the character that was just consumed.
# @return the token constant or the helper's return value.
def next_token(char)
  case char
  # Single-character punctuation.
  when '('
    LPAREN
  when ')'
    RPAREN
  when '.'
    PERIOD
  # Quoted identifiers: backtick and double quote close with themselves,
  # an opening square bracket closes with ']'.
  when '`', '"'
    scan_quoted_indentifier(char)
  when '['
    scan_quoted_indentifier(']')
  when "'"
    scan_string_literal
  when '$'
    scan_dollar_sign
  # Possible comment openers.
  when '-'
    scan_simple_comment
  when '/'
    scan_bracketed_or_cql_comment
  # A leading underscore rules out a keyword.
  when '_'
    scan_keyword_or_identifier(possible_keyword: false)
  when ALPHA
    scan_keyword_or_identifier(possible_keyword: true)
  when DIGIT
    scan_numeric_literal
  else
    OTHER
  end
end

def peek_char(length = 1)

StringScanner#peek returns the next byte, which could be an incomplete UTF-8
multi-byte character.

# Looks at the next character without consuming it.
#
# StringScanner#peek works on bytes, so a single byte may be only part of a
# multi-byte character. The peek window is widened one byte at a time until
# the peeked string has a valid encoding.
#
# @param length [Integer] number of bytes to start peeking with.
# @return [String, nil] the next character, or nil when nothing is left or no
#   valid character is found within 4 bytes.
def peek_char(length = 1)
  # The maximum byte count of utf chars is 4:
  # > In UTF-8, characters from the U+0000..U+10FFFF range (the UTF-16
  #   accessible range) are encoded using sequences of 1 to 4 octets.
  # # https://tools.ietf.org/html/rfc3629
  length.upto(4) do |width|
    candidate = @scanner.peek(width)
    return nil if candidate.empty?
    return candidate if candidate.valid_encoding?
  end
  nil
end

def scan

# Advances to the next token.
#
# Skips anything matching SPACE, records the scanner position as the token's
# starting byte offset, consumes one character and stores the result of
# next_token in @token.
#
# @return [Boolean] true when a token was read, false when no character was
#   left to consume (in which case @token is left untouched).
def scan
  scanner.skip(SPACE)
  @byte_start = scanner.pos

  first = next_char
  if first
    @token = next_token(first)
    true
  else
    false
  end
end

def scan_bracketed_comment

rubocop:disable Metrics/CyclomaticComplexity

# Consumes a bracketed comment, keeping track of nested "/*" ... "*/" pairs.
#
# Starts with a nesting depth of 1; each "/*" pair read increases the depth
# and each "*/" pair decreases it.
#
# @return COMMENT once the depth drops back to zero, or OTHER when the input
#   ends before the comment is closed.
def scan_bracketed_comment
  nesting = 1
  while (char = next_char)
    case char
    when '/'
      # Only "/*" opens a nested comment; a lone '/' is ordinary content.
      next unless peek_char == '*'
      next_char
      nesting += 1
    when '*'
      # Only "*/" closes a comment; a lone '*' is ordinary content.
      next unless peek_char == '/'
      next_char
      nesting -= 1
      return COMMENT if nesting == 0
    end
  end
  # Input ran out inside the comment: report an explicit token rather than
  # falling off the loop and returning nil.
  OTHER
end

def scan_bracketed_or_cql_comment

# Decides what a '/' starts by looking at the character that follows it
# (next_token calls this for '/').
#
# @return the result of scan_bracketed_comment when '*' follows, the result
#   of scan_cql_comment when '/' follows, and OTHER otherwise.
def scan_bracketed_or_cql_comment
  upcoming = peek_char
  return scan_bracketed_comment if upcoming == '*'
  return scan_cql_comment if upcoming == '/'

  OTHER
end

def scan_cql_comment

# Consumes a "//" line comment.
#
# @return OTHER (consuming nothing) when the next character is not '/',
#   otherwise COMMENT after consuming up to and including the next newline,
#   or to the end of input when there is no newline.
def scan_cql_comment
  return OTHER if peek_char != '/'

  loop do
    consumed = next_char
    break if consumed.nil? || consumed == "\n"
  end
  COMMENT
end

def scan_dollar_sign

rubocop:disable Metrics/CyclomaticComplexity

# Scans a token that begins with '$' (the '$' itself has already been
# consumed).
#
# Digits directly after the '$' are consumed as a run. Otherwise characters
# are consumed until a second '$' completes an opening tag such as "$foo$";
# the rest of the input is then searched for the same tag and, when found,
# everything up to and including it becomes one STRING token.
#
# @return STRING for a complete dollar-quoted literal, OTHER otherwise.
def scan_dollar_sign
  while (peek = peek_char)
    case peek
    when DIGIT
      next_char while peek_char =~ DIGIT
    when '$', '_', ALPHA, SPACE
      # PostgreSQL supports dollar-quoted string literal syntax,
      # like $foo$...$foo$. The tag (foo in this case) is optional,
      # and if present follows identifier rules.
      while (char = next_char)
        case char
        when '$'
          # This marks the end of the initial $foo$.
          snap = text
          # scanner.pos and @byte_end are BYTE offsets, so the unread input
          # is taken from the scanner (byte-accurate) rather than by slicing
          # the input with a character index.
          rest = scanner.rest
          index = rest.index(snap)
          next unless index
          # String#index counts characters; convert the distance to the
          # closing tag into bytes before adjusting the byte offsets.
          delta = rest[0, index].bytesize + snap.bytesize
          @byte_end += delta
          scanner.pos += delta
          return STRING
        when SPACE
          # Unknown token starting with $, consume chars until space.
          # The space itself is excluded from the token span.
          @byte_end -= char.bytesize
          return OTHER
        end
      end
    else break
    end
  end
  OTHER
end

def scan_keyword_or_identifier(possible_keyword:)

rubocop:disable Metrics/CyclomaticComplexity, Metrics/PerceivedComplexity

# Consumes the remainder of a word and classifies it.
#
# Characters are consumed while they are '_', '$', a DIGIT or an ALPHA
# character; seeing any of the first three means the word cannot be a
# keyword.
#
# @param possible_keyword [Boolean] whether the word may still turn out to
#   be a keyword given its first character.
# @return the matching entry of KEYWORDS (compared case-insensitively via
#   upcase), or IDENT when there is none.
def scan_keyword_or_identifier(possible_keyword:)
  while (upcoming = peek_char)
    if upcoming == '_' || upcoming == '$' || DIGIT.match?(upcoming)
      possible_keyword = false
    elsif !ALPHA.match?(upcoming)
      break
    end
    next_char
  end
  return IDENT unless possible_keyword

  word = text
  # KEYWORDS is looked up by word length, so lengths outside the known
  # bounds cannot match anything.
  return IDENT unless word.length.between?(KEYWORD_MIN_LENGTH, KEYWORD_MAX_LENGTH)

  upcased = word.upcase
  KEYWORDS[word.length].find { |kw| upcased == kw.to_s } || IDENT
end

def scan_numeric_literal

rubocop:disable Metrics/CyclomaticComplexity

# Consumes the remainder of a numeric literal (its first digit has already
# been consumed).
#
# Accepts digits, at most one '.', and at most one exponent marker ('e' or
# 'E') optionally followed by a sign. A second '.' or a second exponent
# marker ends the literal without being consumed.
#
# @return NUMBER
def scan_numeric_literal
  period = false
  exponent = false
  while (peek = peek_char)
    case peek
    when DIGIT then next_char
    when '.'
      return NUMBER if period
      next_char
      period = true
    when 'e', 'E'
      return NUMBER if exponent
      next_char
      # An exponent may carry an explicit sign, e.g. 1e-5.
      next_char if /[+-]/.match?(peek_char)
      # Remember the exponent so a second marker terminates the literal.
      exponent = true
    else break
    end
  end
  NUMBER
end

def scan_quoted_indentifier(delimiter)

# Consumes a quoted identifier up to its closing delimiter (the opening
# quote has already been consumed).
#
# For double-quoted identifiers a doubled delimiter ("") is treated as an
# escaped quote and does not end the identifier.
#
# @param delimiter [String] the character that closes the identifier.
# @return IDENT with the token span narrowed to exclude the surrounding
#   quotes, or OTHER (span untouched) when the input ends before the closing
#   delimiter.
def scan_quoted_indentifier(delimiter)
  while (char = next_char)
    next unless char == delimiter
    if delimiter == '"' && peek_char == delimiter
      next next_char
    end
    break
  end
  # The loop ends with char == nil when the input ran out before a closing
  # delimiter; there is then no quote to strip.
  return OTHER unless char

  # Remove quotes from identifier
  @byte_start += char.bytesize
  @byte_end -= char.bytesize
  IDENT
end

def scan_simple_comment

# Consumes a "--" line comment.
#
# @return OTHER (consuming nothing) when the next character is not '-',
#   otherwise COMMENT after consuming up to and including the next newline,
#   or to the end of input when there is no newline.
def scan_simple_comment
  return OTHER if peek_char != '-'

  loop do
    consumed = next_char
    break if consumed.nil? || consumed == "\n"
  end
  COMMENT
end

def scan_string_literal

# Consumes a single-quoted string literal (the opening quote has already
# been consumed).
#
# A backslash skips the character after it, and a doubled quote ('') is
# treated as an escaped quote inside the literal.
#
# @return STRING when the closing quote is found, or OTHER when the input
#   ends before the literal is closed.
def scan_string_literal
  delimiter = "'"
  while (char = next_char)
    if char == '\\'
      # Skip escaped character, e.g. 'what\'s up?'
      next_char
      next
    end
    next unless char == delimiter
    # A quote not followed by another quote (including a quote at the very
    # end of input) closes the literal.
    return STRING unless peek_char == delimiter
    # Doubled quote: consume the second one and keep scanning.
    next_char
  end
  # Input ran out inside the literal: report an explicit token rather than
  # falling off the loop and returning nil.
  OTHER
end

def text

# Returns the slice of the input covered by the current token span.
#
# @return [String, nil] the bytes of @input from @byte_start up to (not
#   including) @byte_end, as returned by String#byteslice.
def text
  span_length = @byte_end - @byte_start
  @input.byteslice(@byte_start, span_length)
end

Namespace

ElasticAPM::Sql

Included Modules

ElasticAPM::Sql::Tokenizer::Tokens

Instance Methods

Defined in

lib/elastic_apm/sql/tokenizer.rb