class ElasticAPM::Sql::Tokenizer
@api private
def initialize(input)
# Builds a tokenizer over +input+.
#
# @param input [String] the text to tokenize; it is also handed to
#   StringScanner.new
def initialize(input)
  @input = input
  @scanner = StringScanner.new(input)

  # Byte offsets delimiting the current token inside @input. Both start at
  # zero so that reading the current token text before the first scan yields
  # an empty string instead of failing on a nil end offset.
  @byte_start = 0
  @byte_end = 0
end
def next_char
# Consumes one character from the scanner and records the scanner's new
# position as the end offset of the current token.
#
# @return [String, nil] whatever StringScanner#getch returned
def next_char
  @scanner.getch.tap { @byte_end = @scanner.pos }
end
def next_token(char)
# Maps the first character of a token to a token constant, delegating to the
# matching scan_* helper whenever more input has to be consumed to finish the
# token.
#
# @param char [String] the character that starts the token
# @return the token constant, or whatever the selected scan_* helper returns
def next_token(char)
  case char
  when '.' then PERIOD
  when '(' then LPAREN
  when ')' then RPAREN
  when '$' then scan_dollar_sign
  when '/' then scan_bracketed_or_cql_comment
  when '-' then scan_simple_comment
  when "'" then scan_string_literal
  # Backtick and double quote close with the same character they open with.
  when '`', '"' then scan_quoted_indentifier(char)
  when '[' then scan_quoted_indentifier(']')
  # A leading underscore can never start a keyword.
  when '_' then scan_keyword_or_identifier(possible_keyword: false)
  when ALPHA then scan_keyword_or_identifier(possible_keyword: true)
  when DIGIT then scan_numeric_literal
  else OTHER
  end
end
def peek_char(length = 1)
StringScanner#peek returns the next bytes, which could be an incomplete UTF-8 character.
# Returns the next complete character without consuming it.
#
# StringScanner#peek works on bytes, so a short peek can stop in the middle
# of a multi-byte character. The window is widened one byte at a time until
# the peeked string has a valid encoding.
#
# The widest window tried is 4 bytes, the maximum length of a UTF-8 sequence:
# > In UTF-8, characters from the U+0000..U+10FFFF range (the UTF-16
# > accessible range) are encoded using sequences of 1 to 4 octets.
#
# https://tools.ietf.org/html/rfc3629
#
# @param length [Integer] number of bytes to try first
# @return [String, nil] the peeked string, or nil when nothing is left to
#   read or no window of up to 4 bytes has a valid encoding
def peek_char(length = 1)
  length.upto(4) do |width|
    candidate = @scanner.peek(width)
    return nil if candidate.empty?
    return candidate if candidate.valid_encoding?
  end

  nil
end
def scan
# Advances to the next token: skips leading whitespace, marks the token's
# start offset, and stores the result of next_token in @token.
#
# @return [Boolean] false when no character could be read, true otherwise
def scan
  scanner.skip(SPACE)
  @byte_start = scanner.pos

  first = next_char
  if first
    @token = next_token(first)
    true
  else
    false
  end
end
def scan_bracketed_comment
# Consumes a /* ... */ comment, honouring nested /* ... */ pairs. The depth
# starts at 1 and the comment ends when it drops back to 0.
#
# @return COMMENT once the comment is closed, or nil when the input ends
#   before the nesting depth reaches zero
def scan_bracketed_comment
  depth = 1

  while (current = next_char)
    if current == '/' && peek_char == '*'
      # Nested opener: swallow the '*' and go one level deeper.
      next_char
      depth += 1
    elsif current == '*' && peek_char == '/'
      # Closer: swallow the '/' and come one level back up.
      next_char
      depth -= 1
      return COMMENT if depth.zero?
    end
  end

  nil
end
def scan_bracketed_or_cql_comment
# Decides what a leading '/' starts by looking at the following character:
# '*' opens a bracketed comment, a second '/' opens a CQL-style line comment.
#
# @return the result of the selected comment scanner, or OTHER when the
#   following character is neither '*' nor '/'
def scan_bracketed_or_cql_comment
  upcoming = peek_char
  return scan_bracketed_comment if upcoming == '*'
  return scan_cql_comment if upcoming == '/'

  OTHER
end
def scan_cql_comment
# Consumes a // line comment up to and including the terminating newline, or
# up to the end of the input when there is no newline.
#
# @return OTHER when the next character is not '/', COMMENT otherwise
def scan_cql_comment
  return OTHER if peek_char != '/'

  loop do
    consumed = next_char
    break if !consumed || consumed == "\n"
  end

  COMMENT
end
def scan_dollar_sign
# Scans a token that starts with '$' (the '$' itself is already consumed).
#
# A run of digits right after the '$' is consumed as part of the token. When
# the '$' is followed by '$', '_', a letter or whitespace, the input is read
# as a possible dollar-quoted string.
#
# @return STRING when a complete dollar-quoted string was consumed, OTHER in
#   every other case
def scan_dollar_sign
  while (peek = peek_char)
    case peek
    when DIGIT
      next_char while peek_char =~ DIGIT
    when '$', '_', ALPHA, SPACE
      # PostgreSQL supports dollar-quoted string literal syntax,
      # like $foo$...$foo$. The tag (foo in this case) is optional,
      # and if present follows identifier rules.
      while (char = next_char)
        case char
        when '$'
          # This marks the end of the initial $foo$.
          opening_tag = text

          # The scanner position and @byte_end are byte offsets, so the
          # unread input is taken from the scanner itself and the match
          # index (counted in characters) is converted to a byte count
          # before it is applied. Mixing the two units gives wrong token
          # bounds as soon as the input has a multi-byte character.
          remainder = scanner.rest
          index = remainder.index(opening_tag)
          next unless index

          delta = remainder[0, index].bytesize + opening_tag.bytesize
          @byte_end += delta
          scanner.pos += delta
          return STRING
        when SPACE
          # Unknown token starting with $, consume chars until space.
          @byte_end -= char.bytesize
          return OTHER
        end
      end
    else break
    end
  end

  OTHER
end
def scan_keyword_or_identifier(possible_keyword:)
# Consumes the rest of a word made of letters, digits, '_' and '$', then
# classifies it as a keyword or a plain identifier.
#
# @param possible_keyword [Boolean] whether the word may still turn out to
#   be a keyword; seeing '_', '$' or a digit rules that out
# @return the matching entry of KEYWORDS, or IDENT when the word is not a
#   keyword
def scan_keyword_or_identifier(possible_keyword:)
  while (upcoming = peek_char)
    if upcoming == '_' || upcoming == '$' || upcoming =~ DIGIT
      possible_keyword = false
    elsif !ALPHA.match?(upcoming)
      break
    end

    next_char
  end

  return IDENT unless possible_keyword

  word = text
  return IDENT unless word.length.between?(KEYWORD_MIN_LENGTH, KEYWORD_MAX_LENGTH)

  # Keywords are looked up by length and compared case-insensitively.
  upcased = word.upcase
  KEYWORDS[word.length].find { |kw| upcased == kw.to_s } || IDENT
end
def scan_numeric_literal
# Consumes the remainder of a numeric literal: digits, at most one '.', and
# at most one exponent marker ('e' or 'E') optionally followed by a sign.
# Scanning stops in front of a second '.' or a second exponent marker.
#
# @return NUMBER in every case
def scan_numeric_literal
  period = false
  exponent = false

  while (peek = peek_char)
    case peek
    when DIGIT then next_char
    when '.'
      return NUMBER if period

      next_char
      period = true
    when 'e', 'E'
      return NUMBER if exponent

      next_char
      # Remember the exponent so the guard above can stop at a second one;
      # without this the flag stays false and "1e5e6" is read as one number.
      exponent = true
      next_char if /[+-]/.match?(peek_char)
    else break
    end
  end

  NUMBER
end
def scan_quoted_indentifier(delimiter)
# Consumes a quoted identifier up to its closing delimiter (the opening
# quote is already consumed) and narrows the token bounds so that the quotes
# are not part of the token text.
#
# @param delimiter [String] the character that closes the identifier
# @return IDENT when the closing delimiter was found; OTHER when the input
#   ends first, in which case the token bounds are left untouched
def scan_quoted_indentifier(delimiter)
  while (char = next_char)
    next unless char == delimiter

    # Inside a double-quoted identifier a doubled "" does not end it.
    if delimiter == '"' && peek_char == delimiter
      next next_char
    end

    break
  end

  # The loop ran out of input without seeing the closing delimiter: char is
  # nil here, so there is no closing quote to strip.
  return OTHER unless char

  # Remove quotes from identifier
  @byte_start += char.bytesize
  @byte_end -= char.bytesize

  IDENT
end
def scan_simple_comment
# Consumes a -- line comment up to and including the terminating newline, or
# up to the end of the input when there is no newline.
#
# @return OTHER when the next character is not '-', COMMENT otherwise
def scan_simple_comment
  return OTHER if peek_char != '-'

  loop do
    consumed = next_char
    break if !consumed || consumed == "\n"
  end

  COMMENT
end
def scan_string_literal
# Consumes a single-quoted string literal (the opening quote is already
# consumed). A backslash skips the character that follows it, and a doubled
# '' inside the literal does not end it.
#
# @return STRING once the closing quote is found, or nil when the input ends
#   before the literal is closed
def scan_string_literal
  quote = "'"

  while (current = next_char)
    if current == '\\'
      # Skip escaped character, e.g. 'what\'s up?'
      next_char
    elsif current == quote
      # A quote not followed by another quote (or followed by nothing) ends
      # the literal; otherwise swallow the second quote and keep going.
      return STRING if peek_char != quote

      next_char
    end
  end

  nil
end
def text
# Returns the text of the current token: the bytes of @input between
# @byte_start (inclusive) and @byte_end (exclusive).
#
# @return [String, nil] whatever String#byteslice returns for that span
def text
  span = @byte_end - @byte_start
  @input.byteslice(@byte_start, span)
end