lib/graphql/language/lexer.rb
# frozen_string_literal: true
module GraphQL
  module Language

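    # Splits a GraphQL document string into tokens.
    #
    # The parser drives this class one token at a time by calling `advance`;
    # `Lexer.tokenize` (defined at the bottom of this file) wraps that loop
    # for debugging and tooling. A rough sketch of the manual loop:
    #
    #   lexer = GraphQL::Language::Lexer.new("{ viewer { name } }")
    #   while (token_name = lexer.advance)
    #     token_name                          # => :LCURLY, :IDENTIFIER, ...
    #     lexer.debug_token_value(token_name) # => "{", "viewer", ...
    #   end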
    class Lexer
      def initialize(graphql_str, filename: nil, max_tokens: nil)
        if !(graphql_str.encoding == Encoding::UTF_8 || graphql_str.ascii_only?)
          graphql_str = graphql_str.dup.force_encoding(Encoding::UTF_8)
        end
        @string = graphql_str
        @filename = filename
        @scanner = StringScanner.new(graphql_str)
        @pos = nil
        @max_tokens = max_tokens || Float::INFINITY
        @tokens_count = 0
      end

      def eos?
        @scanner.eos?
      end

      attr_reader :pos

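      # Skip any ignored characters, then consume one token.
      # Returns the token's name as a Symbol (eg `:IDENTIFIER`, `:LCURLY`),
      # or `false` at end of input. Dispatch is based on the token's first
      # byte, using the `FIRST_BYTES` lookup table defined below.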
      def advance
        @scanner.skip(IGNORE_REGEXP)
        return false if @scanner.eos?
        @tokens_count += 1
        if @tokens_count > @max_tokens
          raise_parse_error("This query is too large to execute.")
        end
        @pos = @scanner.pos
        next_byte = @string.getbyte(@pos)
        next_byte_is_for = FIRST_BYTES[next_byte]
        case next_byte_is_for
        when ByteFor::PUNCTUATION
          @scanner.pos += 1
          PUNCTUATION_NAME_FOR_BYTE[next_byte]
        when ByteFor::NAME
          if len = @scanner.skip(KEYWORD_REGEXP)
            case len
            when 2
              :ON
            when 12
              :SUBSCRIPTION
            else
              pos = @pos

              # Use bytes 2 and 3 as a unique identifier for this keyword
              bytes = (@string.getbyte(pos + 2) << 8) | @string.getbyte(pos + 1)
              KEYWORD_BY_TWO_BYTES[_hash(bytes)]
            end
          else
            @scanner.skip(IDENTIFIER_REGEXP)
            :IDENTIFIER
          end
        when ByteFor::IDENTIFIER
          @scanner.skip(IDENTIFIER_REGEXP)
          :IDENTIFIER
        when ByteFor::NUMBER
          @scanner.skip(NUMERIC_REGEXP)

          if GraphQL.reject_numbers_followed_by_names
            new_pos = @scanner.pos
            peek_byte = @string.getbyte(new_pos)
            next_first_byte = FIRST_BYTES[peek_byte]
            if next_first_byte == ByteFor::NAME || next_first_byte == ByteFor::IDENTIFIER
              number_part = token_value
              name_part = @scanner.scan(IDENTIFIER_REGEXP)
              raise_parse_error("Name after number is not allowed (in `#{number_part}#{name_part}`)")
            end
          end
          # Check whether capture group 1 (the decimal and/or exponent part) matched:
          @scanner[1] ? :FLOAT : :INT
        when ByteFor::ELLIPSIS
          if @string.getbyte(@pos + 1) != 46 || @string.getbyte(@pos + 2) != 46 # 46 == ".".ord
            raise_parse_error("Expected `...`, actual: #{@string[@pos..@pos + 2].inspect}")
          end
          @scanner.pos += 3
          :ELLIPSIS
        when ByteFor::STRING
          if @scanner.skip(BLOCK_STRING_REGEXP) || @scanner.skip(QUOTED_STRING_REGEXP)
            :STRING
          else
            raise_parse_error("Expected string or block string, but it was malformed")
          end
        else
          @scanner.pos += 1
          :UNKNOWN_CHAR
        end
      rescue ArgumentError => err
        if err.message == "invalid byte sequence in UTF-8"
          raise_parse_error("Parse error on bad Unicode escape sequence", nil, nil)
        end
      end

      def token_value
        @string.byteslice(@scanner.pos - @scanner.matched_size, @scanner.matched_size)
      rescue StandardError => err
        raise GraphQL::Error, "(token_value failed: #{err.class}: #{err.message})"
      end

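      # A printable value for `token_name`. Unlike `token_value`, this is safe
      # for punctuation and ellipsis tokens, which don't leave their own match
      # data on the scanner.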
      def debug_token_value(token_name)
        if token_name && Lexer::Punctuation.const_defined?(token_name)
          Lexer::Punctuation.const_get(token_name)
        elsif token_name == :ELLIPSIS
          "..."
        elsif token_name == :STRING
          string_value
        elsif @scanner.matched_size.nil?
          @scanner.peek(1)
        else
          token_value
        end
      end

      ESCAPES = /\\["\\\/bfnrt]/
      ESCAPES_REPLACE = {
        '\\"' => '"',
        "\\\\" => "\\",
        "\\/" => '/',
        "\\b" => "\b",
        "\\f" => "\f",
        "\\n" => "\n",
        "\\r" => "\r",
        "\\t" => "\t",
      }
      UTF_8 = /\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i
      VALID_STRING = /\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
      ESCAPED = /(?:#{ESCAPES}|#{UTF_8})/o

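      # Strip the quotes from a just-lexed STRING token and return its runtime
      # value: block strings are dedented by `Language::BlockString`, quoted
      # strings have their escape sequences validated and replaced in place.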
      def string_value
        str = token_value
        is_block = str.start_with?('"""')
        if is_block
          str.gsub!(/\A"""|"""\z/, '')
          return Language::BlockString.trim_whitespace(str)
        else
          str.gsub!(/\A"|"\z/, '')

          if !str.valid_encoding? || !str.match?(VALID_STRING)
            raise_parse_error("Bad unicode escape in #{str.inspect}")
          else
            Lexer.replace_escaped_characters_in_place(str)

            if !str.valid_encoding?
              raise_parse_error("Bad unicode escape in #{str.inspect}")
            else
              str
            end
          end
        end
      end

      def line_number
        @scanner.string[0..@pos].count("\n") + 1
      end

      def column_number
        @scanner.string[0..@pos].split("\n").last.length
      end

      def raise_parse_error(message, line = line_number, col = column_number)
        raise GraphQL::ParseError.new(message, line, col, @string, filename: @filename)
      end

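      # "Ignored" characters between tokens: commas, whitespace, and `#` comments.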
      IGNORE_REGEXP = %r{
        (?:
          [, \c\r\n\t]+ |
          \#.*$
        )*
      }x
      IDENTIFIER_REGEXP = /[_A-Za-z][_0-9A-Za-z]*/
      INT_REGEXP =        /-?(?:[0]|[1-9][0-9]*)/
      FLOAT_DECIMAL_REGEXP = /[.][0-9]+/
      FLOAT_EXP_REGEXP =     /[eE][+-]?[0-9]+/
      # TODO: FLOAT_EXP_REGEXP should not be allowed to follow INT_REGEXP; integers are not allowed to have exponent parts.
      NUMERIC_REGEXP =  /#{INT_REGEXP}(#{FLOAT_DECIMAL_REGEXP}#{FLOAT_EXP_REGEXP}|#{FLOAT_DECIMAL_REGEXP}|#{FLOAT_EXP_REGEXP})?/
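      # For example, `1` lexes as an INT, while `1.5`, `2e10`, and `1.5e10`
      # capture group 1 above and lex as FLOATs (see the INT/FLOAT check in `advance`).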

      KEYWORDS = [
        "on",
        "fragment",
        "true",
        "false",
        "null",
        "query",
        "mutation",
        "subscription",
        "schema",
        "scalar",
        "type",
        "extend",
        "implements",
        "interface",
        "union",
        "enum",
        "input",
        "directive",
        "repeatable"
      ].freeze

      KEYWORD_REGEXP = /#{Regexp.union(KEYWORDS.sort)}\b/
      KEYWORD_BY_TWO_BYTES = [
        :INTERFACE,
        :MUTATION,
        :EXTEND,
        :FALSE,
        :ENUM,
        :TRUE,
        :NULL,
        nil,
        nil,
        nil,
        nil,
        nil,
        nil,
        nil,
        :QUERY,
        nil,
        nil,
        :REPEATABLE,
        :IMPLEMENTS,
        :INPUT,
        :TYPE,
        :SCHEMA,
        nil,
        nil,
        nil,
        :DIRECTIVE,
        :UNION,
        nil,
        nil,
        :SCALAR,
        nil,
        :FRAGMENT
      ]

      # This produces a unique integer for bytes 2 and 3 of each keyword string
      # See https://tenderlovemaking.com/2023/09/02/fast-tokenizers-with-stringscanner.html
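      # For example, for `"query"` the second and third bytes are
      # "u" (117) and "e" (101), so `bytes == (101 << 8) | 117 == 25973`,
      # `_hash(25973) == 14`, and `KEYWORD_BY_TWO_BYTES[14] == :QUERY`.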
      def _hash key
        (key * 18592990) >> 27 & 0x1f
      end

      module Punctuation
        LCURLY =        '{'
        RCURLY =        '}'
        LPAREN =        '('
        RPAREN =        ')'
        LBRACKET =      '['
        RBRACKET =      ']'
        COLON =         ':'
        VAR_SIGN =      '$'
        DIR_SIGN =      '@'
        EQUALS =        '='
        BANG =          '!'
        PIPE =          '|'
        AMP =           '&'
      end

      # A sparse array mapping the bytes for each punctuation
      # to a symbol name for that punctuation
      PUNCTUATION_NAME_FOR_BYTE = Punctuation.constants.each_with_object([]) { |name, arr|
        punct = Punctuation.const_get(name)
        arr[punct.ord] = name
      }

      QUOTE =         '"'
      UNICODE_DIGIT = /[0-9A-Za-z]/
      FOUR_DIGIT_UNICODE = /#{UNICODE_DIGIT}{4}/
      N_DIGIT_UNICODE = %r{#{Punctuation::LCURLY}#{UNICODE_DIGIT}{4,}#{Punctuation::RCURLY}}x
      UNICODE_ESCAPE = %r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}
      STRING_ESCAPE = %r{[\\][\\/bfnrt]}
      BLOCK_QUOTE =   '"""'
      ESCAPED_QUOTE = /\\"/
      STRING_CHAR = /#{ESCAPED_QUOTE}|[^"\\\n\r]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/
      QUOTED_STRING_REGEXP = %r{#{QUOTE} (?:#{STRING_CHAR})* #{QUOTE}}x
      BLOCK_STRING_REGEXP = %r{
        #{BLOCK_QUOTE}
        (?: [^"\\]               |  # Any characters that aren't a quote or slash
           (?<!") ["]{1,2} (?!") |  # Any quotes that don't have quotes next to them
           \\"{0,3}(?!")         |  # A slash followed by <= 3 quotes that aren't followed by a quote
           \\                    |  # A slash
           "{1,2}(?!")              # 1 or 2 " followed by something that isn't a quote
        )*
        (?:"")?
        #{BLOCK_QUOTE}
      }xm

      # Use this array to check, for a given byte that starts a token,
      # what kind of token it might start.
      FIRST_BYTES = Array.new(255)

      module ByteFor
        NUMBER = 0 # int or float
        NAME = 1 # identifier or keyword
        STRING = 2
        ELLIPSIS = 3
        IDENTIFIER = 4 # identifier, *not* a keyword
        PUNCTUATION = 5
      end

      (0..9).each { |i| FIRST_BYTES[i.to_s.ord] = ByteFor::NUMBER }
      FIRST_BYTES["-".ord] = ByteFor::NUMBER
      # Some of these are overwritten below for characters that also begin keywords
      ("A".."Z").each { |char| FIRST_BYTES[char.ord] = ByteFor::IDENTIFIER }
      ("a".."z").each { |char| FIRST_BYTES[char.ord] = ByteFor::IDENTIFIER }
      FIRST_BYTES['_'.ord] = ByteFor::IDENTIFIER
      FIRST_BYTES['.'.ord] = ByteFor::ELLIPSIS
      FIRST_BYTES['"'.ord] = ByteFor::STRING
      KEYWORDS.each { |kw| FIRST_BYTES[kw.getbyte(0)] = ByteFor::NAME }
      Punctuation.constants.each do |punct_name|
        punct = Punctuation.const_get(punct_name)
        FIRST_BYTES[punct.ord] = ByteFor::PUNCTUATION
      end
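      # So, for example:
      #
      #   FIRST_BYTES["{".ord] # => ByteFor::PUNCTUATION
      #   FIRST_BYTES["q".ord] # => ByteFor::NAME ("query" starts with "q")
      #   FIRST_BYTES["x".ord] # => ByteFor::IDENTIFIER (no keyword starts with "x")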


      # Replace any escaped Unicode sequences or escaped characters (`\n`, `\"`, etc.)
      # with the _actual_ characters they represent.
      # To avoid allocating more strings, this modifies the string passed into it.
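      # For example, a raw `"\\u0041"` becomes `"A"`, `"\\n"` becomes a newline,
      # and a surrogate pair like `"\\ud83d\\ude00"` is combined into the single
      # code point U+1F600.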
      def self.replace_escaped_characters_in_place(raw_string)
        raw_string.gsub!(ESCAPED) do |matched_str|
          if (point_str_1 = $1 || $2)
            codepoint_1 = point_str_1.to_i(16)
            if (codepoint_2 = $3)
              codepoint_2 = codepoint_2.to_i(16)
              if (codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF) && # leading surrogate
                  (codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF) # trailing surrogate
                # A surrogate pair
                combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
                [combined].pack('U'.freeze)
              else
                # Two separate code points
                [codepoint_1].pack('U'.freeze) + [codepoint_2].pack('U'.freeze)
              end
            else
              [codepoint_1].pack('U'.freeze)
            end
          else
            ESCAPES_REPLACE[matched_str]
          end
        end
        nil
      end

      # This isn't used during parsing because the parser calls `advance`
      # directly and doesn't need materialized tokens; it's here for debugging
      # and tooling. Each entry is `[token_name, line, column, debug_value, previous_token]`.
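      # For example:
      #
      #   GraphQL::Language::Lexer.tokenize("{ a }")
      #   # => [
      #   #   [:LCURLY,     1, 1, "{", nil],
      #   #   [:IDENTIFIER, 1, 3, "a", <the :LCURLY token>],
      #   #   [:RCURLY,     1, 5, "}", <the :IDENTIFIER token>],
      #   # ]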
      def self.tokenize(string)
        lexer = GraphQL::Language::Lexer.new(string)
        tokens = []
        prev_token = nil
        while (token_name = lexer.advance)
          new_token = [
            token_name,
            lexer.line_number,
            lexer.column_number,
            lexer.debug_token_value(token_name),
            prev_token,
          ]
          tokens << new_token
          prev_token = new_token
        end
        tokens
      end
    end
  end
end