# frozen_string_literal: true
# lib/graphql/language/lexer.rb

require "strscan"

module GraphQL
  module Language
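    # Splits a GraphQL document string into an array of token arrays
    # (see `.emit` below for the exact layout). A rough sketch of the output,
    # based on the `emit` calls in this file:
    #
    #   GraphQL::Language::Lexer.tokenize("{ node }").first
    #   # => [:LCURLY, 1, 1, "{", nil]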
    module Lexer
      IDENTIFIER =    /[_A-Za-z][_0-9A-Za-z]*/
      NEWLINE =       /\r\n|[\n\r]/ # Treat a CRLF pair as a single line break
      BLANK   =       /[, \t]+/
      COMMENT =       /#[^\n\r]*/
      INT =           /[-]?(?:[0]|[1-9][0-9]*)/
      FLOAT_DECIMAL = /[.][0-9]+/
      FLOAT_EXP =     /[eE][+-]?[0-9]+/
      FLOAT =         /#{INT}(#{FLOAT_DECIMAL}#{FLOAT_EXP}|#{FLOAT_DECIMAL}|#{FLOAT_EXP})/

      module Literals
        ON =            /on\b/
        FRAGMENT =      /fragment\b/
        TRUE =          /true\b/
        FALSE =         /false\b/
        NULL =          /null\b/
        QUERY =         /query\b/
        MUTATION =      /mutation\b/
        SUBSCRIPTION =  /subscription\b/
        SCHEMA =        /schema\b/
        SCALAR =        /scalar\b/
        TYPE =          /type\b/
        EXTEND =        /extend\b/
        IMPLEMENTS =    /implements\b/
        INTERFACE =     /interface\b/
        UNION =         /union\b/
        ENUM =          /enum\b/
        INPUT =         /input\b/
        DIRECTIVE =     /directive\b/
        REPEATABLE =    /repeatable\b/
        LCURLY =        '{'
        RCURLY =        '}'
        LPAREN =        '('
        RPAREN =        ')'
        LBRACKET =      '['
        RBRACKET =      ']'
        COLON =         ':'
        VAR_SIGN =      '$'
        DIR_SIGN =      '@'
        ELLIPSIS =      '...'
        EQUALS =        '='
        BANG =          '!'
        PIPE =          '|'
        AMP =           '&'
      end

      include Literals

      QUOTE =         '"'
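      # NOTE: broader than strict hex digits ([0-9A-Fa-f]); as a result, escapes
      # containing other letters still lex as string characters here and are
      # reported as :BAD_UNICODE_ESCAPE by `.emit_string` below.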
      UNICODE_DIGIT = /[0-9A-Za-z]/
      FOUR_DIGIT_UNICODE = /#{UNICODE_DIGIT}{4}/
      N_DIGIT_UNICODE = %r{#{LCURLY}#{UNICODE_DIGIT}{4,}#{RCURLY}}x
      UNICODE_ESCAPE = %r{\\u(?:#{FOUR_DIGIT_UNICODE}|#{N_DIGIT_UNICODE})}
      # https://graphql.github.io/graphql-spec/June2018/#sec-String-Value
      STRING_ESCAPE = %r{[\\][\\/bfnrt]}
      BLOCK_QUOTE =   '"""'
      ESCAPED_QUOTE = /\\"/
      STRING_CHAR = /#{ESCAPED_QUOTE}|[^"\\]|#{UNICODE_ESCAPE}|#{STRING_ESCAPE}/

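      # Maps each literal's source text back to its token name, e.g.
      # "query" => :QUERY and "{" => :LCURLY. For the keyword regexps, the
      # trailing "\b" and escaping backslashes are stripped from the source
      # so that the key is the plain keyword.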
      LIT_NAME_LUT = Literals.constants.each_with_object({}) { |n, o|
        key = Literals.const_get(n)
        key = key.is_a?(Regexp) ? key.source.gsub(/(\\b|\\)/, '') : key
        o[key] = n
      }

      LIT = Regexp.union(Literals.constants.map { |n| Literals.const_get(n) })

      QUOTED_STRING = %r{#{QUOTE} (?:#{STRING_CHAR})* #{QUOTE}}x
      BLOCK_STRING = %r{
        #{BLOCK_QUOTE}
        (?: [^"\\]               |  # Any character that isn't a quote or backslash
           (?<!") ["]{1,2} (?!") |  # One or two quotes with no quote immediately before or after
           \\"{0,3}(?!")         |  # A backslash followed by up to 3 quotes that aren't followed by another quote
           \\                    |  # A lone backslash
           "{1,2}(?!")              # One or two quotes not followed by another quote
        )*
        (?:"")?
        #{BLOCK_QUOTE}
      }xm

      # Catch-all for anything else; it is checked last in the scan loop below
      # so that every other pattern takes precedence.
      UNKNOWN_CHAR =         /./

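      # Scans `string` with a StringScanner, emitting a token for each match.
      # Comments and blanks (spaces, tabs, commas) don't produce entries in
      # `meta[:tokens]`; newlines only update the line/column counters.
      # Literal tokens reuse deduplicated frozen strings via `-str`.
      # If the input isn't valid in its encoding, a single :BAD_UNICODE_ESCAPE
      # token is returned instead.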
      def self.tokenize(string)
        meta = {
          line: 1,
          col: 1,
          tokens: [],
          previous_token: nil,
        }

        unless string.valid_encoding?
          emit(:BAD_UNICODE_ESCAPE, 0, 0, meta, string)
          return meta[:tokens]
        end

        scan = StringScanner.new string

        until scan.eos?
          pos = scan.pos

          case
          when str = scan.scan(FLOAT)         then emit(:FLOAT, pos, scan.pos, meta, str)
          when str = scan.scan(INT)           then emit(:INT, pos, scan.pos, meta, str)
          when str = scan.scan(LIT)           then emit(LIT_NAME_LUT[str], pos, scan.pos, meta, -str)
          when str = scan.scan(IDENTIFIER)    then emit(:IDENTIFIER, pos, scan.pos, meta, str)
          when str = scan.scan(BLOCK_STRING)  then emit_block(pos, scan.pos, meta, str.gsub(/^#{BLOCK_QUOTE}|#{BLOCK_QUOTE}$/, ''))
          when str = scan.scan(QUOTED_STRING) then emit_string(pos, scan.pos, meta, str.gsub(/^"|"$/, ''))
          when str = scan.scan(COMMENT)       then record_comment(pos, scan.pos, meta, str)
          when scan.scan(NEWLINE)
            meta[:line] += 1
            meta[:col] = 1
          when scan.scan(BLANK)
            meta[:col] += scan.pos - pos
          when str = scan.scan(UNKNOWN_CHAR) then emit(:UNKNOWN_CHAR, pos, scan.pos, meta, str)
          else
            # This should never happen since `UNKNOWN_CHAR` ensures we make progress
            raise "Unknown string?"
          end
        end

        meta[:tokens]
      end

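      # Appends a 5-element token to `meta[:tokens]`:
      #
      #   [token_name, line, col, token_value, previous_token]
      #
      # and advances the column by the number of characters consumed (te - ts).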
      def self.emit(token_name, ts, te, meta, token_value)
        meta[:tokens] << token = [
          token_name,
          meta[:line],
          meta[:col],
          token_value,
          meta[:previous_token],
        ]
        meta[:previous_token] = token
        # Bump the column counter for the next token
        meta[:col] += te - ts
      end

      # Replace any escape sequences (\", \\, \/, \b, \f, \n, \r, \t and \u escapes)
      # with the actual characters they represent.
      # To avoid allocating more strings, this modifies the string passed into it.
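      #
      # For example (illustrative; the caller normally passes the de-quoted
      # string value from `.emit_string`):
      #
      #   s = 'caf\u00e9'.dup
      #   replace_escaped_characters_in_place(s)
      #   s # => "café"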
      def self.replace_escaped_characters_in_place(raw_string)
        raw_string.gsub!(ESCAPES, ESCAPES_REPLACE)
        raw_string.gsub!(UTF_8) do |_matched_str|
          codepoint_1 = ($1 || $2).to_i(16)
          codepoint_2 = $3

          if codepoint_2
            codepoint_2 = codepoint_2.to_i(16)
            if (codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF) && # leading surrogate
                (codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF) # trailing surrogate
              # A surrogate pair
              combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
              [combined].pack('U'.freeze)
            else
              # Two separate code points
              [codepoint_1].pack('U'.freeze) + [codepoint_2].pack('U'.freeze)
            end
          else
            [codepoint_1].pack('U'.freeze)
          end
        end
        nil
      end

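      # Comments are not appended to `meta[:tokens]`; they are only kept as the
      # `previous_token` of whatever is emitted next, so later consumers can
      # still find a comment that directly precedes a token.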
      def self.record_comment(ts, te, meta, str)
        token = [
          :COMMENT,
          meta[:line],
          meta[:col],
          str,
          meta[:previous_token],
        ]

        meta[:previous_token] = token

        meta[:col] += te - ts
      end

      ESCAPES = /\\["\\\/bfnrt]/
      ESCAPES_REPLACE = {
        '\\"' => '"',
        "\\\\" => "\\",
        "\\/" => '/',
        "\\b" => "\b",
        "\\f" => "\f",
        "\\n" => "\n",
        "\\r" => "\r",
        "\\t" => "\t",
      }
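      # Matches \uXXXX and \u{XXXX...} escapes. An immediately following \uXXXX
      # is captured as well so that surrogate pairs can be combined into a single
      # code point in `.replace_escaped_characters_in_place`.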
      UTF_8 = /\\u(?:([0-9a-f]{4})|\{([0-9a-f]{4,})\})(?:\\u([0-9a-f]{4}))?/i
      VALID_STRING = /\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o

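      # Emits a block string. Newlines are counted up front and added to
      # `meta[:line]` only after the token is emitted, so the token keeps the
      # line number where the block string started.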
      def self.emit_block(ts, te, meta, value)
        line_incr = value.count("\n")
        value = GraphQL::Language::BlockString.trim_whitespace(value)
        emit_string(ts, te, meta, value)
        meta[:line] += line_incr
      end

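      # Validates the raw value before and after unescaping: a value that fails
      # the VALID_STRING pattern or ends up with an invalid encoding is emitted
      # as :BAD_UNICODE_ESCAPE instead of :STRING.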
      def self.emit_string(ts, te, meta, value)
        if !value.valid_encoding? || !value.match?(VALID_STRING)
          emit(:BAD_UNICODE_ESCAPE, ts, te, meta, value)
        else
          replace_escaped_characters_in_place(value)

          if !value.valid_encoding?
            emit(:BAD_UNICODE_ESCAPE, ts, te, meta, value)
          else
            emit(:STRING, ts, te, meta, value)
          end
        end
      end
    end
  end
end