lib/rouge/lexer.rb



# -*- coding: utf-8 -*- #
# frozen_string_literal: true

# stdlib
require 'strscan'
require 'cgi'
require 'set'

module Rouge
  # @abstract
  # A lexer transforms text into a stream of `[token, chunk]` pairs.
  class Lexer
    include Token::Tokens

    @option_docs = {}

    class << self
      # Lexes `stream` with the given options.  The lex is delegated to a
      # new instance.
      #
      # @see #lex
      def lex(stream, opts={}, &b)
        # A fresh instance holds the options for this single lex.
        lexer = new(opts)
        lexer.lex(stream, &b)
      end

      # In case #continue_lex is called statically, we simply
      # begin a new lex from the beginning, since there is no state.
      #
      # @see #continue_lex
      def continue_lex(*a, &b)
        # Forward every argument and the block unchanged to the class-level lex.
        lex(*a, &b)
      end

      # Given a name in string, return the correct lexer class.
      # @param [String] name
      # @return [Class<Rouge::Lexer>,nil]
      def find(name)
        # Registry keys are strings, so normalize symbols and the like first.
        key = name.to_s
        registry[key]
      end

      # Find a lexer, with fancy shiny features.
      #
      # * The string you pass can include CGI-style options
      #
      #     Lexer.find_fancy('erb?parent=tex')
      #
      # * You can pass the special name 'guess' so we guess for you,
      #   and you can pass a second argument of the code to guess by
      #
      #     Lexer.find_fancy('guess', "#!/bin/bash\necho Hello, world")
      #
      # This is used in the Redcarpet plugin as well as Rouge's own
      # markdown lexer for highlighting internal code blocks.
      #
      def find_fancy(str, code=nil, additional_options={})

        if str && !str.include?('?') && str != 'guess'
          lexer_class = find(str)
          return lexer_class && lexer_class.new(additional_options)
        end

        name, opts = str ? str.split('?', 2) : [nil, '']

        # parse the options hash from a cgi-style string
        opts = CGI.parse(opts || '').map do |k, vals|
          val = case vals.size
          when 0 then true
          when 1 then vals[0]
          else vals
          end

          [ k.to_s, val ]
        end

        opts = additional_options.merge(Hash[opts])

        lexer_class = case name
        when 'guess', nil
          self.guess(:source => code, :mimetype => opts['mimetype'])
        when String
          self.find(name)
        end

        lexer_class && lexer_class.new(opts)
      end

      # Specify or get this lexer's title. Meant to be human-readable.
      def title(t=nil)
        # The tag-derived default is computed whenever no title is passed,
        # even if a title has already been stored.
        t = tag.capitalize if t.nil?

        # The first title stored wins; later arguments do not replace it.
        @title ||= t
      end

      # Specify or get this lexer's description.
      def desc(arg=:absent)
        # The :absent sentinel separates "read" from "set to nil".
        return @desc if arg == :absent

        @desc = arg
      end

      # Returns this class's option documentation, creating it on first use
      # as an InheritableHash built from the superclass's option docs.
      def option_docs
        return @option_docs if @option_docs

        @option_docs = InheritableHash.new(superclass.option_docs)
      end

      # Records the description of an option under its string name.
      def option(name, desc)
        key = name.to_s
        option_docs[key] = desc
      end

      # Specify or get the path name containing a small demo for
      # this lexer (can be overridden by {demo}).
      def demo_file(arg=:absent)
        # Setter form: remember the explicitly specified path.
        return @demo_file = Pathname.new(arg) unless arg == :absent

        # Getter form: use the bundled demo only when no path was specified.
        # Assigning unconditionally here would discard a path set earlier.
        @demo_file ||= Pathname.new(File.join(__dir__, 'demos', tag))
      end

      # Specify or get a small demo string for this lexer
      def demo(arg=:absent)
        # Setter form: remember the explicitly specified demo text.
        return @demo = arg unless arg == :absent

        # Getter form: read the demo file only when no demo text is stored,
        # so a demo specified earlier is not overwritten by the file content.
        @demo ||= File.read(demo_file, mode: 'rt:bom|utf-8')
      end

      # @return a list of all lexers.
      def all
        return @all if @all

        # Several names can map to one lexer class, so de-duplicate.
        @all = registry.values.uniq
      end

      # Guess which lexer to use based on a hash of info.
      #
      # This accepts the same arguments as Lexer.guess, but will never throw
      # an error.  It will return a (possibly empty) list of potential lexers
      # to use.
      def guesses(info={})
        mimetype = info[:mimetype]
        filename = info[:filename]
        source = info[:source]
        custom_globs = info[:custom_globs]

        # Work on a copy so the caller's :guessers array is not mutated.
        guessers = (info[:guessers] || []).dup

        guessers << Guessers::Mimetype.new(mimetype) if mimetype

        if filename
          guessers << Guessers::GlobMapping.by_pairs(custom_globs, filename) if custom_globs
          guessers << Guessers::Filename.new(filename)
        end

        if source
          guessers << Guessers::Modeline.new(source)
          guessers << Guessers::Source.new(source)
          # Disambiguation needs both the filename and the source text.
          guessers << Guessers::Disambiguation.new(filename, source) if filename
        end

        Guesser.guess(guessers, Lexer.all)
      end

      # Guess which lexer to use based on a hash of info.
      #
      # @option info :mimetype
      #   A mimetype to guess by
      # @option info :filename
      #   A filename to guess by
      # @option info :source
      #   The source itself, which, if guessing by mimetype or filename
      #   fails, will be searched for shebangs, <!DOCTYPE ...> tags, and
      #   other hints.
      # @param [Proc] fallback called if multiple lexers are detected.
      #   If omitted, Guesser::Ambiguous is raised.
      #
      # @see Lexer.detect?
      # @see Lexer.guesses
      # @return [Class<Rouge::Lexer>]
      def guess(info={}, &fallback)
        candidates = guesses(info)

        if candidates.empty?
          # Nothing matched: fall back to plain text.
          Lexers::PlainText
        elsif candidates.size == 1
          candidates[0]
        elsif fallback
          # Several candidates remain: let the caller decide.
          fallback.call(candidates)
        else
          raise Guesser::Ambiguous.new(candidates)
        end
      end

      # Shortcut for {guess} using only a mimetype.
      def guess_by_mimetype(mt)
        guess(:mimetype => mt)
      end

      # Shortcut for {guess} using only a filename.
      def guess_by_filename(fname)
        guess(:filename => fname)
      end

      # Shortcut for {guess} using only the source text.
      def guess_by_source(source)
        guess(:source => source)
      end

      # Turns the debug switch on. {debug_enabled?} only checks that the
      # variable is defined, not its value.
      def enable_debug!
        @debug_enabled = true
      end

      # Turns the debug switch off by removing the variable entirely,
      # because {debug_enabled?} tests for definedness rather than value.
      def disable_debug!
        return unless defined?(@debug_enabled)

        remove_instance_variable(:@debug_enabled)
      end

      # Whether the debug switch is on, i.e. whether @debug_enabled is
      # defined at all (its value is not inspected).
      def debug_enabled?
        instance_variable_defined?(:@debug_enabled)
      end

      # Determine if a lexer has a method named +:detect?+ defined in its
      # singleton class.
      def detectable?
        # `||=` never caches a false result, so memoize on definedness to
        # avoid rescanning the method list for non-detectable lexers.
        return @detectable if defined?(@detectable)

        @detectable = methods(false).include?(:detect?)
      end

    protected
      # @private
      def register(name, lexer)
        # Drop the memoized lexer list so it is rebuilt after this change.
        @all = nil if instance_variable_defined?(:@all)

        key = name.to_s
        registry[key] = lexer
      end

    public
      # Used to specify or get the canonical name of this lexer class.
      #
      # @example
      #   class MyLexer < Lexer
      #     tag 'foo'
      #   end
      #
      #   MyLexer.tag # => 'foo'
      #
      #   Lexer.find('foo') # => MyLexer
      def tag(t=nil)
        if t.nil?
          # Getter form.
          @tag
        else
          # Setter form: store the tag and register this class under it.
          @tag = t.to_s
          Lexer.register(@tag, self)
        end
      end

      # Used to specify alternate names this lexer class may be found by.
      #
      # @example
      #   class Erb < Lexer
      #     tag 'erb'
      #     aliases 'eruby', 'rhtml'
      #   end
      #
      #   Lexer.find('eruby') # => Erb
      def aliases(*args)
        # Convert every name first, then register them all.
        names = args.map(&:to_s)
        names.each { |alias_name| Lexer.register(alias_name, self) }

        @aliases ||= []
        @aliases.concat(names)
      end

      # Specify a list of filename globs associated with this lexer.
      #
      # If a filename glob is associated with more than one lexer, this can
      # cause a Guesser::Ambiguous error to be raised in various guessing
      # methods. These errors can be avoided by disambiguation. Filename globs
      # are disambiguated in one of two ways. Either the lexer will define a
      # `self.detect?` method (intended for use with shebangs and doctypes) or a
      # manual rule will be specified in Guessers::Disambiguation.
      #
      # @example
      #   class Ruby < Lexer
      #     filenames '*.rb', '*.ruby', 'Gemfile', 'Rakefile'
      #   end
      def filenames(*fnames)
        # With no arguments this returns the accumulated list.
        @filenames ||= []
        @filenames.concat(fnames)
      end

      # Specify a list of mimetypes associated with this lexer.
      #
      # @example
      #   class Html < Lexer
      #     mimetypes 'text/html', 'application/xhtml+xml'
      #   end
      def mimetypes(*mts)
        # With no arguments this returns the accumulated list.
        @mimetypes ||= []
        @mimetypes.concat(mts)
      end

      # @private
      def assert_utf8!(str)
        # Encodings accepted without complaint, by name.
        accepted = ['US-ASCII', 'UTF-8', 'ASCII-8BIT']
        return if accepted.include?(str.encoding.name)

        raise EncodingError.new(
          "Bad encoding: #{str.encoding.names.join(',')}. " +
          "Please convert your string to UTF-8."
        )
      end

    private
      # Lazily-created hash mapping name strings to lexer classes
      # (written by register, read by find and all).
      def registry
        @registry ||= {}
      end
    end

    # -*- instance methods -*- #

    attr_reader :options
    # Create a new lexer with the given options.  Individual lexers may
    # specify extra options.  The only current globally accepted option
    # is `:debug`.
    #
    # @option opts :debug
    #   Prints debug information to stdout.  The particular info depends
    #   on the lexer in question.  In regex lexers, this will log the
    #   state stack at the beginning of each step, along with each regex
    #   tried and each stream consumed.  Try it, it's pretty useful.
    def initialize(opts={})
      # Store options under string keys so symbol and string callers agree.
      @options = {}
      opts.each do |key, value|
        @options[key.to_s] = value
      end

      # Debugging requires both the global switch and the per-lexer option.
      @debug = Lexer.debug_enabled? && bool_option('debug')
    end

    # Coerces an option value to a boolean.
    #
    # `nil`, `false`, `0` and the strings `'0'`, `'false'` and `'off'` are
    # false. An array is judged by its last element, and an empty array
    # counts as true. Every other value is true.
    #
    # @return [Boolean]
    def as_bool(val)
      case val
      when nil, false, 0, '0', 'false', 'off'
        # 'false' belongs with '0' and 'off': an option such as
        # `?debug=false` arrives as a string and must disable the option.
        false
      when Array
        val.empty? ? true : as_bool(val.last)
      else
        true
      end
    end

    # Coerces an option value to a string, or nil when the value is nil or
    # false. For an array, the last element is used.
    def as_string(val)
      if val.is_a?(Array)
        as_string(val.last)
      elsif val
        val.to_s
      end
    end

    # Coerces an option value to a flat list of strings: strings are split
    # on commas, arrays are flattened recursively, anything else yields [].
    def as_list(val)
      return val.split(',') if val.is_a?(String)
      return [] unless val.is_a?(Array)

      val.flat_map { |item| as_list(item) }
    end

    # Coerces an option value to a lexer instance. Accepts a lexer
    # instance, a Lexer subclass, a name for Lexer.find, or an array (whose
    # last element is used). Returns nil for anything else or an unknown
    # name. New instances are created with this lexer's options.
    def as_lexer(val)
      if val.is_a?(Array)
        as_lexer(val.last)
      elsif val.is_a?(Class) && val < Lexer
        val.new(@options)
      elsif val.is_a?(Lexer)
        val
      elsif val.is_a?(String)
        found = Lexer.find(val)
        found && found.new(@options)
      end
    end

    # Coerces an option value to a token: Token instances pass through,
    # anything else is looked up with Token[]. For an array, the last
    # element is used.
    def as_token(val)
      return as_token(val.last) if val.is_a?(Array)

      val.is_a?(Token) ? val : Token[val]
    end

    # Reads a boolean option without removing it from the options hash.
    # When the option is absent, the block's value is returned if a block
    # was given, otherwise false.
    def bool_option(name, &default)
      key = name.to_s
      return as_bool(@options[key]) if @options.key?(key)
      return false unless default

      default.call
    end

    # Removes the named option and returns it coerced with {as_string}.
    # When the option is absent, Hash#delete calls the block with the key.
    def string_option(name, &default)
      raw = @options.delete(name.to_s, &default)
      as_string(raw)
    end

    # Removes the named option and returns it coerced with {as_lexer}.
    # When the option is absent, Hash#delete calls the block with the key.
    def lexer_option(name, &default)
      raw = @options.delete(name.to_s, &default)
      as_lexer(raw)
    end

    # Removes the named option and returns it coerced with {as_list}.
    # When the option is absent, Hash#delete calls the block with the key.
    def list_option(name, &default)
      raw = @options.delete(name.to_s, &default)
      as_list(raw)
    end

    # Removes the named option and returns it coerced with {as_token}.
    # When the option is absent, Hash#delete calls the block with the key.
    def token_option(name, &default)
      raw = @options.delete(name.to_s, &default)
      as_token(raw)
    end

    # Builds a hash-valued option from three sources, later ones
    # overriding earlier ones: `defaults`, a Hash stored under `name`, and
    # individual options whose key looks like `name[key]`. Every option
    # consumed is removed from the options hash. Values from the options
    # are passed through the block when one is given.
    def hash_option(name, defaults, &val_cast)
      name = name.to_s
      convert = lambda { |raw| val_cast ? val_cast.call(raw) : raw }
      result = defaults.dup

      # A non-Hash value stored under `name` is removed and ignored.
      whole = @options.delete(name)
      if whole.is_a?(Hash)
        whole.each { |k, v| result[k.to_s] = convert.call(v) }
      end

      # Iterate over a snapshot of the keys, since matches are deleted.
      @options.keys.each do |key|
        next unless (key =~ /(\w+)\[(\w+)\]/) && Regexp.last_match(1) == name

        # Capture the sub-key before anything else can run a regexp.
        subkey = Regexp.last_match(2)
        result[subkey] = convert.call(@options.delete(key))
      end

      result
    end

    # @abstract
    #
    # Called after each lex is finished.  The default implementation
    # is a noop.
    def reset!
      # Intentionally empty; #lex calls this before starting a fresh lex.
    end

    # Given a string, yield [token, chunk] pairs.  If no block is given,
    # an enumerator is returned.
    #
    # @option opts :continue
    #   Continue the lex from the previous state (i.e. don't call #reset!)
    #
    # @note The use of :continue => true has been deprecated. A warning is
    #       issued if run with `$VERBOSE` set to true.
    #
    # @note The use of arbitrary `opts` has never been supported, but we
    #       previously ignored them with no error. We now warn unconditionally.
    def lex(string, opts=nil, &b)
      if opts
        # :continue is the only key ever recognized here.
        unknown_keys = opts.keys - [:continue]
        unless unknown_keys.empty?
          # improper use of options hash
          warn('Improper use of Lexer#lex - this method does not receive options.' +
               ' This will become an error in a future version.')
        end

        if opts[:continue]
          warn '`lex :continue => true` is deprecated, please use #continue_lex instead'
          return continue_lex(string, &b)
        end
      end

      if block_given?
        # A fresh lex: validate the input encoding and reset the state.
        Lexer.assert_utf8!(string)
        reset!

        continue_lex(string, &b)
      else
        enum_for(:lex, string)
      end
    end

    # Continue the lex from the current state without resetting
    def continue_lex(string, &b)
      return enum_for(:continue_lex, string, &b) unless block_given?

      # Merge runs of consecutive chunks that share a token, so the caller
      # sees one pair per run. Empty chunks are dropped.
      pending_token = nil
      pending_text = nil
      stream_tokens(string) do |tok, val|
        next if val.empty?

        if tok == pending_token
          # Same token as the pending run: extend it in place.
          pending_text << val
          nil
        else
          # Token changed: emit the finished run and start a new one.
          b.call(pending_token, pending_text) if pending_token
          pending_token = tok
          pending_text = val
        end
      end

      # Emit the final run, if any chunk was seen at all.
      b.call(pending_token, pending_text) if pending_token
    end

    # delegated to {Lexer.tag}
    def tag
      # The tag is stored on the class, so instances just ask their class.
      self.class.tag
    end

    # @abstract
    #
    # Yield `[token, chunk]` pairs, given a prepared input stream.  This
    # must be implemented.
    #
    # @param [StringScanner] stream
    #   the stream
    def stream_tokens(stream, &b)
      # The base class has no implementation, so it raises a RuntimeError.
      raise 'abstract'
    end

    # @abstract
    #
    # Return true if there is an in-text indication (such as a shebang
    # or DOCTYPE declaration) that this lexer should be used.
    #
    # @param [TextAnalyzer] text
    #   the text to be analyzed, with a couple of handy methods on it,
    #   like {TextAnalyzer#shebang?} and {TextAnalyzer#doctype?}
    def self.detect?(text)
      # The base lexer never claims any text.
      false
    end
  end

  # Namespace for the concrete lexers, plus a loader that loads each
  # lexer file at most once.
  module Lexers
    # Relative paths already handed to `load`, used as a set.
    @_loaded_lexers = {}

    class << self
      # Loads `lexers/<relpath>` relative to this file unless that path was
      # loaded before. Returns nil when skipped, otherwise `load`'s result.
      def load_lexer(relpath)
        unless @_loaded_lexers.key?(relpath)
          # Mark first, so a file that fails to load is not retried.
          @_loaded_lexers[relpath] = true
          load File.join(__dir__, 'lexers', relpath)
        end
      end
    end
  end
end