lib/coderay/scanner.rb



# encoding: utf-8
require 'strscan'

module CodeRay
  
  autoload :WordList, coderay_path('helpers', 'word_list')
  
  # = Scanners
  #
  # This module holds the Scanner class and its subclasses.
  # For example, the Ruby scanner is named CodeRay::Scanners::Ruby
  # can be found in coderay/scanners/ruby.
  #
  # Scanner also provides methods and constants for the register
  # mechanism and the [] method that returns the Scanner class
  # belonging to the given lang.
  #
  # See PluginHost.
  module Scanners
    extend PluginHost
    plugin_path File.dirname(__FILE__), 'scanners'
    
    
    # = Scanner
    #
    # The base class for all Scanners.
    #
    # It is a subclass of Ruby's great +StringScanner+, which
    # makes it easy to access the scanning methods inside.
    #
    # It is also +Enumerable+, so you can use it like an Array of
    # Tokens:
    #
    #   require 'coderay'
    #   
    #   c_scanner = CodeRay::Scanners[:c].new "if (*p == '{') nest++;"
    #   
    #   for text, kind in c_scanner
    #     puts text if kind == :operator
    #   end
    #   
    #   # prints: (*==)++;
    #
    # OK, this is a very simple example :)
    # You can also use +map+, +any?+, +find+ and even +sort_by+,
    # if you want.
    class Scanner < StringScanner
      
      extend Plugin
      plugin_host Scanners
      
      # Raised if a Scanner fails while scanning
      ScanError = Class.new StandardError
      
      # The default options for all scanner classes.
      #
      # Define @default_options for subclasses.
      DEFAULT_OPTIONS = { }
      
      KINDS_NOT_LOC = [:comment, :doctype, :docstring]
      
      attr_accessor :state
      
      class << self
        
        # Normalizes the given code into a string with UNIX newlines, in the
        # scanner's internal encoding, with invalid and undefined charachters
        # replaced by placeholders. Always returns a new object.
        def normalize code
          # original = code
          code = code.to_s unless code.is_a? ::String
          return code if code.empty?
          
          if code.respond_to? :encoding
            code = encode_with_encoding code, self.encoding
          else
            code = to_unix code
          end
          # code = code.dup if code.eql? original
          code
        end
        
        # The typical filename suffix for this scanner's language.
        def file_extension extension = lang
          @file_extension ||= extension.to_s
        end
        
        # The encoding used internally by this scanner.
        def encoding name = 'UTF-8'
          @encoding ||= defined?(Encoding.find) && Encoding.find(name)
        end
        
        # The lang of this Scanner class, which is equal to its Plugin ID.
        def lang
          @plugin_id
        end
        
      protected
        
        def encode_with_encoding code, target_encoding
          if code.encoding == target_encoding
            if code.valid_encoding?
              return to_unix(code)
            else
              source_encoding = guess_encoding code
            end
          else
            source_encoding = code.encoding
          end
          # print "encode_with_encoding from #{source_encoding} to #{target_encoding}"
          code.encode target_encoding, source_encoding, :universal_newline => true, :undef => :replace, :invalid => :replace
        end
        
        def to_unix code
          code.index(?\r) ? code.gsub(/\r\n?/, "\n") : code
        end
        
        def guess_encoding s
          #:nocov:
          IO.popen("file -b --mime -", "w+") do |file|
            file.write s[0, 1024]
            file.close_write
            begin
              Encoding.find file.gets[/charset=([-\w]+)/, 1]
            rescue ArgumentError
              Encoding::BINARY
            end
          end
          #:nocov:
        end
        
      end
      
      # Create a new Scanner.
      #
      # * +code+ is the input String and is handled by the superclass
      #   StringScanner.
      # * +options+ is a Hash with Symbols as keys.
      #   It is merged with the default options of the class (you can
      #   overwrite default options here.)
      #
      # Else, a Tokens object is used.
      def initialize code = '', options = {}
        if self.class == Scanner
          raise NotImplementedError, "I am only the basic Scanner class. I can't scan anything. :( Use my subclasses."
        end
        
        @options = self.class::DEFAULT_OPTIONS.merge options
        
        super self.class.normalize(code)
        
        @tokens = options[:tokens] || Tokens.new
        @tokens.scanner = self if @tokens.respond_to? :scanner=
        
        setup
      end
      
      # Sets back the scanner. Subclasses should redefine the reset_instance
      # method instead of this one.
      def reset
        super
        reset_instance
      end
      
      # Set a new string to be scanned.
      def string= code
        code = self.class.normalize(code)
        super code
        reset_instance
      end
      
      # the Plugin ID for this scanner
      def lang
        self.class.lang
      end
      
      # the default file extension for this scanner
      def file_extension
        self.class.file_extension
      end
      
      # Scan the code and returns all tokens in a Tokens object.
      def tokenize source = nil, options = {}
        options = @options.merge(options)
        
        set_tokens_from_options options
        set_string_from_source source
        
        begin
          scan_tokens @tokens, options
        rescue => e
          message = "Error in %s#scan_tokens, initial state was: %p" % [self.class, defined?(state) && state]
          raise_inspect e.message, @tokens, message, 30, e.backtrace
        end
        
        @cached_tokens = @tokens
        if source.is_a? Array
          @tokens.split_into_parts(*source.map { |part| part.size })
        else
          @tokens
        end
      end
      
      # Cache the result of tokenize.
      def tokens
        @cached_tokens ||= tokenize
      end
      
      # Traverse the tokens.
      def each &block
        tokens.each(&block)
      end
      include Enumerable
      
      # The current line position of the scanner, starting with 1.
      # See also: #column.
      #
      # Beware, this is implemented inefficiently. It should be used
      # for debugging only.
      def line pos = self.pos
        return 1 if pos <= 0
        binary_string[0...pos].count("\n") + 1
      end
      
      # The current column position of the scanner, starting with 1.
      # See also: #line.
      def column pos = self.pos
        return 1 if pos <= 0
        pos - (binary_string.rindex(?\n, pos - 1) || -1)
      end
      
      # The string in binary encoding.
      # 
      # To be used with #pos, which is the index of the byte the scanner
      # will scan next.
      def binary_string
        @binary_string ||=
          if string.respond_to?(:bytesize) && string.bytesize != string.size
            #:nocov:
            string.dup.force_encoding('binary')
            #:nocov:
          else
            string
          end
      end
      
    protected
      
      # Can be implemented by subclasses to do some initialization
      # that has to be done once per instance.
      #
      # Use reset for initialization that has to be done once per
      # scan.
      def setup  # :doc:
      end
      
      def set_string_from_source source
        case source
        when Array
          self.string = self.class.normalize(source.join)
        when nil
          reset
        else
          self.string = self.class.normalize(source)
        end
      end
      
      def set_tokens_from_options options
        @tokens = options[:tokens] || @tokens || Tokens.new
        @tokens.scanner = self if @tokens.respond_to? :scanner=
      end
      
      # This is the central method, and commonly the only one a
      # subclass implements.
      #
      # Subclasses must implement this method; it must return +tokens+
      # and must only use Tokens#<< for storing scanned tokens!
      def scan_tokens tokens, options  # :doc:
        raise NotImplementedError, "#{self.class}#scan_tokens not implemented."
      end
      
      # Resets the scanner.
      def reset_instance
        @tokens.clear if @tokens.respond_to?(:clear) && !@options[:keep_tokens]
        @cached_tokens = nil
        @binary_string = nil if defined? @binary_string
      end
      
      SCAN_ERROR_MESSAGE = <<-MESSAGE


***ERROR in %s: %s (after %s tokens)

tokens:
%s

%s

surrounding code:
%p  ~~  %p


***ERROR***

      MESSAGE
      
      def raise_inspect_arguments message, tokens, state, ambit
        return File.basename(caller[0]),
          message,
          tokens_size(tokens),
          tokens_last(tokens, 10).map(&:inspect).join("\n"),
          scanner_state_info(state),
          binary_string[pos - ambit, ambit],
          binary_string[pos, ambit]
      end
      
      SCANNER_STATE_INFO = <<-INFO
current line: %d  column: %d  pos: %d
matched: %p  state: %p
bol?: %p,  eos?: %p
      INFO
      
      def scanner_state_info state
        SCANNER_STATE_INFO % [
          line, column, pos,
          matched, state || 'No state given!',
          bol?, eos?,
        ]
      end
      
      # Scanner error with additional status information
      def raise_inspect message, tokens, state = self.state, ambit = 30, backtrace = caller
        raise ScanError, SCAN_ERROR_MESSAGE % raise_inspect_arguments(message, tokens, state, ambit), backtrace
      end
      
      def tokens_size tokens
        tokens.size if tokens.respond_to?(:size)
      end
      
      def tokens_last tokens, n
        tokens.respond_to?(:last) ? tokens.last(n) : []
      end
      
      # Shorthand for scan_until(/\z/).
      # This method also avoids a JRuby 1.9 mode bug.
      def scan_rest
        rest = self.rest
        terminate
        rest
      end
      
    end
    
  end
end