# lib/syntax/lang/ruby.rb



require 'syntax'
require 'set'

module Syntax

  # A tokenizer for the Ruby language. It recognizes all common syntax
  # (and some less common syntax) but because it is not a true lexer, it
  # will make mistakes on some ambiguous cases.
  class Ruby < Tokenizer

    # Identifiers that are reported as keywords. Kept in a Set so that the
    # membership test performed for every scanned word is a hash lookup.
    KEYWORDS = Set.new(
      %w[
        if then elsif else end begin do rescue ensure while for
        class module def yield raise until unless and or not when
        case super undef break next redo retry in return alias
        defined?
      ]
    )

    # Reset the Ruby-specific scanning state:
    #
    # * <tt>@selector</tt> - whether the previous token was a single "."
    # * <tt>@allow_operator</tt> - whether a "/" would be read as an operator
    #   instead of the start of a regex
    # * <tt>@heredocs</tt> - heredoc openers seen on the current line whose
    #   bodies have not been scanned yet
    def setup
      @selector = @allow_operator = false
      @heredocs = Array.new
    end

    # Step through a single iteration of the tokenization process.
    #
    # One call recognizes the construct at the current scan position and
    # emits it through +start_group+/+append+, or hands it off to
    # +scan_delimited_region+/+scan_heredoc+. Three pieces of state carry
    # context from one call to the next:
    #
    # * <tt>@selector</tt> is true right after a single ".", so that the
    #   name that follows is reported as an identifier (a method call) and
    #   never as a keyword, constant or definition.
    # * <tt>@allow_operator</tt> is true after a token that can end an
    #   expression; it decides whether "/" is an operator or opens a regex.
    # * <tt>@heredocs</tt> queues heredoc openers; their bodies are scanned
    #   when the line break after the opener is reached.
    def step
      case
        when bol? && check( /=begin/ )
          # An unterminated =begin block extends to the end of the input.
          # Without the fallback scan_until finds nothing and no input would
          # be consumed by this step.
          start_group( :comment,
            scan_until( /^=end#{EOL}/ ) || scan_until( /\Z/ ) )
        when bol? && check( /__END__#{EOL}/ )
          start_group( :comment, scan_until( /\Z/ ) )

        # "def", "class" and "module" only introduce a definition when they
        # are not a method name: "obj.class == Foo" is a call.
        when !@selector && check( /def\s+/ )
          start_group :keyword, scan( /def\s+/ )
          start_group :method,  scan_until( /(?=[;(\s]|#{EOL})/ )
        when !@selector && check( /class\s+/ )
          start_group :keyword, scan( /class\s+/ )
          start_group :class,  scan_until( /(?=[;\s<]|#{EOL})/ )
        when !@selector && check( /module\s+/ )
          start_group :keyword, scan( /module\s+/ )
          start_group :module,  scan_until( /(?=[;\s]|#{EOL})/ )

        when check( /::/ )
          start_group :punct, scan(/::/)
        when check( /:"/ )
          start_group :symbol, scan(/:/)
          scan_delimited_region :symbol, :symbol, "", true
          @allow_operator = true
        when check( /:'/ )
          start_group :symbol, scan(/:/)
          scan_delimited_region :symbol, :symbol, "", false
          @allow_operator = true
        when scan( /:[_a-zA-Z@$][$@\w]*[=!?]?/ )
          start_group :symbol, matched
          @allow_operator = true
        when scan( /\?(\\[^\n\r]|[^\\\n\r\s])/ )
          start_group :char, matched
          @allow_operator = true

        # The negative lookahead keeps identifiers that merely start with
        # one of these words ("selfish", "nil_value") out of this branch.
        when check( /(__FILE__|__LINE__|true|false|nil|self)(?!\w)[?!]?/ )
          if @selector || matched.end_with?( "?", "!" )
            start_group :ident,
              scan(/(__FILE__|__LINE__|true|false|nil|self)[?!]?/)
          else
            start_group :constant,
              scan(/(__FILE__|__LINE__|true|false|nil|self)/)
          end
          @selector = false
          @allow_operator = true
        when scan(/0([bB][01]+|[oO][0-7]+|[dD][0-9]+|[xX][0-9a-fA-F]+)/)
          start_group :number, matched
          @allow_operator = true
        else
          case peek(2)
            when "%r"
              scan_delimited_region :punct, :regex, scan( /../ ), true
              @allow_operator = true
            when "%w", "%q"
              scan_delimited_region :punct, :string, scan( /../ ), false
              @allow_operator = true
            when "%s"
              scan_delimited_region :punct, :symbol, scan( /../ ), false
              @allow_operator = true
            when "%W", "%Q", "%x"
              scan_delimited_region :punct, :string, scan( /../ ), true
              @allow_operator = true
            when /%[^\sa-zA-Z0-9]/
              scan_delimited_region :punct, :string, scan( /./ ), true
              @allow_operator = true
            when "<<"
              # "<<" glued to a preceding word is the shift/append operator,
              # not the start of a heredoc.
              saw_word = ( chunk[-1,1] =~ /[\w!?]/ )
              start_group :punct, scan( /<</ )
              if saw_word
                @allow_operator = false
                return
              end

              # "<<-ID" and "<<~ID" both allow an indented terminator.
              float_right = scan( /[-~]/ )
              append float_right if float_right
              if ( type = scan( /['"]/ ) )
                append type
                delim = scan_until( /(?=#{type})/ )
                if delim.nil?
                  # The quoted heredoc identifier is never closed.
                  append scan_until( /\Z/ )
                  return
                end
              else
                delim = scan( /\w+/ ) or return
              end
              start_group :constant, delim
              start_group :punct, scan( /#{type}/ ) if type
              @heredocs << [ float_right, type, delim ]
              @allow_operator = true
            else
              case peek(1)
                when /[\n\r]/
                  # The line holding a heredoc opener has ended, so the
                  # heredoc body starts here.
                  if @heredocs.empty?
                    start_group :normal, scan( /\s+/ )
                  else
                    scan_heredoc(*@heredocs.shift)
                  end
                  @allow_operator = false
                when /\s/
                  start_group :normal, scan( /\s+/ )
                when "#"
                  start_group :comment, scan( /#[^\n\r]*/ )
                when /[A-Z]/
                  start_group @selector ? :ident : :constant, scan( /\w+/ )
                  # The name after the "." has been consumed; later tokens
                  # are no longer part of the selector.
                  @selector = false
                  @allow_operator = true
                when /[a-z_]/
                  word = scan( /\w+[?!]?/ )
                  if !@selector && KEYWORDS.include?( word )
                    start_group :keyword, word
                    @allow_operator = false
                  else
                    start_group :ident, word
                    @allow_operator = true
                  end
                  @selector = false
                when /\d/
                  # The exponent may carry a sign, as in 1e-5.
                  start_group :number,
                    scan( /[\d_]+(\.[\d_]+)?([eE][-+]?[\d_]+)?/ )
                  @allow_operator = true
                when '"'
                  scan_delimited_region :punct, :string, "", true
                  @allow_operator = true
                when '/'
                  if @allow_operator
                    start_group :punct, scan(%r{/})
                    @allow_operator = false
                  else
                    scan_delimited_region :punct, :regex, "", true
                    @allow_operator = true
                  end
                when "'"
                  scan_delimited_region :punct, :string, "", false
                  @allow_operator = true
                when "."
                  dots = scan( /\.{1,3}/ )
                  start_group :punct, dots
                  # Only a single dot is a method selector; ".." and "..."
                  # are range operators.
                  @selector = ( dots.length == 1 )
                when /[@]/
                  start_group :attribute, scan( /@{1,2}\w*/ )
                  @allow_operator = true
                when /[$]/
                  start_group :global, scan(/\$/)
                  start_group :global, scan( /\w+|./ ) if check(/./)
                  @allow_operator = true
                when /[-!?*\/+=<>(\[\{}:;,&|%]/
                  start_group :punct, scan(/./)
                  @allow_operator = false
                when /[)\]]/
                  start_group :punct, scan(/./)
                  @allow_operator = true
                else
                  # all else just falls through this, to prevent
                  # infinite loops...
                  append getch
              end
          end
      end
    end

    private

      # Scan a delimited region of text. This handles the simple cases (strings
      # delimited with quotes) as well as the more complex cases of %-strings
      # and here-documents.
      #
      # * +delim_group+ is the group to use to classify the delimiters of the
      #   region
      # * +inner_group+ is the group to use to classify the contents of the
      #   region
      # * +starter+ is the text to use as the starting delimiter
      # * +exprs+ is a boolean flag indicating whether the region is an
      #   interpolated string or not
      # * +delim+ is the text to use as the delimiter of the region. If +nil+,
      #   the next character will be treated as the delimiter.
      # * +heredoc+ is either +false+, meaning the region is not a heredoc, or
      #   <tt>:flush</tt> (meaning the delimiter must be flushed left), or
      #   <tt>:float</tt> (meaning the delimiter doens't have to be flush left).
      def scan_delimited_region( delim_group, inner_group, starter, exprs,
        delim=nil, heredoc=false )
      # begin
        if !delim
          start_group delim_group, starter
          delim = scan( /./ )
          append delim

          delim = case delim
            when '{' then '}'
            when '(' then ')'
            when '[' then ']'
            when '<' then '>'
            else delim
          end
        end

        start_region inner_group

        items = "\\\\|".dup
        if heredoc
          items << "(^"
          items << '\s*' if heredoc == :float
          items << "#{Regexp.escape(delim)}\s*?)#{EOL}"
        else
          items << "#{Regexp.escape(delim)}"
        end
        items << "|#(\\$|@@?|\\{)" if exprs
        items = Regexp.new( items )

        loop do
          p = pos
          match = scan_until( items )
          if match.nil?
            start_group inner_group, scan_until( /\Z/ )
            break
          else
            text = pre_match[p..-1]
            start_group inner_group, text if text.length > 0
            case matched.strip
              when "\\"
                unless exprs
                  case peek(1)
                    when "'"
                      scan(/./)
                      start_group :escape, "\\'"
                    when "\\"
                      scan(/./)
                      start_group :escape, "\\\\"
                    else
                      start_group inner_group, "\\"
                  end
                else
                  start_group :escape, "\\"
                  c = getch
                  append c
                  case c
                    when 'x'
                      append scan( /[a-fA-F0-9]{1,2}/ )
                    when /[0-7]/
                      append scan( /[0-7]{0,2}/ )
                  end
                end
              when delim
                end_region inner_group
                start_group delim_group, matched
                break
              when /^#/
                do_highlight = (option(:expressions) == :highlight)
                start_region :expr if do_highlight
                start_group :expr, matched
                case matched[1]
                  when ?{
                    depth = 1
                    content = "".dup
                    while depth > 0
                      p = pos
                      c = scan_until( /[\{}]/ )
                      if c.nil?
                        content << scan_until( /\Z/ )
                        break
                      else
                        depth += ( matched == "{" ? 1 : -1 )
                        content << pre_match[p..-1]
                        content << matched if depth > 0
                      end
                    end
                    if do_highlight
                      subtokenize "ruby", content
                      start_group :expr, "}"
                    else
                      append content + "}"
                    end
                  when ?$, ?@
                    append scan( /\w+/ )
                end
                end_region :expr if do_highlight
              else raise "unexpected match on #{matched}"
            end
          end
        end
      end

      # Scan the body of a heredoc that starts at the current position,
      # delegating to +scan_delimited_region+ with <tt>:constant</tt> as the
      # delimiter group and <tt>:string</tt> as the content group.
      #
      # * +float+ is truthy when the terminator may be indented
      # * +type+ is the quote used around the heredoc identifier (+nil+ when
      #   unquoted); only a single quote turns interpolation off
      # * +delim+ is the terminator to look for
      def scan_heredoc(float, type, delim)
        interpolated = ( type != "'" )
        placement = float ? :float : :flush
        scan_delimited_region( :constant, :string, "", interpolated,
          delim, placement )
      end
  end

  # Register the tokenizer class under the "ruby" key of SYNTAX.
  # NOTE(review): SYNTAX is defined outside this file; presumably it is the
  # registry used to look a tokenizer up by language name - confirm.
  SYNTAX["ruby"] = Ruby

end