lib/syntax/common.rb



require 'strscan'

module Syntax

  # A single token extracted by a tokenizer. It is simply the lexeme
  # itself, decorated with a 'group' attribute to identify the type of the
  # lexeme.
  class Token < String

    # the type of the lexeme that was extracted.
    attr_reader :group

    # the instruction associated with this token (:none, :region_open, or
    # :region_close)
    attr_reader :instruction

    # Create a new Token representing the given text, and belonging to the
    # given group.
    def initialize( text, group, instruction = :none )
      super text
      @group = group
      @instruction = instruction
    end

  end

  # The base class of all tokenizers. It sets up the scanner and manages the
  # looping until all tokens have been extracted. It also provides convenience
  # methods to make sure adjacent tokens of identical groups are returned as
  # a single token.
  class Tokenizer

    # The current group being processed by the tokenizer
    attr_reader :group

    # The current chunk of text being accumulated
    attr_reader :chunk

    # Start tokenizing. This sets up the state in preparation for tokenization,
    # such as creating a new scanner for the text and saving the callback block.
    # The block will be invoked for each token extracted.
    def start( text, &block )
      @chunk = "".dup
      @group = :normal
      @callback = block
      @text = StringScanner.new( text )
      setup
    end

    # Subclasses may override this method to provide implementation-specific
    # setup logic.
    def setup
    end

    # Finish tokenizing. This flushes the buffer, yielding any remaining text
    # to the client.
    def finish
      start_group nil
      teardown
    end

    # Subclasses may override this method to provide implementation-specific
    # teardown logic.
    def teardown
    end

    # Subclasses must implement this method, which is called for each iteration
    # of the tokenization process. This method may extract multiple tokens.
    def step
      raise NotImplementedError, "subclasses must implement #step"
    end

    # Begins tokenizing the given text, calling #step until the text has been
    # exhausted.
    def tokenize( text, &block )
      start text, &block
      step until @text.eos?
      finish
    end

    # Specify a set of tokenizer-specific options. Each tokenizer may (or may
    # not) publish any options, but if a tokenizer does those options may be
    # used to specify optional behavior.
    def set( opts={} )
      ( @options ||= Hash.new ).update opts
    end

    # Get the value of the specified option.
    def option(opt)
      @options ? @options[opt] : nil
    end

    private

      EOL = /(?=\r\n?|\n|$)/

      # A convenience for delegating method calls to the scanner.
      def self.delegate( sym )
        define_method( sym ) { |*a| @text.__send__( sym, *a ) }
      end

      delegate :bol?
      delegate :eos?
      delegate :scan
      delegate :scan_until
      delegate :check
      delegate :check_until
      delegate :getch
      delegate :matched
      delegate :pre_match
      delegate :peek
      delegate :pos

      # Access the n-th subgroup from the most recent match.
      def subgroup(n)
        @text[n]
      end

      # Append the given data to the currently active chunk.
      def append( data )
        @chunk << data
      end

      # Request that a new group be started. If the current group is the same
      # as the group being requested, a new group will not be created. If a new
      # group is created and the current chunk is not empty, the chunk's
      # contents will be yielded to the client as a token, and then cleared.
      #
      # After the new group is started, if +data+ is non-nil it will be appended
      # to the chunk.
      def start_group( gr, data=nil )
        flush_chunk if gr != @group
        @group = gr
        @chunk << data if data
      end

      def start_region( gr, data=nil )
        flush_chunk
        @group = gr
        @callback.call( Token.new( data||"", @group, :region_open ) )
      end

      def end_region( gr, data=nil )
        flush_chunk
        @group = gr
        @callback.call( Token.new( data||"", @group, :region_close ) )
      end

      def flush_chunk
        @callback.call( Token.new( @chunk, @group ) ) unless @chunk.empty?
        @chunk = "".dup
      end

      def subtokenize( syntax, text )
        tokenizer = Syntax.load( syntax )
        tokenizer.set @options if @options
        flush_chunk
        tokenizer.tokenize( text, &@callback )
      end

  end

end