lib/rdoc/markup/parser.rb



require 'strscan'
require 'rdoc/text'

##
# A recursive-descent parser for RDoc markup.
#
# The parser tokenizes an input string then parses the tokens into a Document.
# Documents can be converted into output formats by writing a visitor like
# RDoc::Markup::ToHTML.
#
# The parser only handles the block-level constructs Paragraph, List,
# ListItem, Heading, Verbatim, BlankLine and Rule.  Inline markup such as
# <tt>\+blah\+</tt> is handled separately by RDoc::Markup::AttributeManager.
#
# To see what markup the Parser implements read RDoc.  To see how to use
# RDoc markup to format text in your program read RDoc::Markup.
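#
# A minimal usage sketch (assumes RDoc::Markup is already loaded, for
# example via <tt>require 'rdoc/markup'</tt>):
#
#   document = RDoc::Markup::Parser.parse "= Heading\n\nSome paragraph text."
#   document.class # => RDoc::Markup::Document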

class RDoc::Markup::Parser

  include RDoc::Text

  ##
  # List token types

  LIST_TOKENS = [
    :BULLET,
    :LABEL,
    :LALPHA,
    :NOTE,
    :NUMBER,
    :UALPHA,
  ]

  ##
  # Parser error subclass

  class Error < RuntimeError; end

  ##
  # Raised when the parser is unable to handle the given markup

  class ParseError < Error; end

  ##
  # Enables display of debugging information

  attr_accessor :debug

  ##
  # Token accessor

  attr_reader :tokens

  ##
  # Parses +str+ into a Document

  def self.parse str
    parser = new
    #parser.debug = true
    parser.tokenize str
    RDoc::Markup::Document.new(*parser.parse)
  end

  ##
  # Returns a token stream for +str+, for testing
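  #
  # Each token is an Array of the form <tt>[type, data, column, line]</tt>,
  # so, for example:
  #
  #   RDoc::Markup::Parser.tokenize "= Heading\n"
  #   # => [[:HEADER, 1, 0, 0], [:TEXT, "Heading", 2, 0], [:NEWLINE, "\n", 9, 0]]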

  def self.tokenize str
    parser = new
    parser.tokenize str
    parser.tokens
  end

  ##
  # Creates a new Parser.  See also ::parse

  def initialize
    @tokens = []
    @current_token = nil
    @debug = false

    @line = 0
    @line_pos = 0
  end

  ##
  # Builds a Heading of +level+

  def build_heading level
    heading = RDoc::Markup::Heading.new level, text
    skip :NEWLINE

    heading
  end

  ##
  # Builds a List flush to +margin+

  def build_list margin
    p :list_start => margin if @debug

    list = RDoc::Markup::List.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :BULLET, :LABEL, :LALPHA, :NOTE, :NUMBER, :UALPHA then
        list_type = type

        if column < margin then
          unget
          break
        end

        if list.type and list.type != list_type then
          unget
          break
        end

        list.type = list_type

        case type
        when :NOTE, :LABEL then
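          # a :NOTE or :LABEL may have its text on the following line; in
          # that case the width of that line's :INDENT becomes the item
          # indentation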
          _, indent, = get # SPACE
          if :NEWLINE == peek_token.first then
            get
            peek_type, new_indent, peek_column, = peek_token
            indent = new_indent if
              peek_type == :INDENT and peek_column >= column
            unget
          end
        else
          data = nil
          _, indent, = get
        end

        list_item = build_list_item(margin + indent, data)

        list << list_item if list_item
      else
        unget
        break
      end
    end

    p :list_end => margin if @debug

    return nil if list.empty?

    list
  end

  ##
  # Builds a ListItem that is flush to +indent+ with type +item_type+

  def build_list_item indent, item_type = nil
    p :list_item_start => [indent, item_type] if @debug

    list_item = RDoc::Markup::ListItem.new item_type

    until @tokens.empty? do
      type, data, column = get

      if column < indent and
         not type == :NEWLINE and
         (type != :INDENT or data < indent) then
        unget
        break
      end

      case type
      when :INDENT then
        unget
        list_item.push(*parse(indent))
      when :TEXT then
        unget
        list_item << build_paragraph(indent)
      when :HEADER then
        list_item << build_heading(data)
      when :NEWLINE then
        list_item << RDoc::Markup::BlankLine.new
      when *LIST_TOKENS then
        unget
        list_item << build_list(column)
      else
        raise ParseError, "Unhandled token #{@current_token.inspect}"
      end
    end

    p :list_item_end => [indent, item_type] if @debug

    return nil if list_item.empty?

    list_item.parts.shift if
      RDoc::Markup::BlankLine === list_item.parts.first and
      list_item.length > 1

    list_item
  end

  ##
  # Builds a Paragraph that is flush to +margin+

  def build_paragraph margin
    p :paragraph_start => margin if @debug

    paragraph = RDoc::Markup::Paragraph.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :INDENT then
        next if data == margin and peek_token[0] == :TEXT

        unget
        break
      when :TEXT then
        if column != margin then
          unget
          break
        end

        paragraph << data
        skip :NEWLINE
      else
        unget
        break
      end
    end

    p :paragraph_end => margin if @debug

    paragraph
  end

  ##
  # Builds a Verbatim that is flush to +margin+

  def build_verbatim margin
    p :verbatim_begin => margin if @debug
    verbatim = RDoc::Markup::Verbatim.new

    until @tokens.empty? do
      type, data, column, = get

      case type
      when :INDENT then
        if margin >= data then
          unget
          break
        end

        indent = data - margin

        verbatim << ' ' * indent
      when :HEADER then
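        # reconstruct the '=' signs of a heading and the spacing before its
        # text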
        verbatim << '=' * data

        _, _, peek_column, = peek_token
        peek_column ||= column + data
        verbatim << ' ' * (peek_column - column - data)
      when :RULE then
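        # reconstruct the dashes of a rule; a :RULE token's data is the dash
        # count minus two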
        width = 2 + data
        verbatim << '-' * width

        _, _, peek_column, = peek_token
        peek_column ||= column + data + 2
        verbatim << ' ' * (peek_column - column - width)
      when :TEXT then
        verbatim << data
      when *LIST_TOKENS then
        if column <= margin then
          unget
          break
        end

        list_marker = case type
                      when :BULLET                   then data
                      when :LABEL                    then "[#{data}]"
                      when :LALPHA, :NUMBER, :UALPHA then "#{data}."
                      when :NOTE                     then "#{data}::"
                      end

        verbatim << list_marker

        _, data, = get

        verbatim << ' ' * (data - list_marker.length)
      when :NEWLINE then
        verbatim << data
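        # the verbatim section continues only while the following line is
        # indented or blank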
        break unless [:INDENT, :NEWLINE].include? peek_token[0]
      else
        unget
        break
      end
    end

    verbatim.normalize

    p :verbatim_end => margin if @debug

    verbatim
  end

  ##
  # Pulls the next token from the stream.

  def get
    @current_token = @tokens.shift
    p :get => @current_token if @debug
    @current_token
  end

  ##
  # Parses the token stream into an Array of document parts.  ::parse wraps
  # the result in an RDoc::Markup::Document.

  def parse indent = 0
    p :parse_start => indent if @debug

    document = []

    until @tokens.empty? do
      type, data, column, = get

      if type != :INDENT and column < indent then
        unget
        break
      end

      case type
      when :HEADER then
        document << build_heading(data)
      when :INDENT then
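        # deeper indentation starts a verbatim section, equal indentation is
        # skipped and shallower indentation ends this parse level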
        if indent > data then
          unget
          break
        elsif indent == data then
          next
        end

        unget
        document << build_verbatim(indent)
      when :NEWLINE then
        document << RDoc::Markup::BlankLine.new
        skip :NEWLINE, false
      when :RULE then
        document << RDoc::Markup::Rule.new(data)
        skip :NEWLINE
      when :TEXT then
        unget
        document << build_paragraph(indent)

        # we're done with this paragraph (indent mismatch)
        break if peek_token[0] == :TEXT
      when *LIST_TOKENS then
        unget

        list = build_list(indent)

        document << list if list

        # we're done with this list (indent mismatch)
        break if LIST_TOKENS.include? peek_token.first and indent > 0
      else
        type, data, column, line = @current_token
        raise ParseError,
              "Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
      end
    end

    p :parse_end => indent if @debug

    document
  end

  ##
  # Returns the next token on the stream without modifying the stream, or an
  # empty Array at the end of the stream

  def peek_token
    token = @tokens.first || []
    p :peek => token if @debug
    token
  end

  ##
  # Skips a token of +token_type+, optionally raising an error.

  def skip token_type, error = true
    type, data, = get

    return unless type # end of stream

    return @current_token if token_type == type

    unget

    raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if
      error
  end

  ##
  # Consumes tokens until NEWLINE and turns them back into text
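  #
  # List-start tokens are converted back into their markup; a :BULLET token
  # followed by a two-column :SPACE token, for example, becomes <tt>"* "</tt>.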

  def text
    text = ''

    loop do
      type, data, = get

      text << case type
              when :BULLET then
                _, space, = get # SPACE
                "*#{' ' * (space - 1)}"
              when :LABEL then
                _, space, = get # SPACE
                "[#{data}]#{' ' * (space - data.length - 2)}"
              when :LALPHA, :NUMBER, :UALPHA then
                _, space, = get # SPACE
                "#{data}.#{' ' * (space - 2)}"
              when :NOTE then
                _, space = get # SPACE
                "#{data}::#{' ' * (space - data.length - 2)}"
              when :TEXT then
                data
              when :NEWLINE then
                unget
                break
              when nil then
                break
              else
                raise ParseError, "unhandled token #{@current_token.inspect}"
              end
    end

    text
  end

  ##
  # Calculates the column and line of the current token from +offset+, the
  # token's position in the input.  Returns <tt>[column, line]</tt>.

  def token_pos offset
    [offset - @line_pos, @line]
  end

  ##
  # Turns text +input+ into a stream of tokens
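  #
  # The token types produced are :NEWLINE, :INDENT, :HEADER, :TEXT, :RULE,
  # :BULLET, :LALPHA, :UALPHA, :NUMBER, :LABEL, :NOTE and :SPACE (see the
  # +case+ below).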

  def tokenize input
    s = StringScanner.new input

    @line = 0
    @line_pos = 0

    until s.eos? do
      pos = s.pos

      @tokens << case
                 when s.scan(/\r?\n/) then
                   token = [:NEWLINE, s.matched, *token_pos(pos)]
                   @line_pos = s.pos
                   @line += 1
                   token
                 when s.scan(/ +/) then
                   [:INDENT, s.matched_size, *token_pos(pos)]
                 when s.scan(/(=+)\s*/) then
                   level = s[1].length
                   level = 6 if level > 6
                   @tokens << [:HEADER, level, *token_pos(pos)]

                   pos = s.pos
                   s.scan(/.*/)
                   [:TEXT, s.matched, *token_pos(pos)]
                 when s.scan(/^(-{3,}) *$/) then
                   [:RULE, s[1].length - 2, *token_pos(pos)]
                 when s.scan(/([*-])\s+/) then
                   @tokens << [:BULLET, s[1], *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 when s.scan(/([a-z]|\d+)\.[ \t]+\S/i) then
                   list_label = s[1]
                   width      = s.matched_size - 1

                   s.pos -= 1 # unget \S

                   list_type = case list_label
                               when /[a-z]/ then :LALPHA
                               when /[A-Z]/ then :UALPHA
                               when /\d/    then :NUMBER
                               else
                                 raise ParseError, "BUG token #{list_label}"
                               end

                   @tokens << [list_type, list_label, *token_pos(pos)]
                   [:SPACE, width, *token_pos(pos)]
                 when s.scan(/\[(.*?)\]( +|$)/) then
                   @tokens << [:LABEL, s[1], *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 when s.scan(/(.*?)::( +|$)/) then
                   @tokens << [:NOTE, s[1], *token_pos(pos)]
                   [:SPACE, s.matched_size, *token_pos(pos)]
                 else
                   s.scan(/.*/)
                   [:TEXT, s.matched, *token_pos(pos)]
                 end
    end

    self
  end

  ##
  # Returns +token+ (the current token by default) to the front of the token
  # stream

  def unget token = @current_token
    p :unget => token if @debug
    raise Error, 'too many #ungets' if token == @tokens.first
    @tokens.unshift token if token
  end

end

require 'rdoc/markup/blank_line'
require 'rdoc/markup/document'
require 'rdoc/markup/heading'
require 'rdoc/markup/list'
require 'rdoc/markup/list_item'
require 'rdoc/markup/raw'
require 'rdoc/markup/paragraph'
require 'rdoc/markup/rule'
require 'rdoc/markup/verbatim'