require 'strscan'
require 'rdoc/text'
##
# A recursive-descent parser for RDoc markup.
#
# The parser tokenizes an input string then parses the tokens into a Document.
# Documents can be converted into output formats by writing a visitor like
# RDoc::Markup::ToHTML.
#
# The parser only handles the block-level constructs Paragraph, List,
# ListItem, Heading, Verbatim, BlankLine and Rule. Inline markup such as
# <tt>\+blah\+</tt> is handled separately by RDoc::Markup::AttributeManager.
#
# To see what markup the Parser implements read RDoc. To see how to use
# RDoc markup to format text in your program read RDoc::Markup.
class RDoc::Markup::Parser
include RDoc::Text
##
# List token types
LIST_TOKENS = [
:BULLET,
:LABEL,
:LALPHA,
:NOTE,
:NUMBER,
:UALPHA,
]
##
# Parser error subclass
class Error < RuntimeError; end
##
# Raised when the parser is unable to handle the given markup
class ParseError < Error; end
##
# Enables display of debugging information
attr_accessor :debug
##
# Token accessor
attr_reader :tokens
##
# Parsers +str+ into a Document
def self.parse str
parser = new
#parser.debug = true
parser.tokenize str
RDoc::Markup::Document.new(*parser.parse)
end
##
# Returns a token stream for +str+, for testing
def self.tokenize str
parser = new
parser.tokenize str
parser.tokens
end
##
# Creates a new Parser. See also ::parse
def initialize
@tokens = []
@current_token = nil
@debug = false
@line = 0
@line_pos = 0
end
##
# Builds a Heading of +level+
def build_heading level
heading = RDoc::Markup::Heading.new level, text
skip :NEWLINE
heading
end
##
# Builds a List flush to +margin+
def build_list margin
p :list_start => margin if @debug
list = RDoc::Markup::List.new
until @tokens.empty? do
type, data, column, = get
case type
when :BULLET, :LABEL, :LALPHA, :NOTE, :NUMBER, :UALPHA then
list_type = type
if column < margin then
unget
break
end
if list.type and list.type != list_type then
unget
break
end
list.type = list_type
case type
when :NOTE, :LABEL then
_, indent, = get # SPACE
if :NEWLINE == peek_token.first then
get
peek_type, new_indent, peek_column, = peek_token
indent = new_indent if
peek_type == :INDENT and peek_column >= column
unget
end
else
data = nil
_, indent, = get
end
list_item = build_list_item(margin + indent, data)
list << list_item if list_item
else
unget
break
end
end
p :list_end => margin if @debug
return nil if list.empty?
list
end
##
# Builds a ListItem that is flush to +indent+ with type +item_type+
def build_list_item indent, item_type = nil
p :list_item_start => [indent, item_type] if @debug
list_item = RDoc::Markup::ListItem.new item_type
until @tokens.empty? do
type, data, column = get
if column < indent and
not type == :NEWLINE and
(type != :INDENT or data < indent) then
unget
break
end
case type
when :INDENT then
unget
list_item.push(*parse(indent))
when :TEXT then
unget
list_item << build_paragraph(indent)
when :HEADER then
list_item << build_heading(data)
when :NEWLINE then
list_item << RDoc::Markup::BlankLine.new
when *LIST_TOKENS then
unget
list_item << build_list(column)
else
raise ParseError, "Unhandled token #{@current_token.inspect}"
end
end
p :list_item_end => [indent, item_type] if @debug
return nil if list_item.empty?
list_item.parts.shift if
RDoc::Markup::BlankLine === list_item.parts.first and
list_item.length > 1
list_item
end
##
# Builds a Paragraph that is flush to +margin+
def build_paragraph margin
p :paragraph_start => margin if @debug
paragraph = RDoc::Markup::Paragraph.new
until @tokens.empty? do
type, data, column, = get
case type
when :INDENT then
next if data == margin and peek_token[0] == :TEXT
unget
break
when :TEXT then
if column != margin then
unget
break
end
paragraph << data
skip :NEWLINE
else
unget
break
end
end
p :paragraph_end => margin if @debug
paragraph
end
##
# Builds a Verbatim that is flush to +margin+
def build_verbatim margin
p :verbatim_begin => margin if @debug
verbatim = RDoc::Markup::Verbatim.new
until @tokens.empty? do
type, data, column, = get
case type
when :INDENT then
if margin >= data then
unget
break
end
indent = data - margin
verbatim << ' ' * indent
when :HEADER then
verbatim << '=' * data
_, _, peek_column, = peek_token
peek_column ||= column + data
verbatim << ' ' * (peek_column - column - data)
when :RULE then
width = 2 + data
verbatim << '-' * width
_, _, peek_column, = peek_token
peek_column ||= column + data + 2
verbatim << ' ' * (peek_column - column - width)
when :TEXT then
verbatim << data
when *LIST_TOKENS then
if column <= margin then
unget
break
end
list_marker = case type
when :BULLET then data
when :LABEL then "[#{data}]"
when :LALPHA, :NUMBER, :UALPHA then "#{data}."
when :NOTE then "#{data}::"
end
verbatim << list_marker
_, data, = get
verbatim << ' ' * (data - list_marker.length)
when :NEWLINE then
verbatim << data
break unless [:INDENT, :NEWLINE].include? peek_token[0]
else
unget
break
end
end
verbatim.normalize
p :verbatim_end => margin if @debug
verbatim
end
##
# Pulls the next token from the stream.
def get
@current_token = @tokens.shift
p :get => @current_token if @debug
@current_token
end
##
# Parses the tokens into a Document
def parse indent = 0
p :parse_start => indent if @debug
document = []
until @tokens.empty? do
type, data, column, = get
if type != :INDENT and column < indent then
unget
break
end
case type
when :HEADER then
document << build_heading(data)
when :INDENT then
if indent > data then
unget
break
elsif indent == data then
next
end
unget
document << build_verbatim(indent)
when :NEWLINE then
document << RDoc::Markup::BlankLine.new
skip :NEWLINE, false
when :RULE then
document << RDoc::Markup::Rule.new(data)
skip :NEWLINE
when :TEXT then
unget
document << build_paragraph(indent)
# we're done with this paragraph (indent mismatch)
break if peek_token[0] == :TEXT
when *LIST_TOKENS then
unget
list = build_list(indent)
document << list if list
# we're done with this list (indent mismatch)
break if LIST_TOKENS.include? peek_token.first and indent > 0
else
type, data, column, line = @current_token
raise ParseError,
"Unhandled token #{type} (#{data.inspect}) at #{line}:#{column}"
end
end
p :parse_end => indent if @debug
document
end
##
# Returns the next token on the stream without modifying the stream
def peek_token
token = @tokens.first || []
p :peek => token if @debug
token
end
##
# Skips a token of +token_type+, optionally raising an error.
def skip token_type, error = true
type, data, = get
return unless type # end of stream
return @current_token if token_type == type
unget
raise ParseError, "expected #{token_type} got #{@current_token.inspect}" if
error
end
##
# Consumes tokens until NEWLINE and turns them back into text
def text
text = ''
loop do
type, data, = get
text << case type
when :BULLET then
_, space, = get # SPACE
"*#{' ' * (space - 1)}"
when :LABEL then
_, space, = get # SPACE
"[#{data}]#{' ' * (space - data.length - 2)}"
when :LALPHA, :NUMBER, :UALPHA then
_, space, = get # SPACE
"#{data}.#{' ' * (space - 2)}"
when :NOTE then
_, space = get # SPACE
"#{data}::#{' ' * (space - data.length - 2)}"
when :TEXT then
data
when :NEWLINE then
unget
break
when nil then
break
else
raise ParseError, "unhandled token #{@current_token.inspect}"
end
end
text
end
##
# Calculates the column and line of the current token based on +offset+.
def token_pos offset
[offset - @line_pos, @line]
end
##
# Turns text +input+ into a stream of tokens
def tokenize input
s = StringScanner.new input
@line = 0
@line_pos = 0
until s.eos? do
pos = s.pos
@tokens << case
when s.scan(/\r?\n/) then
token = [:NEWLINE, s.matched, *token_pos(pos)]
@line_pos = s.pos
@line += 1
token
when s.scan(/ +/) then
[:INDENT, s.matched_size, *token_pos(pos)]
when s.scan(/(=+)\s*/) then
level = s[1].length
level = 6 if level > 6
@tokens << [:HEADER, level, *token_pos(pos)]
pos = s.pos
s.scan(/.*/)
[:TEXT, s.matched, *token_pos(pos)]
when s.scan(/^(-{3,}) *$/) then
[:RULE, s[1].length - 2, *token_pos(pos)]
when s.scan(/([*-])\s+/) then
@tokens << [:BULLET, s[1], *token_pos(pos)]
[:SPACE, s.matched_size, *token_pos(pos)]
when s.scan(/([a-z]|\d+)\.[ \t]+\S/i) then
list_label = s[1]
width = s.matched_size - 1
s.pos -= 1 # unget \S
list_type = case list_label
when /[a-z]/ then :LALPHA
when /[A-Z]/ then :UALPHA
when /\d/ then :NUMBER
else
raise ParseError, "BUG token #{list_label}"
end
@tokens << [list_type, list_label, *token_pos(pos)]
[:SPACE, width, *token_pos(pos)]
when s.scan(/\[(.*?)\]( +|$)/) then
@tokens << [:LABEL, s[1], *token_pos(pos)]
[:SPACE, s.matched_size, *token_pos(pos)]
when s.scan(/(.*?)::( +|$)/) then
@tokens << [:NOTE, s[1], *token_pos(pos)]
[:SPACE, s.matched_size, *token_pos(pos)]
else s.scan(/.*/)
[:TEXT, s.matched, *token_pos(pos)]
end
end
self
end
##
# Returns the current token or +token+ to the token stream
def unget token = @current_token
p :unget => token if @debug
raise Error, 'too many #ungets' if token == @tokens.first
@tokens.unshift token if token
end
end
require 'rdoc/markup/blank_line'
require 'rdoc/markup/document'
require 'rdoc/markup/heading'
require 'rdoc/markup/list'
require 'rdoc/markup/list_item'
require 'rdoc/markup/raw'
require 'rdoc/markup/paragraph'
require 'rdoc/markup/rule'
require 'rdoc/markup/verbatim'