require 'regexp_parser/expression'
# This file also depends on the syntax definitions and the lexer shipped
# with the gem (assumed load paths within regexp_parser).
require 'regexp_parser/syntax'
require 'regexp_parser/lexer'
module Regexp::Parser
include Regexp::Expression
include Regexp::Syntax
class ParserError < StandardError; end
class UnknownTokenTypeError < ParserError
def initialize(type, token)
super "Unknown token type #{type} #{token.inspect}"
end
end
class UnknownTokenError < ParserError
def initialize(type, token)
super "Unknown #{type} token #{token.token}"
end
end
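# Entry point. Lexes the input with Regexp::Lexer and assembles the tokens
# into a tree of Regexp::Expression nodes under a Root node. Returns the
# root expression, or yields it to the given block.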
def self.parse(input, syntax = "ruby/#{RUBY_VERSION}", &block)
@nesting = [@root = @node = Root.new]
@conditional_nesting = []
Regexp::Lexer.scan(input, syntax) do |token|
parse_token token
end
if block_given?
block.call @root
else
@root
end
end
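# Pushes an expression onto the group nesting stack and makes it the
# current node, so that subsequent expressions become its children.
# nest_conditional does the same for the separate conditional nesting stack.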
def self.nest(exp)
@nesting.push exp
@node << exp
@node = exp
end
def self.nest_conditional(exp)
@conditional_nesting.push exp
@node << exp
@node = exp
end
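# Dispatches a single lexer token to the handler method for its type.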
def self.parse_token(token)
case token.type
when :meta; meta(token)
when :quantifier; quantifier(token)
when :anchor; anchor(token)
when :escape; escape(token)
when :group; group(token)
when :assertion; group(token)
when :set, :subset; set(token)
when :type; type(token)
when :backref; backref(token)
when :conditional; conditional(token)
when :keep; keep(token)
when :property, :nonproperty
property(token)
when :literal
@node << Literal.new(token)
when :free_space
free_space(token)
else
raise UnknownTokenTypeError.new(token.type, token)
end
end
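# Handles character set tokens: opening and closing brackets, negation,
# and the various member kinds (literals, ranges, escapes, POSIX classes
# and unicode properties), which are all appended to the open set.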
def self.set(token)
case token.token
when :open
open_set(token)
when :close
close_set(token)
when :negate
negate_set
when :member, :range, :escape, :collation, :equivalent
append_set(token)
when *Token::Escape::All
append_set(token)
when *Token::CharacterSet::All
append_set(token)
when *Token::UnicodeProperty::All
append_set(token)
else
raise UnknownTokenError.new('CharacterSet', token)
end
end
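# Handles meta tokens. A dot becomes CharacterType::Any. For the first
# alternation operator in a sequence, the expressions collected so far are
# moved into an Alternative branch of a new Alternation node; subsequent
# operators simply open another branch on that node.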
def self.meta(token)
case token.token
when :dot
@node << CharacterType::Any.new(token)
when :alternation
if @node.token == :alternation
@node.alternative
elsif @node.last.is_a?(Alternation)
@node = @node.last
@node.alternative
else
alt = Alternation.new(token)
seq = Alternative.new(alt.level, alt.set_level, alt.conditional_level)
while @node.expressions.last
seq.insert @node.expressions.pop
end
alt.alternative(seq)
@node << alt
@node = alt
@node.alternative
end
else
raise UnknownTokenError.new('Meta', token)
end
end
def self.backref(token)
case token.token
when :name_ref
@node << Backreference::Name.new(token)
when :name_nest_ref
@node << Backreference::NameNestLevel.new(token)
when :name_call
@node << Backreference::NameCall.new(token)
when :number, :number_ref
@node << Backreference::Number.new(token)
when :number_rel_ref
@node << Backreference::NumberRelative.new(token)
when :number_nest_ref
@node << Backreference::NumberNestLevel.new(token)
when :number_call
@node << Backreference::NumberCall.new(token)
when :number_rel_call
@node << Backreference::NumberCallRelative.new(token)
else
raise UnknownTokenError.new('Backreference', token)
end
end
def self.type(token)
case token.token
when :digit
@node << CharacterType::Digit.new(token)
when :nondigit
@node << CharacterType::NonDigit.new(token)
when :hex
@node << CharacterType::Hex.new(token)
when :nonhex
@node << CharacterType::NonHex.new(token)
when :space
@node << CharacterType::Space.new(token)
when :nonspace
@node << CharacterType::NonSpace.new(token)
when :word
@node << CharacterType::Word.new(token)
when :nonword
@node << CharacterType::NonWord.new(token)
else
raise UnknownTokenError.new('CharacterType', token)
end
end
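# Handles conditional expressions such as (?(1)a|b). A separate nesting
# stack tracks the open conditional, its condition, and its branches.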
def self.conditional(token)
case token.token
when :open
nest_conditional(Conditional::Expression.new(token))
when :condition
@conditional_nesting.last.condition(Conditional::Condition.new(token))
@conditional_nesting.last.branch
when :separator
@conditional_nesting.last.branch
@node = @conditional_nesting.last.branches.last
when :close
@conditional_nesting.pop
@node = if @conditional_nesting.empty?
@nesting.last
else
@conditional_nesting.last
end
else
raise UnknownTokenError.new('Conditional', token)
end
end
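# Handles \p{...} and \P{...} property tokens. The include below makes the
# Regexp::Expression::UnicodeProperty constants available without their
# full prefix; repeating it on later calls is a harmless no-op.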
def self.property(token)
include Regexp::Expression::UnicodeProperty
case token.token
when :alnum; @node << Alnum.new(token)
when :alpha; @node << Alpha.new(token)
when :any; @node << Any.new(token)
when :ascii; @node << Ascii.new(token)
when :blank; @node << Blank.new(token)
when :cntrl; @node << Cntrl.new(token)
when :digit; @node << Digit.new(token)
when :graph; @node << Graph.new(token)
when :lower; @node << Lower.new(token)
when :print; @node << Print.new(token)
when :punct; @node << Punct.new(token)
when :space; @node << Space.new(token)
when :upper; @node << Upper.new(token)
when :word; @node << Word.new(token)
when :xdigit; @node << Xdigit.new(token)
when :newline; @node << Newline.new(token)
when :letter_any; @node << Letter::Any.new(token)
when :letter_uppercase; @node << Letter::Uppercase.new(token)
when :letter_lowercase; @node << Letter::Lowercase.new(token)
when :letter_titlecase; @node << Letter::Titlecase.new(token)
when :letter_modifier; @node << Letter::Modifier.new(token)
when :letter_other; @node << Letter::Other.new(token)
when :mark_any; @node << Mark::Any.new(token)
when :mark_nonspacing; @node << Mark::Nonspacing.new(token)
when :mark_spacing; @node << Mark::Spacing.new(token)
when :mark_enclosing; @node << Mark::Enclosing.new(token)
when :number_any; @node << Number::Any.new(token)
when :number_decimal; @node << Number::Decimal.new(token)
when :number_letter; @node << Number::Letter.new(token)
when :number_other; @node << Number::Other.new(token)
when :punct_any; @node << Punctuation::Any.new(token)
when :punct_connector; @node << Punctuation::Connector.new(token)
when :punct_dash; @node << Punctuation::Dash.new(token)
when :punct_open; @node << Punctuation::Open.new(token)
when :punct_close; @node << Punctuation::Close.new(token)
when :punct_initial; @node << Punctuation::Initial.new(token)
when :punct_final; @node << Punctuation::Final.new(token)
when :punct_other; @node << Punctuation::Other.new(token)
when :separator_any; @node << Separator::Any.new(token)
when :separator_space; @node << Separator::Space.new(token)
when :separator_line; @node << Separator::Line.new(token)
when :separator_para; @node << Separator::Paragraph.new(token)
when :symbol_any; @node << Symbol::Any.new(token)
when :symbol_math; @node << Symbol::Math.new(token)
when :symbol_currency; @node << Symbol::Currency.new(token)
when :symbol_modifier; @node << Symbol::Modifier.new(token)
when :symbol_other; @node << Symbol::Other.new(token)
when :other; @node << Codepoint::Any.new(token)
when :control; @node << Codepoint::Control.new(token)
when :format; @node << Codepoint::Format.new(token)
when :surrogate; @node << Codepoint::Surrogate.new(token)
when :private_use; @node << Codepoint::PrivateUse.new(token)
when :unassigned; @node << Codepoint::Unassigned.new(token)
when *Token::UnicodeProperty::Age
@node << Age.new(token)
when *Token::UnicodeProperty::Derived
@node << Derived.new(token)
when *Regexp::Syntax::Token::UnicodeProperty::Script
@node << Script.new(token)
when *Regexp::Syntax::Token::UnicodeProperty::UnicodeBlock
@node << Block.new(token)
else
raise UnknownTokenError.new('UnicodeProperty', token)
end
end
def self.anchor(token)
case token.token
when :bol
@node << Anchor::BeginningOfLine.new(token)
when :eol
@node << Anchor::EndOfLine.new(token)
when :bos
@node << Anchor::BOS.new(token)
when :eos
@node << Anchor::EOS.new(token)
when :eos_ob_eol
@node << Anchor::EOSobEOL.new(token)
when :word_boundary
@node << Anchor::WordBoundary.new(token)
when :nonword_boundary
@node << Anchor::NonWordBoundary.new(token)
when :match_start
@node << Anchor::MatchStart.new(token)
else
raise UnknownTokenError.new('Anchor', token)
end
end
def self.escape(token)
case token.token
when :backspace
@node << EscapeSequence::Backspace.new(token)
when :escape
@node << EscapeSequence::AsciiEscape.new(token)
when :bell
@node << EscapeSequence::Bell.new(token)
when :form_feed
@node << EscapeSequence::FormFeed.new(token)
when :newline
@node << EscapeSequence::Newline.new(token)
when :carriage
@node << EscapeSequence::Return.new(token)
when :space
@node << EscapeSequence::Space.new(token)
when :tab
@node << EscapeSequence::Tab.new(token)
when :vertical_tab
@node << EscapeSequence::VerticalTab.new(token)
when :control
@node << EscapeSequence::Control.new(token)
when :meta_sequence
if token.text =~ /\A\\M-\\C/
@node << EscapeSequence::MetaControl.new(token)
else
@node << EscapeSequence::Meta.new(token)
end
else
# treating everything else as a literal
@node << EscapeSequence::Literal.new(token)
end
end
def self.keep(token)
@node << Keep::Mark.new(token)
end
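# Handles comments and whitespace emitted in free-spacing (x) mode.
# Consecutive whitespace tokens are merged into one WhiteSpace expression.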
def self.free_space(token)
case token.token
when :comment
@node << Comment.new(token)
when :whitespace
if @node.last and @node.last.is_a?(WhiteSpace)
@node.last.merge(WhiteSpace.new(token))
else
@node << WhiteSpace.new(token)
end
else
raise UnknownTokenError.new('FreeSpace', token)
end
end
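# Attaches a quantifier to the last significant expression of the current
# node, skipping over any intervening free-space expressions.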
def self.quantifier(token)
offset = -1
target_node = @node.expressions[offset]
while target_node and target_node.is_a?(FreeSpace)
target_node = @node.expressions[offset -= 1]
end
unless target_node
if token.token == :zero_or_one
raise ArgumentError.new("Quantifier given without a target, or the syntax " +
"of the group or its options is incorrect")
else
raise ArgumentError.new("No valid target found for '#{token.text}' quantifier")
end
end
case token.token
when :zero_or_one
target_node.quantify(:zero_or_one, token.text, 0, 1, :greedy)
when :zero_or_one_reluctant
target_node.quantify(:zero_or_one, token.text, 0, 1, :reluctant)
when :zero_or_one_possessive
target_node.quantify(:zero_or_one, token.text, 0, 1, :possessive)
when :zero_or_more
target_node.quantify(:zero_or_more, token.text, 0, -1, :greedy)
when :zero_or_more_reluctant
target_node.quantify(:zero_or_more, token.text, 0, -1, :reluctant)
when :zero_or_more_possessive
target_node.quantify(:zero_or_more, token.text, 0, -1, :possessive)
when :one_or_more
target_node.quantify(:one_or_more, token.text, 1, -1, :greedy)
when :one_or_more_reluctant
target_node.quantify(:one_or_more, token.text, 1, -1, :reluctant)
when :one_or_more_possessive
target_node.quantify(:one_or_more, token.text, 1, -1, :possessive)
when :interval
interval(target_node, token)
else
raise UnknownTokenError.new('Quantifier', token)
end
end
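# Parses an interval quantifier such as {2}, {2,} or {2,4}, including an
# optional trailing '?' (reluctant) or '+' (possessive) mode suffix.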
def self.interval(target_node, token)
text = token.text
mchr = text[text.length-1].chr =~ /[?+]/ ? text[text.length-1].chr : nil
case mchr
when '?'
range_text = text[0...-1]
mode = :reluctant
when '+'
range_text = text[0...-1]
mode = :possessive
else
range_text = text
mode = :greedy
end
range = range_text.gsub(/\{|\}/, '').split(',', 2).map {|i| i.strip}
min = range[0].empty? ? 0 : range[0]
max = range[1] ? (range[1].empty? ? -1 : range[1]) : min
target_node.quantify(:interval, text, min.to_i, max.to_i, mode)
end
def self.group(token)
case token.token
when :options
options(token)
when :close
close_group
when :comment
@node << Group::Comment.new(token)
else
open_group(token)
end
end
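# Handles an option group token such as (?mi-x:. Only the flags appearing
# before the dash are recorded as enabled options on the resulting
# Group::Options expression, which is then nested like any other group.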
def self.options(token)
opt = token.text.split('-', 2)
exp = Group::Options.new(token)
exp.options = {
:m => opt[0].include?('m'),
:i => opt[0].include?('i'),
:x => opt[0].include?('x'),
:d => opt[0].include?('d'),
:a => opt[0].include?('a'),
:u => opt[0].include?('u')
}
nest(exp)
end
def self.open_group(token)
case token.token
when :passive
exp = Group::Passive.new(token)
when :atomic
exp = Group::Atomic.new(token)
when :named
exp = Group::Named.new(token)
when :capture
exp = Group::Capture.new(token)
when :lookahead
exp = Assertion::Lookahead.new(token)
when :nlookahead
exp = Assertion::NegativeLookahead.new(token)
when :lookbehind
exp = Assertion::Lookbehind.new(token)
when :nlookbehind
exp = Assertion::NegativeLookbehind.new(token)
else
raise UnknownTokenError.new('Group type open', token)
end
nest(exp)
end
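# Closes the current group: pops the nesting stack and, when the enclosing
# node ends in an Alternation, resumes appending to that alternation.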
def self.close_group
@nesting.pop
@node = @nesting.last
@node = @node.last if @node.last and @node.last.is_a?(Alternation)
end
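# Opens a character set. The token is relabeled :character before the set
# expression is built; a nested subset is appended to the currently open set.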
def self.open_set(token)
token.token = :character
if token.type == :subset
@set << CharacterSubSet.new(token)
else
@node << (@set = CharacterSet.new(token))
end
end
def self.negate_set
@set.negate
end
def self.append_set(token)
@set << token.text
end
def self.close_set(token)
@set.close
end
end # module Regexp::Parser
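# Usage sketch (parses a pattern string with the default syntax version):
#
#   root = Regexp::Parser.parse('a|b+')
#   root.expressions.each { |exp| puts exp }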