# A very thin wrapper around the scanner that breaks quantified literal runs,
# collects emitted tokens into an array, calculates their nesting depth,
# normalizes tokens for the parser, and checks if they are implemented by the
# given syntax flavor.
class Regexp::Lexer
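
  # Lexes the given input with a new Lexer instance, passing any block along.
  # A rough usage sketch (illustrative; only a few token fields are shown, and
  # the exact token names assume the default ruby syntax flavor):
  #
  #   Regexp::Lexer.lex('a+') do |token|
  #     puts [token.type, token.token, token.text].inspect
  #   end
  #   # [:literal, :literal, "a"]
  #   # [:quantifier, :one_or_more, "+"]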
  def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
    new.lex(input, syntax, &block)
  end
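
  # called from the scan block in lex, before the current token is built, to
  # decrement the matching nesting counter when a closing token is seen, so
  # that close tokens carry the depth of their enclosing level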
  def ascend(type, token)
    if type == :group or type == :assertion
      @nesting -= 1 if CLOSING_TOKENS.include?(token)
    end

    if type == :set or type == :subset
      @set_nesting -= 1 if token == :close
    end

    if type == :conditional
      @conditional_nesting -= 1 if token == :close
    end
  end

  # called from the scan block in lex to break a literal run that is longer
  # than one character into two tokens when it is followed by a quantifier,
  # so that the quantifier applies only to the last character (e.g. in ab+
  # the literal "ab" is split into "a" and "b")
  def break_literal(token)
    text = token.text
    if text.scan(/./mu).length > 1
      lead = text.sub(/.\z/mu, "")
      last = text[/.\z/mu] || ''

      if RUBY_VERSION >= '1.9'
        lead_length = lead.bytesize
        last_length = last.bytesize
      else
        lead_length = lead.length
        last_length = last.length
      end

      @tokens.pop

      @tokens << Regexp::Token.new(:literal, :literal, lead,
                                   token.ts, (token.te - last_length),
                                   @nesting, @set_nesting, @conditional_nesting)

      @tokens << Regexp::Token.new(:literal, :literal, last,
                                   (token.ts + lead_length), token.te,
                                   @nesting, @set_nesting, @conditional_nesting)
    end
  end
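
  # called from the scan block in lex, after the current token has been
  # appended, to increment the matching nesting counter when an opening
  # token is seen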
  def descend(type, token)
    if type == :group or type == :assertion
      @nesting += 1 if OPENING_TOKENS.include?(token)
    end

    if type == :set or type == :subset
      @set_nesting += 1 if token == :open
    end

    if type == :conditional
      @conditional_nesting += 1 if token == :open
    end
  end
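
  # Scans the given input, normalizing each token and checking it against the
  # given syntax flavor, annotating it with the current nesting depths, and
  # linking it to its neighbors. Returns the token array, or the result of
  # mapping a given block over it.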
  def lex(input, syntax = "ruby/#{RUBY_VERSION}", &block)
    syntax = Regexp::Syntax.new(syntax)

    @tokens = []
    @nesting, @set_nesting, @conditional_nesting = 0, 0, 0

    last = nil
    Regexp::Scanner.scan(input) do |type, token, text, ts, te|
      type, token = *syntax.normalize(type, token)
      syntax.check! type, token

      ascend(type, token)

      break_literal(last) if type == :quantifier and
        last and last.type == :literal

      current = Regexp::Token.new(type, token, text, ts, te,
                                  @nesting, @set_nesting, @conditional_nesting)

      current = merge_literal(current) if type == :literal and
        last and last.type == :literal

      current = merge_condition(current) if type == :conditional and
        [:condition, :condition_close].include?(token)

      last.next(current) if last
      current.previous(last) if last

      @tokens << current
      last = current

      descend(type, token)
    end

    if block_given?
      @tokens.map {|t| block.call(t)}
    else
      @tokens
    end
  end
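
  # called from the scan block in lex to fold the pieces of a conditional's
  # condition (through its closing parenthesis) into a single condition token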
  def merge_condition(current)
    last = @tokens.pop
    Regexp::Token.new(:conditional, :condition, last.text + current.text,
                      last.ts, current.te,
                      @nesting, @set_nesting, @conditional_nesting)
  end

  # called from the scan block in lex to merge two consecutive literals. this
  # happens when tokens are turned into literals during syntax normalization,
  # leaving two literal tokens in a row.
  def merge_literal(current)
    last = @tokens.pop
    Regexp::Token.new(:literal, :literal, last.text + current.text,
                      last.ts, current.te,
                      @nesting, @set_nesting, @conditional_nesting)
  end