# A very thin wrapper around the scanner that breaks quantified literal runs,
# collects emitted tokens into an array, calculates their nesting depth,
# normalizes tokens for the parser, and checks whether they are implemented
# by the given syntax flavor.
module Regexp::Lexer

  OPENING_TOKENS = [:capture, :options, :passive, :atomic, :named,
                    :lookahead, :nlookahead, :lookbehind, :nlookbehind].freeze

  CLOSING_TOKENS = [:close].freeze

  def self.scan(input, syntax = "ruby/#{RUBY_VERSION}", &block)
    syntax = Regexp::Syntax.new(syntax)

    @tokens = []
    @nesting, @set_nesting = 0, 0

    last = nil
    Regexp::Scanner.scan(input) do |type, token, text, ts, te|
      type, token = *syntax.normalize(type, token)
      syntax.check! type, token

      ascend(type, token)

      break_literal(last) if type == :quantifier and
        last and last.type == :literal

      current = Regexp::Token.new(type, token, text, ts, te,
                                  @nesting, @set_nesting)

      current = merge_literal(current) if type == :literal and
        last and last.type == :literal

      last.next(current) if last
      current.previous(last) if last

      @tokens << current
      last = current

      descend(type, token)
    end

    if block_given?
      @tokens.each { |t| block.call(t) }
    else
      @tokens
    end
  end

  # decrements the group or set nesting depth on a closing token. called
  # before the token is built, so closers sit at the enclosing level.
  def self.ascend(type, token)
    if type == :group or type == :assertion
      @nesting -= 1 if CLOSING_TOKENS.include?(token)
    end

    if type == :set or type == :subset
      @set_nesting -= 1 if token == :close
    end
  end

  # increments the group or set nesting depth on an opening token. called
  # after the token is built, so openers also sit at the enclosing level.
  def self.descend(type, token)
    if type == :group or type == :assertion
      @nesting += 1 if OPENING_TOKENS.include?(token)
    end

    if type == :set or type == :subset
      @set_nesting += 1 if token == :open
    end
  end

  # called by scan to break a literal run that is longer than one character
  # into two separate tokens when it is followed by a quantifier
  def self.break_literal(token)
    text = token.text
    if text.scan(/./mu).length > 1
      lead = text.sub(/.\z/mu, "")
      last = text[/.\z/mu] || ''

      # offsets are byte-based; 1.8 strings have no bytesize method
      if RUBY_VERSION >= '1.9'
        lead_length = lead.bytesize
        last_length = last.bytesize
      else
        lead_length = lead.length
        last_length = last.length
      end

      @tokens.pop

      @tokens << Regexp::Token.new(:literal, :literal, lead,
                                   token.ts, (token.te - last_length),
                                   @nesting, @set_nesting)

      @tokens << Regexp::Token.new(:literal, :literal, last,
                                   (token.ts + lead_length), token.te,
                                   @nesting, @set_nesting)
    end
  end

  # called by scan to merge two consecutive literals. this happens when tokens
  # get normalized (as in the case of posix/bre) and end up becoming literals.
  def self.merge_literal(current)
    last = @tokens.pop

    Regexp::Token.new(:literal, :literal,
                      last.text + current.text,
                      last.ts, current.te,
                      @nesting, @set_nesting)
  end

end # module Regexp::Lexer
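
# A usage sketch (hypothetical demo script, not part of the library): lexing
# /ab+/ shows break_literal at work. The scanner emits the run "ab" as one
# literal token; the lexer splits it so the quantifier binds only to "b".
# Assumes the regexp_parser gem is installed so the require below resolves.
if __FILE__ == $0
  require 'regexp_parser'

  Regexp::Lexer.scan(/ab+/) do |token|
    puts format('%-12s %-14s %s', token.type, token.token, token.text.inspect)
  end
  # Illustrative output:
  #   literal      literal        "a"
  #   literal      literal        "b"
  #   quantifier   one_or_more    "+"

  # Nesting depth: ascend runs before and descend after a token is built, so
  # group delimiters stay at the enclosing level while their contents go deeper.
  p Regexp::Lexer.scan(/(a)/).map { |t| [t.token, t.level] }
  # Illustrative output: [[:capture, 0], [:literal, 1], [:close, 0]]
end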