class Regexp::Lexer
# A very thin wrapper around the scanner that breaks quantified literal runs,
# collects emitted tokens into an array, calculates their nesting depth, and
# normalizes tokens for the parser, and checks if they are implemented by the
# given syntax flavor.
# Convenience entry point: builds a fresh Lexer instance and delegates to
# its #lex with all arguments (and the optional block) forwarded unchanged.
def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
  new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
end
# Decrements the depth counter matching +type+ when +token+ closes a
# construct (group/assertion, character set, or conditional).
# Raises for any nesting type this lexer does not track.
def ascend(type, token)
  return unless CLOSING_TOKENS.include?(token)

  case type
  when :group, :assertion
    self.nesting -= 1
  when :set
    self.set_nesting -= 1
  when :conditional
    self.conditional_nesting -= 1
  else
    raise "unhandled nesting type #{type}"
  end
end
# if a codepoint list is followed by a quantifier, that quantifier applies
# to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
#
# Splits +token+ into two codepoint-list tokens at the last space and
# returns them as a pair, or nil when there is only a single codepoint.
def break_codepoint_list(token)
  head_text, _, tail_text = token.text.rpartition(' ')
  return if head_text.empty?

  head_token = Regexp::Token.new(:escape, :codepoint_list, head_text + '}',
                                 token.ts, (token.te - tail_text.length),
                                 nesting, set_nesting, conditional_nesting)
  tail_token = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail_text,
                                 (token.ts + head_text.length + 1), (token.te + 3),
                                 nesting, set_nesting, conditional_nesting)

  self.shift = shift + 3 # one space less, but extra \, u, {, and }

  head_token.previous = preprev_token
  head_token.next = tail_token
  tail_token.previous = head_token # .next will be set by #lex
  [head_token, tail_token]
end
# called by scan to break a literal run that is longer than one character
#
# Splits +token+ into a leading literal token and a one-character trailing
# literal token, returning them as a pair; returns nil for a single-char run.
def break_literal(token)
  lead_text, last_char, _ = token.text.partition(/.\z/mu)
  return if lead_text.empty?

  lead_token = Regexp::Token.new(:literal, :literal, lead_text,
                                 token.ts, (token.te - last_char.length),
                                 nesting, set_nesting, conditional_nesting)
  last_token = Regexp::Token.new(:literal, :literal, last_char,
                                 (token.ts + lead_text.length), token.te,
                                 nesting, set_nesting, conditional_nesting)

  lead_token.previous = preprev_token
  lead_token.next = last_token
  last_token.previous = lead_token # .next will be set by #lex
  [lead_token, last_token]
end
# Increments the depth counter matching +type+ when +token+ opens a
# construct (group/assertion, character set, or conditional).
# Raises for any nesting type this lexer does not track.
def descend(type, token)
  return unless OPENING_TOKENS.include?(token)

  case type
  when :group, :assertion
    self.nesting += 1
  when :set
    self.set_nesting += 1
  when :conditional
    self.conditional_nesting += 1
  else
    raise "unhandled nesting type #{type}"
  end
end
# Hands +token+ to the caller-supplied block when one was given, collecting
# the block's result if collection is enabled; otherwise stores the token
# directly in the tokens array.
def emit(token)
  return tokens << token unless block

  # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
  result = block.call(token)
  tokens << result if collect_tokens
end
# Lexes +input+ by driving the scanner, normalizing and validating each
# scanned token against +syntax+, tracking nesting depths, splitting
# quantified literal runs and codepoint lists, and linking tokens into a
# doubly-linked chain. Emits tokens one step behind the scanner so that
# each token's +next+ pointer can be set before it is emitted.
# Returns the collected token array, or nil when collect_tokens is false.
def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
  syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT

  self.block = block
  self.collect_tokens = collect_tokens
  self.tokens = []
  self.prev_token = nil
  self.preprev_token = nil
  self.nesting = 0
  self.set_nesting = 0
  self.conditional_nesting = 0
  self.shift = 0

  Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
    type, token = *syntax.normalize(type, token)
    syntax.check! type, token

    ascend(type, token)

    # A quantifier after a literal run or codepoint list applies only to the
    # final element, so the preceding token must be broken in two first.
    if (last = prev_token) &&
       type == :quantifier &&
       (
         (last.type == :literal && (parts = break_literal(last))) ||
         (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
       )
      emit(parts[0])
      last = parts[1]
    end

    current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                nesting, set_nesting, conditional_nesting)

    if type == :conditional && CONDITION_TOKENS.include?(token)
      current = merge_condition(current, last)
    elsif last
      last.next = current
      current.previous = last
      emit(last)
    end

    self.preprev_token = last
    self.prev_token = current

    descend(type, token)
  end

  # Flush the final token, which never got emitted inside the loop.
  emit(prev_token) if prev_token

  collect_tokens ? tokens : nil
end
# Fuses the condition token +current+ with the preceding token +last+ into a
# single :conditional/:condition token spanning both, preserving the chain
# link back to the token before +last+.
def merge_condition(current, last)
  merged = Regexp::Token.new(:conditional, :condition, last.text + current.text,
                             last.ts, current.te,
                             nesting, set_nesting, conditional_nesting)
  merged.previous = preprev_token # .next will be set by #lex
  merged
end