class Regexp::Lexer
# A very thin wrapper around the scanner that breaks quantified literal runs,
# collects emitted tokens into an array, calculates their nesting depth, and
# normalizes tokens for the parser, and checks if they are implemented by the
# given syntax flavor.
# Convenience entry point: builds a fresh Lexer instance and delegates to
# its #lex with all arguments (and the optional block) forwarded unchanged.
def self.lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
  new.lex(input, syntax, options: options, collect_tokens: collect_tokens, &block)
end
# Decrements the depth counter matching +type+ when +token+ closes a
# construct (group/assertion, character set, or conditional).
# Raises for any nesting type this lexer does not track.
def ascend(type, token)
  return unless CLOSING_TOKENS.include?(token)

  case type
  when :group, :assertion
    self.nesting -= 1
  when :set
    self.set_nesting -= 1
  when :conditional
    self.conditional_nesting -= 1
  else
    raise "unhandled nesting type #{type}"
  end
end
# if a codepoint list is followed by a quantifier, that quantifier applies
# to the last codepoint, e.g. /\u{61 62 63}{3}/ =~ 'abccc'
#
# Splits +token+ into two codepoint-list tokens at the last space and
# returns them as a pair, or nil when there is only a single codepoint.
def break_codepoint_list(token)
  head_text, _, tail_text = token.text.rpartition(' ')
  return if head_text.empty?

  head_token = Regexp::Token.new(:escape, :codepoint_list, head_text + '}',
                                 token.ts, (token.te - tail_text.length),
                                 nesting, set_nesting, conditional_nesting)
  tail_token = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail_text,
                                 (token.ts + head_text.length + 1), (token.te + 3),
                                 nesting, set_nesting, conditional_nesting)

  self.shift = shift + 3 # one space less, but extra \, u, {, and }

  head_token.previous = preprev_token
  head_token.next = tail_token
  tail_token.previous = head_token # .next will be set by #lex
  [head_token, tail_token]
end
# called by scan to break a literal run that is longer than one character
#
# Splits +token+ into a leading literal token and a one-character trailing
# literal token, returning them as a pair; returns nil for a single-char run.
def break_literal(token)
  lead_text, last_char, _ = token.text.partition(/.\z/mu)
  return if lead_text.empty?

  lead_token = Regexp::Token.new(:literal, :literal, lead_text,
                                 token.ts, (token.te - last_char.length),
                                 nesting, set_nesting, conditional_nesting)
  last_token = Regexp::Token.new(:literal, :literal, last_char,
                                 (token.ts + lead_text.length), token.te,
                                 nesting, set_nesting, conditional_nesting)

  lead_token.previous = preprev_token
  lead_token.next = last_token
  last_token.previous = lead_token # .next will be set by #lex
  [lead_token, last_token]
end
# Increments the depth counter matching +type+ when +token+ opens a
# construct (group/assertion, character set, or conditional).
# Raises for any nesting type this lexer does not track.
def descend(type, token)
  return unless OPENING_TOKENS.include?(token)

  case type
  when :group, :assertion
    self.nesting += 1
  when :set
    self.set_nesting += 1
  when :conditional
    self.conditional_nesting += 1
  else
    raise "unhandled nesting type #{type}"
  end
end
# Hands +token+ to the caller-supplied block when one was given, collecting
# the block's result if collection is enabled; otherwise stores the token
# directly in the tokens array.
def emit(token)
  return tokens << token unless block

  # TODO: in v3.0.0, remove `collect_tokens:` kwarg and only collect w/o block
  result = block.call(token)
  tokens << result if collect_tokens
end
# Lexes +input+ by driving the scanner, normalizing and validating each
# scanned token against +syntax+, tracking nesting depths, splitting
# quantified literal runs and codepoint lists, and linking tokens into a
# doubly-linked chain. Emits tokens one step behind the scanner so that
# each token's +next+ pointer can be set before it is emitted.
# Returns the collected token array, or nil when collect_tokens is false.
def lex(input, syntax = nil, options: nil, collect_tokens: true, &block)
  syntax = syntax ? Regexp::Syntax.for(syntax) : Regexp::Syntax::CURRENT

  self.block = block
  self.collect_tokens = collect_tokens
  self.tokens = []
  self.prev_token = nil
  self.preprev_token = nil
  self.nesting = 0
  self.set_nesting = 0
  self.conditional_nesting = 0
  self.shift = 0

  Regexp::Scanner.scan(input, options: options, collect_tokens: false) do |type, token, text, ts, te|
    type, token = *syntax.normalize(type, token)
    syntax.check! type, token

    ascend(type, token)

    # A quantifier after a literal run or codepoint list applies only to the
    # final element, so the preceding token must be broken in two first.
    if (last = prev_token) &&
       type == :quantifier &&
       (
         (last.type == :literal && (parts = break_literal(last))) ||
         (last.token == :codepoint_list && (parts = break_codepoint_list(last)))
       )
      emit(parts[0])
      last = parts[1]
    end

    current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                nesting, set_nesting, conditional_nesting)

    if type == :conditional && CONDITION_TOKENS.include?(token)
      current = merge_condition(current, last)
    elsif last
      last.next = current
      current.previous = last
      emit(last)
    end

    self.preprev_token = last
    self.prev_token = current

    descend(type, token)
  end

  # Flush the final token, which never got emitted inside the loop.
  emit(prev_token) if prev_token

  collect_tokens ? tokens : nil
end
# Fuses the condition token +current+ with the preceding token +last+ into a
# single :conditional/:condition token spanning both, preserving the chain
# link back to the token before +last+.
def merge_condition(current, last)
  merged = Regexp::Token.new(:conditional, :condition, last.text + current.text,
                             last.ts, current.te,
                             nesting, set_nesting, conditional_nesting)
  merged.previous = preprev_token # .next will be set by #lex
  merged
end