lib/kleene/naive_online_regex.rb
require 'set' require_relative './kleene' module Kleene class NaiveOnlineRegex def initialize(regexen, window_size = 100) @regexen = regexen @window_size = window_size reset end def reset @buffer = '' @matches_per_regex = {} # Hash(Regexp, Set(OnlineMatch)) end # #ingest(input) is the online-style matching interface def ingest(input, _debug = false) # : Set(OnlineMatch) @buffer << input new_online_matches = Set.new @regexen.each do |regex| existing_matches_for_regex = (@matches_per_regex[regex] ||= Set.new) scan_matches = @buffer.scan_matches(regex) scan_online_matches = scan_matches.map {|match_data| OnlineMatch.new(regex, match_data) }.to_set new_matches = scan_online_matches - existing_matches_for_regex # new_matches : Set(OnlineMatch) existing_matches_for_regex.merge(new_matches) new_online_matches.merge(new_matches) end resize_buffer! new_online_matches end def matches # Hash(Regexp, Set(OnlineMatch)) @matches_per_regex end def matches_for(regex) # Set(OnlineMatch) | Nil @matches_per_regex[regex] end def resize_buffer! return unless @buffer.size > @window_size number_of_chars_at_front_of_buffer_that_should_roll_off = @buffer.size - @window_size @buffer = @buffer[-@window_size..-1] drop_matches_that_have_rolled_off(number_of_chars_at_front_of_buffer_that_should_roll_off) end def drop_matches_that_have_rolled_off(number_of_chars_at_front_of_buffer_that_rolled_off) @matches_per_regex.transform_values! do |match_set| new_set = Set.new match_set.each do |online_match| online_match_clone = online_match.clone online_match_clone.decrement_offsets(number_of_chars_at_front_of_buffer_that_rolled_off) new_set << online_match_clone if online_match_clone.offsets.first > 0 end new_set end end end # A {Regexp, MatchData} pair class OnlineMatch attr_reader :regex # Regexp attr_reader :match # MatchData attr_reader :offsets # Array(Int) -> [start, end] # excludes the end offset def initialize(regex, match) @regex = regex @match = match @offsets = match.offset(0) end def clone OnlineMatch.new(@regex, @match) end def identity [@regex, @offsets, to_a] end def ==(other) identity == other.identity end def eql?(other) self == other end def hash identity.hash end def to_a @match.to_a end def to_h { @regex => to_a, :offsets => @offsets } end def captures @match.captures end def [](*args) @match.method(:[]).call(*args) end def decrement_offsets(decrement) @offsets = @offsets.map {|offset| offset - decrement } end end end