lib/capybara/selector/regexp_disassembler.rb



# frozen_string_literal: true

module Capybara
  class Selector
    # @api private
    #
    # Breaks a Regexp down into literal substrings that any text matched by the
    # Regexp has to contain. The result is a conservative approximation: every
    # construct that cannot be reduced to literal text is treated as a wildcard,
    # and an empty Array means "nothing can be said about the matched text".
    class RegexpDisassembler
      # Counted repetition such as `{2}`, `{2,}`, `{,3}` or `{2,3}`.
      COUNTED_REP_REGEX = /\{(?<min_rep>\d*)(?:,(?<max_rep>\d*))?\}/

      # An innermost (non nested) group together with its optional quantifier.
      GROUP_REGEX = /
        (?<group>\([^()]*\))
          (?:
            (?:
             (?<optional>[*?]) |
             (?<one_or_more>\+) |
             (?:#{COUNTED_REP_REGEX.source})
            )(?<lazy>\?)?
          )?
      /x

      # A single character followed by a counted repetition.
      COUNTED_CHAR_REGEX = /(?<char>.)#{COUNTED_REP_REGEX.source}/

      # A single character followed by a counted repetition and a trailing `?`.
      LAZY_COUNTED_CHAR_REGEX = /#{COUNTED_CHAR_REGEX.source}\?/

      # Inline option groups such as `(?i)`, `(?mi-x:` or `(?-i:`.
      INLINE_OPTIONS_REGEX = /\(\?[a-z-]+[:)]/

      # @param regexp [Regexp] the regular expression to disassemble
      def initialize(regexp)
        @regexp = regexp
        @regexp_source = regexp.source
      end

      # @return [Array<String>] literal substrings required by the regexp
      #   (upcased when the regexp is case insensitive); empty when none can
      #   be determined. The result is memoized, including the empty case.
      def substrings
        @substrings ||= extract_substrings
      end

    private

      # Performs the actual disassembly. Kept separate from #substrings so the
      # early returns below are still memoized by the caller.
      def extract_substrings
        source = @regexp_source.dup
        replace_atoms_with_wildcards(source)
        source.gsub!(/\(\?#[^)]*\)/, '') # remove comment groups
        strip_extended_syntax(source) if extended?
        source.gsub!(/\(\?<?[=!][^)]*\)/, '') # remove lookahead/lookbehind assertions
        return [] if source =~ INLINE_OPTIONS_REGEX # can't reason about inline option changes

        source.gsub!(/\(\?(?:<[^>]+>|>|:)/, '(') # replace named, atomic, and non-matching groups with unnamed matching groups
        while source.gsub!(GROUP_REGEX) { |_m| simplify_group(Regexp.last_match) }; end
        simplify_quantifiers(source)
        return [] if source.include?('|') # can't handle alternation here

        # Escaped and bracketed `^`/`$` are already wildcards, so what is left
        # are zero-width anchors which contribute no characters to the match.
        strs = source.gsub(/[\^$]/, '').split('.').reject(&:empty?).uniq
        @regexp.casefold? ? strs.map(&:upcase) : strs
      end

      # Replaces every construct that matches "some character" with a wildcard.
      def replace_atoms_with_wildcards(source)
        source.gsub!(/\\[^pgk]/, '.') # replace escaped characters with wildcard
        source.gsub!(/\\[gk](?:<[^>]*>)?/, '.') # replace sub expressions and back references with wildcard
        source.gsub!(/\\p\{[[:alpha:]]+\}?/, '.') # replace character properties with wildcard
        source.gsub!(/\[\[:[a-z]+:\]\]/, '.') # replace posix classes with wildcard
        while source.gsub!(/\[(?:[^\[\]]+)\]/, '.'); end # replace character classes with wildcard
      end

      # @return [Boolean] whether the regexp was created with the `x` option
      def extended?
        (@regexp.options & Regexp::EXTENDED) != 0
      end

      # Drops the comments and whitespace that extended mode ignores. Must run
      # after escaped characters and character classes have become wildcards,
      # since whitespace and `#` are significant there.
      def strip_extended_syntax(source)
        source.gsub!(/#.*$/, '')
        source.gsub!(/\s+/, '')
      end

      # Rewrites quantifiers applied to single characters.
      def simplify_quantifiers(source)
        # Lazy counted repetitions go first, otherwise their trailing `?` would
        # be mistaken for an "optional character" marker and swallow the `}`.
        source.gsub!(LAZY_COUNTED_CHAR_REGEX) do |_m|
          expand_repetition(Regexp.last_match[:char], Regexp.last_match, true)
        end
        source.gsub!(/.[*?]\??/, '.') # replace optional character with wildcard
        source.gsub!(/(.)\+\??/, '\1.') # replace one or more with character plus wildcard
        source.gsub!(COUNTED_CHAR_REGEX) do |_m| # repeat counted characters
          expand_repetition(Regexp.last_match[:char], Regexp.last_match, false)
        end
      end

      # Expands a counted repetition of `text`.
      #
      # @param text [String] the repeated character or group
      # @param match [MatchData] match exposing the `min_rep`/`max_rep` captures
      # @param lazy [Object] truthy when the repetition is followed by `?`
      # @return [String] `text` repeated the minimum number of times, followed
      #   by a wildcard when further repetitions are allowed
      def expand_repetition(text, match, lazy)
        # Ruby parses `x{n}?` as `(?:x{n})?`, so the whole thing is optional.
        return '.' if lazy && match[:max_rep].nil?

        (text * match[:min_rep].to_i).tap { |str| str << '.' if match[:max_rep] }
      end

      # @param matches [MatchData] a GROUP_REGEX match
      # @return [String] replacement text for the matched group and quantifier
      def simplify_group(matches)
        if matches[:group].include?('|') # no support for alternation in groups
          '.'
        elsif matches[:one_or_more] # required but may repeat becomes text + wildcard
          matches[:group][1..-2] + '.'
        elsif matches[:optional] # optional group becomes wildcard
          '.'
        elsif matches[:min_rep] # the group is kept intact so the next pass unwraps each copy
          expand_repetition(matches[:group], matches, matches[:lazy])
        else
          matches[:group][1..-2]
        end
      end
    end
  end
end