class HTMLProofer::Cache
# Records the outcome of checking one external URL in the cache.
#
# url         - the external URL that was checked
# filenames   - metadata describing the files that reference this URL
# status_code - the HTTP status code returned by the check
# msg         - the human-readable message attached to the result
# found       - whether the URL resolved successfully
def add_external(url, filenames, status_code, msg, found)
  return unless external_enabled?

  entry = {
    time: @cache_time.to_s,
    found: found,
    status_code: status_code,
    message: msg,
    metadata: filenames,
  }
  @cache_log[:external][cleaned_url(url)] = entry
end
# Appends a checked internal link's metadata to the cache entry for `url`,
# creating the entry (stamped with the current cache time) on first sight.
def add_internal(url, metadata, found)
  return unless internal_enabled?

  entry = @cache_log[:internal][url] ||= { time: @cache_time, metadata: [] }
  entry[:metadata] << construct_internal_link_metadata(metadata, found)
end
# Normalizes a URL into its cache-key form: round-trips it through
# escape_unescape, then strips a single trailing "/", "#", or "?"
# (unless the URL consists of just that one character).
#
# NOTE(review): the original text was truncated; `return` and the full
# variable name were restored so the guard clause actually short-circuits.
def cleaned_url(url)
  cleaned_url = escape_unescape(url)

  return cleaned_url unless cleaned_url.end_with?("/", "#", "?") && cleaned_url.length > 1

  cleaned_url[0..-2]
end
# Builds the hash stored in the internal cache for a single link check.
# The truncated key names were reconstructed from their visible suffixes
# (`ce:` -> source, `name:` -> filename, `_url:` -> base_url, `d:` -> found).
def construct_internal_link_metadata(metadata, found)
  {
    source: metadata[:source],
    filename: metadata[:filename],
    line: metadata[:line],
    base_url: metadata[:base_url],
    found: found,
  }
end
# Synchronizes the cache with the detected URL set for one link type:
# prunes stale entries, then returns the URLs that still need checking.
def detect_url_changes(urls_detected, type)
  determine_deletions(urls_detected, type)
  determine_additions(urls_detected, type)
end
# Computes which detected URLs of the given type are new (or need a
# recheck) relative to the cache, logs a pluralized summary, and returns
# them. Reconstructed from left-truncated source text.
def determine_additions(urls_detected, type)
  additions = type == :external ? determine_external_additions(urls_detected) : determine_internal_additions(urls_detected)

  new_link_count = additions.length
  new_link_text = pluralize(new_link_count, "new #{type} link", "new #{type} links")
  @logger.log(:debug, "Adding #{new_link_text} to the cache")

  additions
end
# Removes cache entries for the given type that either fell outside the
# configured timeframe or are no longer present in the detected URL set,
# logging a summary of how many were dropped.
# Reconstructed from left-truncated source text.
def determine_deletions(urls_detected, type)
  deletions = 0

  @cache_log[type].delete_if do |url, cache|
    expired_timeframe = type == :external ? !within_external_timeframe?(cache[:time]) : !within_internal_timeframe?(cache[:time])

    if expired_timeframe
      @logger.log(:debug, "Removing #{url} from #{type} cache (expired timeframe)")
      deletions += 1
      true
    elsif urls_detected.include?(url)
      # still being checked — keep it cached
      false
    elsif url_matches_type?(url, type)
      @logger.log(:debug, "Removing #{url} from #{type} cache (not detected anymore)")
      deletions += 1
      true
    end
  end

  del_link_text = pluralize(deletions, "outdated #{type} link", "outdated #{type} links")
  @logger.log(:debug, "Removing #{del_link_text} from the cache")
end
# Returns the subset of detected external URLs that must be (re)checked:
# URLs absent from the cache, plus cached URLs whose last check failed.
# Reconstructed from left-truncated source text.
def determine_external_additions(urls_detected)
  urls_detected.reject do |url, _metadata|
    if @cache_log[:external].include?(url)
      found = @cache_log[:external][url][:found] # if this is false, we're trying again
      @logger.log(:debug, "Adding #{url} to external cache (not found)") unless found
      found
    else
      @logger.log(:debug, "Adding #{url} to external cache")
      false
    end
  end
end
# Works out which internal URLs — or which metadata entries within an
# already-cached URL — are missing from the cache or previously failed,
# and therefore need rechecking. Metadata entries slated for recheck are
# removed from the cached list. Reconstructed from left-truncated source.
def determine_internal_additions(urls_detected)
  urls_detected.each_with_object({}) do |(url, detected_metadata), hsh|
    # url is not even in cache
    if @cache_log[:internal][url].nil?
      @logger.log(:debug, "Adding #{url} to internal cache")
      hsh[url] = detected_metadata
      next
    end

    # detect metadata additions
    # NOTE: the time-stamp for the whole url key will not be updated,
    # so that it reflects the earliest time any of the metadata was checked
    cache_metadata = @cache_log[:internal][url][:metadata]
    metadata_additions = detected_metadata.reject do |detected|
      existing_cache_metadata = cache_metadata.find { |cached, _| cached[:filename] == detected[:filename] }

      # cache for this url, from an existing path, exists as found
      found = !existing_cache_metadata.nil? && !existing_cache_metadata.empty? && existing_cache_metadata[:found]
      @logger.log(:debug, "Adding #{detected} to internal cache for #{url}") unless found
      found
    end

    next if metadata_additions.empty?

    hsh[url] = metadata_additions
    # remove from the cache the detected metadata additions as they correspond to failures to be rechecked
    # (this works assuming the detected url metadata have "found" set to false)
    @cache_log[:internal][url][:metadata] = cache_metadata.difference(metadata_additions)
  end
end
# True when the cache holds no internal and no external entries
# (or was never populated at all).
def empty?
  return true if blank?(@cache_log)

  @cache_log[:internal].empty? && @cache_log[:external].empty?
end
# Produces a normalized form of the URL via Addressable.
# The truncated constant `sable::URI` was restored to `Addressable::URI`,
# the URI library this project uses.
def escape_unescape(url)
  Addressable::URI.parse(url).normalize.to_s
end
def initialize(runner, options)
def initialize(runner, options) @runner = runner @logger = @runner.logger @cache_datetime = Time.now @cache_time = @cache_datetime.to_time if blank?(options) define_singleton_method(:enabled?) { false } define_singleton_method(:external_enabled?) { false } define_singleton_method(:internal_enabled?) { false } else # we still consider the cache as enabled, regardless of the specic timeframes define_singleton_method(:enabled?) { true } setup_cache!(options) @external_timeframe = parsed_timeframe(options[:timeframe][:external]) define_singleton_method(:external_enabled?) { !@external_timeframe.nil? } @internal_timeframe = parsed_timeframe(options[:timeframe][:internal]) define_singleton_method(:internal_enabled?) { !@internal_timeframe.nil? } end end
# Converts a shorthand timeframe string ("1M", "2w", "30d", "12h") into
# the Time marking the start of the caching window, or nil when no
# timeframe is configured.
#
# Raises ArgumentError when the string is malformed or uses an unknown
# unit letter. (Previously a string with no digit+letter pair crashed
# with NoMethodError on `nil.captures`; it now raises a clear error.)
def parsed_timeframe(timeframe)
  return nil if timeframe.nil?

  match = timeframe.match(/(\d+)(\D)/)
  raise ArgumentError, "#{timeframe} is not a valid timeframe!" if match.nil?

  time, date = match.captures
  time = time.to_i

  case date
  when "M"
    time_ago(time, :months)
  when "w"
    time_ago(time, :weeks)
  when "d"
    time_ago(time, :days)
  when "h"
    time_ago(time, :hours)
  else
    raise ArgumentError, "#{date} is not a valid timeframe!"
  end
end
# Normalizes the detected URLs into cache-key form and returns the
# subset that still needs to be checked for the given type.
def retrieve_urls(urls_detected, type)
  # nothing detected — nothing to look up
  return {} if urls_detected.empty?

  normalized = urls_detected.transform_keys { |url| cleaned_url(url) }
  detect_url_changes(normalized, type)
end
# Loads the on-disk JSON cache into @cache_log, creating the storage
# directory as needed. Falls back to a fresh, empty structure when the
# file is absent, blank, versionless (old format), or from a different
# cache version. Reconstructed from left-truncated source text.
def setup_cache!(options)
  default_structure = {
    version: CACHE_VERSION,
    internal: {},
    external: {},
  }

  storage_dir = options[:storage_dir] || DEFAULT_STORAGE_DIR
  FileUtils.mkdir_p(storage_dir) unless Dir.exist?(storage_dir)

  cache_file_name = options[:cache_file] || DEFAULT_CACHE_FILE_NAME

  @cache_file = File.join(storage_dir, cache_file_name)

  return (@cache_log = default_structure) unless File.exist?(@cache_file)

  contents = File.read(@cache_file)

  return (@cache_log = default_structure) if blank?(contents)

  log = JSON.parse(contents, symbolize_names: true)

  old_cache = (cache_version = log[:version]).nil?

  @cache_log = if old_cache # previous cache version, create a new one
    default_structure
  elsif cache_version != CACHE_VERSION
    # the cache version is newer/different — we cannot trust it, start fresh
    default_structure
  else
    # symbolize_names converted URL keys to symbols; keys must be strings
    log[:internal] = log[:internal].transform_keys(&:to_s)
    log[:external] = log[:external].transform_keys(&:to_s)
    log
  end
end
# Number of cached entries for the given link type (:internal/:external).
def size(type)
  @cache_log[type].length
end
# Computes the Time `measurement` units in the past, relative to when the
# cache object was created. Reconstructed from left-truncated source; the
# trailing `_time` fragment was restored as `.to_time` on the case result.
def time_ago(measurement, unit)
  case unit
  when :months
    @cache_datetime - (SECONDS_PER_MONTH * measurement)
  when :weeks
    @cache_datetime - (SECONDS_PER_WEEK * measurement)
  when :days
    @cache_datetime - (SECONDS_PER_DAY * measurement)
  when :hours
    @cache_datetime - Rational(SECONDS_PER_HOUR * measurement)
  end.to_time
end
# Does the URL belong in the given cache bucket? Internal URLs are those
# NOT matching URI_REGEXP; external ones match it. Returns true on a
# match, nil otherwise. The truncated source was missing the `return`
# keywords, without which the first check's result was discarded.
def url_matches_type?(url, type)
  return true if type == :internal && url !~ URI_REGEXP

  return true if type == :external && url =~ URI_REGEXP
end
# True when the cached check time still falls inside the configured
# external-link caching window.
def within_external_timeframe?(time)
  within_timeframe?(time, @external_timeframe)
end
# True when the cached check time still falls inside the configured
# internal-link caching window.
def within_internal_timeframe?(time)
  within_timeframe?(time, @internal_timeframe)
end
# Is current_time inside the window [parsed_timeframe, @cache_time]?
# Accepts a Time or a time string (the JSON cache stores strings).
# Returns false when either bound is missing. Reconstructed from
# left-truncated source (restored `return`/`current_time`/`parsed_timeframe`).
def within_timeframe?(current_time, parsed_timeframe)
  return false if current_time.nil? || parsed_timeframe.nil?

  current_time = Time.parse(current_time) if current_time.is_a?(String)
  (parsed_timeframe..@cache_time).cover?(current_time)
end
# Persists the in-memory cache log to the cache file as JSON.
# No-op when caching is disabled.
def write
  return unless enabled?

  payload = @cache_log.to_json
  File.write(@cache_file, payload)
end