class HTMLProofer::UrlValidator::External
def add_failure(metadata, description, status = nil)
def add_failure(metadata, description, status = nil)
  if blank?(metadata) # possible if we're checking an array of links
    @failed_checks << Failure.new("", "Links > External", description, status: status)
  else
    metadata.each do |m|
      @failed_checks << Failure.new(m[:filename], "Links > External", description, line: m[:line], status: status)
    end
  end
end
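To make the two branches concrete, here is a hedged sketch of the metadata shape the else branch iterates over; the filenames and line numbers are made up:

# Hypothetical metadata, shaped as the method consumes it: one hash per
# occurrence of the link, carrying the source file and line.
metadata = [
  { filename: "docs/index.html", line: 42 },
  { filename: "docs/about.html", line: 7 },
]
# Each entry becomes one Failure tagged "Links > External"; blank
# metadata yields a single Failure with no filename.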
def check_hash_in_2xx_response(href, url, response, filenames)
Even though the response was a success, we may have been asked to check
if the hash on the URL exists and matches.
def check_hash_in_2xx_response(href, url, response, filenames)
  return false if @runner.options[:only_4xx]
  return false unless @runner.options[:check_external_hash]
  return false unless url.hash?

  hash = url.hash

  headers = response.options.fetch(:headers, {})
  content_type = headers.find { |k, _| k.casecmp("content-type").zero? }

  # attempt to verify PDF hash ref; see #787 for more details
  # FIXME: this is re-reading the PDF response
  if content_type && content_type[1].include?("pdf")
    io = URI.parse(url.to_s).open
    reader = PDF::Reader.new(io)
    pages = reader.pages

    if hash =~ /\Apage=(\d+)\z/
      page = Regexp.last_match[1].to_i

      unless pages[page - 1]
        msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not"
        add_failure(filenames, msg, response.code)
        @cache.add_external(href, filenames, response.code, msg, false)
      end

      return true
    end
  end

  body_doc = create_nokogiri(response.body)

  unencoded_hash = Addressable::URI.unescape(hash)
  xpath = [%(//*[@name="#{hash}"]|//*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])]
  # user-content is a special addition by GitHub.
  if url.host =~ /github\.com/i
    xpath << [%(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])]
    # when linking to a file on GitHub, like #L12-L34, only the first "L" portion
    # will be identified as a linkable portion
    xpath << [%(//td[@id="#{Regexp.last_match[1]}"])] if hash =~ /\A(L\d)+/
  end

  return unless body_doc.xpath(xpath.join("|")).empty?

  msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not"
  add_failure(filenames, msg, response.code)
  @cache.add_external(href, filenames, response.code, msg, false)
  true
end
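As a rough illustration of the anchor lookup above, run against a made-up document: a fragment is satisfied by any element whose id or name matches it, and on github.com the user-content-prefixed variants GitHub injects also count:

require "nokogiri"

# Illustrative only: the same XPath shape the validator builds, applied
# to an invented GitHub-style document.
hash = "installation"
doc = Nokogiri::HTML(%(<h2 id="user-content-installation">Installation</h2>))
xpath = %(//*[@id="#{hash}"]|//*[@id="user-content-#{hash}"])
puts doc.xpath(xpath).empty? ? "hash missing" : "hash found" # => hash found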
def handle_connection_failure(href, metadata, response_code, status_message)
def handle_connection_failure(href, metadata, response_code, status_message)
  msgs = [<<~MSG,
    External link #{href} failed with something very wrong.
    It's possible libcurl couldn't connect to the server, or perhaps the request timed out.
    Sometimes, making too many requests at once also breaks things.
  MSG
  ]

  msgs << "Either way, the return message from the server is: #{status_message}" unless blank?(status_message)

  msg = msgs.join("\n").chomp

  @cache.add_external(href, metadata, 0, msg, false)

  return if @runner.options[:only_4xx]

  add_failure(metadata, msg, response_code)
end
def handle_timeout(href, filenames, response_code)
def handle_timeout(href, filenames, response_code)
  msg = "External link #{href} failed: got a time out (response code #{response_code})"
  @cache.add_external(href, filenames, 0, msg, false)

  return if @runner.options[:only_4xx]

  add_failure(filenames, msg, response_code)
end
def initialize(runner, external_urls)
def initialize(runner, external_urls)
  super(runner)

  @external_urls = external_urls
  @hydra = Typhoeus::Hydra.new(@runner.options[:hydra])
  @before_request = []
  @paths_with_queries = {}
end
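Both option groups here come straight from the runner's options hash; a hedged sketch of what they might contain (the values are illustrative, though max_concurrency and followlocation are genuine Typhoeus settings):

# Illustrative options, assuming they reach the runner unmodified:
options = {
  hydra: { max_concurrency: 50 },                   # handed to Typhoeus::Hydra.new
  typhoeus: { followlocation: true, timeout: 30 },  # merged per request in queue_request
}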
def new_url_query_values?(url)
def new_url_query_values?(url)
  return true if (query_values = url.query_values).nil?

  queries = query_values.keys.join("-")
  domain_path = url.domain_path
  if @paths_with_queries[domain_path].nil?
    @paths_with_queries[domain_path] = [queries]
    true
  elsif !@paths_with_queries[domain_path].include?(queries)
    @paths_with_queries[domain_path] << queries
    true
  else
    false
  end
end
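To see the deduplication in action, here is a standalone simulation of the same bookkeeping, assuming `url.domain_path` is roughly host plus path (the lambda is not part of the class):

require "addressable/uri"

# Standalone sketch: returns true when a URL's query-key set is new
# for its host + path, mirroring new_url_query_values? above.
paths_with_queries = {}
new_query_keys = lambda do |raw|
  url = Addressable::URI.parse(raw)
  return true if (query_values = url.query_values).nil?

  queries = query_values.keys.join("-")
  key = "#{url.host}#{url.path}" # stand-in for url.domain_path
  seen = paths_with_queries[key] ||= []
  return false if seen.include?(queries)

  seen << queries
  true
end

new_query_keys.call("https://example.com/search?q=a")        # => true  (checked)
new_query_keys.call("https://example.com/search?q=b")        # => false (same keys, skipped)
new_query_keys.call("https://example.com/search?q=a&page=2") # => true  (new key set)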
def queue_request(method, url, filenames)
def queue_request(method, url, filenames)
  opts = @runner.options[:typhoeus].merge(method: method)
  request = Typhoeus::Request.new(url.url, opts)
  @before_request.each do |callback|
    callback.call(request)
  end
  request.on_complete { |response| response_handler(response, url, filenames) }
  @hydra.queue(request)
end
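Because every callback in @before_request receives the Typhoeus::Request before it is queued, callers can decorate outgoing requests; a hedged sketch (the header and env var are invented):

# Hypothetical callback: attach an auth header to every external request.
# Typhoeus::Request#options is a plain mutable hash.
@before_request << lambda do |request|
  request.options[:headers] ||= {}
  request.options[:headers]["Authorization"] = "Bearer #{ENV.fetch("MY_TOKEN", "")}"
end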
def response_handler(response, url, filenames)
def response_handler(response, url, filenames)
  method = response.request.options[:method]
  href = response.request.base_url.to_s
  response_code = response.code
  response.body.delete!("\x00")

  @logger.log(:debug, "Received a #{response_code} for #{href}")

  return if @runner.options[:ignore_status_codes].include?(response_code)

  if response_code.between?(200, 299)
    @cache.add_external(href, filenames, response_code, "OK", true) unless check_hash_in_2xx_response(
      href, url, response, filenames,
    )
  elsif response.timed_out?
    handle_timeout(href, filenames, response_code)
  elsif response_code.zero?
    handle_connection_failure(href, filenames, response_code, response.status_message)
  elsif method == :head # some servers don't support HEAD
    queue_request(:get, url, filenames)
  else
    return if @runner.options[:only_4xx] && !response_code.between?(400, 499)

    # Received a non-successful http response.
    status_message = blank?(response.status_message) ? "" : ": #{response.status_message}"
    msg = "External link #{href} failed#{status_message}"
    add_failure(filenames, msg, response_code)
    @cache.add_external(href, filenames, response_code, msg, false)
  end
end
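One consequence of the early return on :ignore_status_codes is that matching responses are neither cached nor reported; assuming the option passes through the runner unchanged, rate-limited hosts can be waved through like so:

# Illustrative configuration: a 429 (or 301) now exits response_handler
# before any Failure is recorded or cached.
options = { ignore_status_codes: [301, 429] }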
def run_external_link_checker(external_urls)
Proofer runs faster if we pull out all the external URLs and run the checks
at the end. Otherwise, we're halting the consuming process for every file during
`process_files`.

In addition, sorting the list lets libcurl keep connections to the same hosts alive.

Finally, we'll first make a HEAD request, rather than GETing all the contents.
If the HEAD fails, we'll fall back to GET, as some servers are not configured
for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
not available to us.
def run_external_link_checker(external_urls)
  # Route log from Typhoeus/Ethon to our own logger
  Ethon.logger = @logger

  external_urls.each_pair do |external_url, metadata|
    url = Attribute::Url.new(@runner, external_url, base_url: nil)

    unless url.valid?
      add_failure(metadata, "#{url} is an invalid URL", 0)
      next
    end

    next unless new_url_query_values?(url)

    method = if @runner.options[:check_external_hash] && url.hash?
      :get
    else
      :head
    end

    queue_request(method, url, metadata)
  end

  @hydra.run
end
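For reference, a hedged sketch of the external_urls shape this loop consumes: each href maps to the metadata recorded everywhere it appeared (filenames and lines are made up):

# Hypothetical input for run_external_link_checker:
external_urls = {
  "https://example.com/" => [{ filename: "index.html", line: 12 }],
  "https://example.com/docs#setup" => [
    { filename: "index.html", line: 30 },
    { filename: "about.html", line: 5 },
  ],
}
# With check_external_hash enabled, the second URL goes out as a GET so
# its "#setup" anchor can be verified; the first is a cheaper HEAD.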
def validate
def validate
  urls_to_check = @cache.external_enabled? ? @runner.load_external_cache : @external_urls
  urls_detected = pluralize(urls_to_check.count, "external link", "external links")
  @logger.log(:info, "Checking #{urls_detected}")
  run_external_link_checker(urls_to_check)
  @failed_checks
end
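Putting it together, a hedged driver sketch; the runner setup is elided because it depends on HTMLProofer internals, and the Failure attribute names are assumed from the Failure.new calls in add_failure:

# Hypothetical usage, assuming a configured `runner` and an
# `external_urls` hash are already in hand:
validator = HTMLProofer::UrlValidator::External.new(runner, external_urls)
validator.validate.each do |failure|
  # path / line / description / status mirror the Failure.new arguments above
  puts "#{failure.path}:#{failure.line} #{failure.description} (#{failure.status})"
end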