# frozen_string_literal: true

require "typhoeus"
require "open-uri"
require "pdf-reader"

module HTMLProofer
  class UrlValidator
    class External < UrlValidator
      include HTMLProofer::Utils

      attr_reader :external_urls
      attr_writer :before_request
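
      # Each callback assigned to `before_request` receives the
      # Typhoeus::Request before it is queued (see #queue_request below), and
      # can mutate it, e.g. to add headers. A minimal sketch, assuming a
      # `validator` instance and a placeholder token:
      #
      #   validator.before_request = [
      #     ->(request) { request.options[:headers]["Authorization"] = "Bearer <token>" },
      #   ]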

      def initialize(runner, external_urls)
        super(runner)

        @external_urls = external_urls
        @hydra = Typhoeus::Hydra.new(@runner.options[:hydra])
        @before_request = []
        @paths_with_queries = {}
      end
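
      # Runs every queued external URL through the checker. When the external
      # cache is enabled, the list to check is loaded via the cache; otherwise
      # every collected external URL is checked. Returns the accumulated
      # failures.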
      def validate
        urls_to_check = @cache.external_enabled? ? @runner.load_external_cache : @external_urls
        urls_detected = pluralize(urls_to_check.count, "external link", "external links")
        @logger.log(:info, "Checking #{urls_detected}")

        run_external_link_checker(urls_to_check)

        @failed_checks
      end

      # Proofer runs faster if we pull out all the external URLs and run the
      # checks at the end. Otherwise, we'd be halting the consuming process for
      # every file during `process_files`.
      #
      # In addition, sorting the list lets libcurl keep connections to the same
      # hosts alive.
      #
      # Finally, we'll first make a HEAD request, rather than GETing all the
      # contents. If the HEAD fails, we'll fall back to GET, as some servers
      # are not configured for HEAD. If we've decided to check for hashes, we
      # must do a GET, since HEAD can't return the body we need to inspect.
      def run_external_link_checker(external_urls)
        # Route logging from Typhoeus/Ethon to our own logger
        Ethon.logger = @logger

        external_urls.each_pair do |external_url, metadata|
          url = Attribute::Url.new(@runner, external_url, base_url: nil)

          unless url.valid?
            add_failure(metadata, "#{url} is an invalid URL", 0)
            next
          end
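
          # Skip this URL if we've already queued one with the same domain,
          # path, and query-parameter keys (see #new_url_query_values? below).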
          next unless new_url_query_values?(url)

          method = if @runner.options[:check_external_hash] && url.hash?
            :get
          else
            :head
          end

          queue_request(method, url, metadata)
        end

        @hydra.run
      end
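
      # Builds a Typhoeus::Request for the URL with the configured options,
      # runs any `before_request` callbacks against it, and adds it to the
      # hydra queue; the response is handled in #response_handler.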
      def queue_request(method, url, filenames)
        opts = @runner.options[:typhoeus].merge(method: method)
        request = Typhoeus::Request.new(url.url, opts)
        @before_request.each do |callback|
          callback.call(request)
        end
        request.on_complete { |response| response_handler(response, url, filenames) }
        @hydra.queue(request)
      end
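
      # Inspects a completed response and dispatches on its status: 2xx
      # responses are cached as successes (after an optional hash check),
      # timeouts and connection failures are reported, failed HEAD requests
      # are retried as GET, and anything else is recorded as a failure.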
      def response_handler(response, url, filenames)
        method = response.request.options[:method]
        href = response.request.base_url.to_s
        response_code = response.code
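
        # NUL bytes in the body (possible with binary responses) break later
        # string and HTML processing, so strip them up front.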
        response.body.delete!("\x00")

        @logger.log(:debug, "Received a #{response_code} for #{href}")

        return if @runner.options[:ignore_status_codes].include?(response_code)

        if response_code.between?(200, 299)
          @cache.add_external(href, filenames, response_code, "OK", true) unless check_hash_in_2xx_response(
            href,
            url,
            response,
            filenames,
          )
        elsif response.timed_out?
          handle_timeout(href, filenames, response_code)
        elsif response_code.zero?
          handle_connection_failure(href, filenames, response_code, response.status_message)
        elsif method == :head # some servers don't support HEAD
          queue_request(:get, url, filenames)
        else
          return if @runner.options[:only_4xx] && !response_code.between?(400, 499)

          # Received a non-successful http response.
          status_message = blank?(response.status_message) ? "" : ": #{response.status_message}"
          msg = "External link #{href} failed#{status_message}"
          add_failure(filenames, msg, response_code)
          @cache.add_external(href, filenames, response_code, msg, false)
        end
      end

      # Even though the response was a success, we may have been asked to check
      # whether the hash (fragment) on the URL exists on the page. Returns
      # truthy when this method has fully handled the check, in which case the
      # caller skips its default "OK" caching.
      def check_hash_in_2xx_response(href, url, response, filenames)
        return false if @runner.options[:only_4xx]
        return false unless @runner.options[:check_external_hash]
        return false unless url.hash?

        hash = url.hash
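
        # HTTP header names are case-insensitive, so look up Content-Type
        # without regard to case.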
        headers = response.options.fetch(:headers, {})
        content_type = headers.find { |k, _| k.casecmp("content-type").zero? }

        # attempt to verify PDF hash ref; see #787 for more details
        # FIXME: this is re-reading the PDF response
        if content_type && content_type[1].include?("pdf")
          io = URI.parse(url.to_s).open
          reader = PDF::Reader.new(io)
          pages = reader.pages

          if hash =~ /\Apage=(\d+)\z/
            page = Regexp.last_match[1].to_i

            unless pages[page - 1]
              msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not"
              add_failure(filenames, msg, response.code)
              @cache.add_external(href, filenames, response.code, msg, false)
            end

            return true
          end
        end

        body_doc = create_nokogiri(response.body)
        unencoded_hash = Addressable::URI.unescape(hash)
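
        # Look for the fragment as either a `name` or an `id`, in both its
        # percent-encoded and decoded forms.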
        xpath = [%(//*[@name="#{hash}"]|//*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])]

        # user-content is a special addition by GitHub.
        if url.host =~ /github\.com/i
          xpath << [%(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])]

          # when linking to a file on GitHub, like #L12-L34, only the first "L"
          # portion will be identified as a linkable portion
          xpath << [%(//td[@id="#{Regexp.last_match[1]}"])] if hash =~ /\A(L\d+)/
        end

        return false unless body_doc.xpath(xpath.join("|")).empty?

        msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not"
        add_failure(filenames, msg, response.code)
        @cache.add_external(href, filenames, response.code, msg, false)
        true
      end
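
      # Timeouts are cached with a 0 status and reported as failures unless
      # `only_4xx` is set.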
      def handle_timeout(href, filenames, response_code)
        msg = "External link #{href} failed: got a timeout (response code #{response_code})"
        @cache.add_external(href, filenames, 0, msg, false)

        return if @runner.options[:only_4xx]

        add_failure(filenames, msg, response_code)
      end
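
      # A response code of 0 means libcurl never received an HTTP response at
      # all, typically a DNS failure, refused connection, or SSL problem.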
      def handle_connection_failure(href, metadata, response_code, status_message)
        msgs = [<<~MSG,
          External link #{href} failed with something very wrong.
          It's possible libcurl couldn't connect to the server, or perhaps the request timed out.
          Sometimes, making too many requests at once also breaks things.
        MSG
        ]

        msgs << "Either way, the return message from the server is: #{status_message}" unless blank?(status_message)

        msg = msgs.join("\n").chomp

        @cache.add_external(href, metadata, 0, msg, false)

        return if @runner.options[:only_4xx]

        add_failure(metadata, msg, response_code)
      end
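
      # Records a Failure for each file that referenced the URL; when the
      # metadata is blank (possible when checking a raw array of links), a
      # single failure with no filename is recorded instead.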
      def add_failure(metadata, description, status = nil)
        if blank?(metadata) # possible if we're checking an array of links
          @failed_checks << Failure.new("", "Links > External", description, status: status)
        else
          metadata.each do |m|
            @failed_checks << Failure.new(m[:filename], "Links > External", description, line: m[:line], status: status)
          end
        end
      end

      # Remember the query-parameter keys we've seen for each domain and path,
      # and ignore URLs that repeat the same shape, so that e.g.
      # example.com/page?a=1 and example.com/page?a=2 are only checked once.
      private def new_url_query_values?(url)
        return true if (query_values = url.query_values).nil?

        queries = query_values.keys.join("-")
        domain_path = url.domain_path
        if @paths_with_queries[domain_path].nil?
          @paths_with_queries[domain_path] = [queries]
          true
        elsif !@paths_with_queries[domain_path].include?(queries)
          @paths_with_queries[domain_path] << queries
          true
        else
          false
        end
      end
    end
  end
end