class HTMLProofer::UrlValidator::External

def add_failure(metadata, description, status = nil)

def add_failure(metadata, description, status = nil)
  if blank?(metadata) # possible if we're checking an array of links
    @failed_checks << Failure.new("", "Links > External", description, status: status)
  else
    metadata.each do |m|
      @failed_checks << Failure.new(m[:filename], "Links > External", description, line: m[:line], status: status)
    end
  end
end
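
For illustration, a hypothetical call (the filenames and line numbers here are invented): `metadata` is an array of per-occurrence hashes carrying `:filename` and `:line`, so one Failure is recorded for each place the link appears, all grouped under the same check name.

# hypothetical invocation; the metadata shape mirrors what callers in this class pass in
metadata = [
  { filename: "docs/index.html", line: 12 },
  { filename: "docs/about.html", line: 40 },
]
add_failure(metadata, "External link https://example.com failed", 404)
# => two Failure entries, both grouped under "Links > External"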

def check_hash_in_2xx_response(href, url, response, filenames)

Even though the response was a success, we may have been asked to check
if the hash on the URL exists on the page.
def check_hash_in_2xx_response(href, url, response, filenames)
  return false if @runner.options[:only_4xx]
  return false unless @runner.options[:check_external_hash]
  return false unless url.hash?
  hash = url.hash
  headers = response.options.fetch(:headers, {})
  content_type = headers.find { |k, _| k.casecmp("content-type").zero? }
  # attempt to verify PDF hash ref; see #787 for more details
  # FIXME: this is re-reading the PDF response
  if content_type && content_type[1].include?("pdf")
    io = URI.parse(url.to_s).open
    reader = PDF::Reader.new(io)
    pages = reader.pages
    if hash =~ /\Apage=(\d+)\z/
      page = Regexp.last_match[1].to_i
      unless pages[page - 1]
        msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not"
        add_failure(filenames, msg, response.code)
        @cache.add_external(href, filenames, response.code, msg, false)
      end
      return true
    end
  end
  body_doc = create_nokogiri(response.body)
  unencoded_hash = Addressable::URI.unescape(hash)
  xpath = [%(//*[@name="#{hash}"]|//*[@name="#{unencoded_hash}"]|//*[@id="#{hash}"]|//*[@id="#{unencoded_hash}"])]
  # user-content is a special addition by GitHub.
  if url.host =~ /github\.com/i
    xpath << [%(//*[@name="user-content-#{hash}"]|//*[@id="user-content-#{hash}"])]
    # when linking to a file on GitHub, like #L12-L34, only the first "L" portion
    # will be identified as a linkable portion
    xpath << [%(//td[@id="#{Regexp.last_match[1]}"])] if hash =~ /\A(L\d+)/
  end
  return unless body_doc.xpath(xpath.join("|")).empty?
  msg = "External link #{href} failed: #{url.without_hash} exists, but the hash '#{hash}' does not"
  add_failure(filenames, msg, response.code)
  @cache.add_external(href, filenames, response.code, msg, false)
  true
end
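
To make the anchor lookup concrete, here is a self-contained Nokogiri sketch (the HTML body is invented) showing which elements satisfy the XPath expressions built above, including GitHub's user-content- prefix:

require "nokogiri"

# hypothetical response body, illustrating the selectors built above
body = <<~HTML
  <h2 id="install">Install</h2>
  <a name="user-content-usage"></a>
HTML
doc = Nokogiri::HTML(body)
doc.xpath(%(//*[@id="install"])).empty?              # => false: hash "install" resolves
doc.xpath(%(//*[@name="user-content-usage"])).empty? # => false: GitHub-prefixed anchor resolves
doc.xpath(%(//*[@id="missing"])).empty?              # => true: this hash would be reported as a failure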

def handle_connection_failure(href, metadata, response_code, status_message)

def handle_connection_failure(href, metadata, response_code, status_message)
  msgs = [<<~MSG,
    External link #{href} failed with something very wrong.
    It's possible libcurl couldn't connect to the server, or perhaps the request timed out.
    Sometimes, making too many requests at once also breaks things.
  MSG
  ]
  msgs << "Either way, the return message from the server is: #{status_message}" unless blank?(status_message)
  msg = msgs.join("\n").chomp
  @cache.add_external(href, metadata, 0, msg, false)
  return if @runner.options[:only_4xx]
  add_failure(metadata, msg, response_code)
end

def handle_timeout(href, filenames, response_code)

def handle_timeout(href, filenames, response_code)
  msg = "External link #{href} failed: got a time out (response code #{response_code})"
  @cache.add_external(href, filenames, 0, msg, false)
  return if @runner.options[:only_4xx]
  add_failure(filenames, msg, response_code)
end

def initialize(runner, external_urls)

def initialize(runner, external_urls)
  super(runner)
  @external_urls = external_urls
  @hydra = Typhoeus::Hydra.new(@runner.options[:hydra])
  @before_request = []
  @paths_with_queries = {}
end
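
The `@before_request` array holds callbacks that `queue_request` (below) invokes with each Typhoeus request before queueing it. A minimal sketch of how a hook could be registered, assuming a hypothetical `before_request` accessor and placeholder `checker`/`token` variables:

# hypothetical accessor; each registered block receives the Typhoeus::Request
def before_request(&block)
  @before_request << block if block_given?
  @before_request
end

# usage sketch: inject an auth header into every outbound request
checker.before_request do |request|
  request.options[:headers] = (request.options[:headers] || {}).merge("Authorization" => "Bearer #{token}")
end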

def new_url_query_values?(url)

remember queries we've seen, ignore future ones
def new_url_query_values?(url)
  return true if (query_values = url.query_values).nil?

  queries = query_values.keys.join("-")
  domain_path = url.domain_path
  if @paths_with_queries[domain_path].nil?
    @paths_with_queries[domain_path] = [queries]
    true
  elsif !@paths_with_queries[domain_path].include?(queries)
    @paths_with_queries[domain_path] << queries
    true
  else
    false
  end
end
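
In effect, query strings are deduplicated per domain-plus-path by their set of query keys: the first URL with a given key set is checked, and later URLs repeating the same keys on the same path are skipped. A hypothetical walk-through, assuming each `url` below has a `domain_path` of "example.com/search":

new_url_query_values?(url_a) # https://example.com/search?q=ruby     => true  (first key set, "q")
new_url_query_values?(url_b) # https://example.com/search?q=html     => false (key set "q" already seen)
new_url_query_values?(url_c) # https://example.com/search?q=x&page=2 => true  (new key set, "q-page")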

def queue_request(method, url, filenames)

def queue_request(method, url, filenames)
  opts = @runner.options[:typhoeus].merge(method: method)
  request = Typhoeus::Request.new(url.url, opts)
  @before_request.each do |callback|
    callback.call(request)
  end
  request.on_complete { |response| response_handler(response, url, filenames) }
  @hydra.queue(request)
end
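
The per-request options come straight from `@runner.options[:typhoeus]`, merged with the chosen HTTP method. A sketch of what such a configuration might look like (the values are illustrative, but the keys are standard Typhoeus/libcurl and Hydra options):

options = {
  typhoeus: {
    followlocation: true, # follow 3xx redirects
    connecttimeout: 10,   # seconds allowed to establish the connection
    timeout: 30,          # seconds allowed for the whole request
    headers: { "User-Agent" => "html-proofer" },
  },
  hydra: { max_concurrency: 50 }, # parallelism for the Typhoeus::Hydra queue
}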

def response_handler(response, url, filenames)

def response_handler(response, url, filenames)
  method = response.request.options[:method]
  href = response.request.base_url.to_s
  response_code = response.code
  response.body.delete!("\x00") # strip embedded NUL bytes before logging or parsing the body
  @logger.log(:debug, "Received a #{response_code} for #{href}")
  return if @runner.options[:ignore_status_codes].include?(response_code)
  if response_code.between?(200, 299)
    @cache.add_external(href, filenames, response_code, "OK", true) unless check_hash_in_2xx_response(
      href,
      url,
      response,
      filenames,
    )
  elsif response.timed_out?
    handle_timeout(href, filenames, response_code)
  elsif response_code.zero?
    handle_connection_failure(href, filenames, response_code, response.status_message)
  elsif method == :head # some servers don't support HEAD
    queue_request(:get, url, filenames)
  else
    return if @runner.options[:only_4xx] && !response_code.between?(400, 499)
    # Received a non-successful http response.
    status_message = blank?(response.status_message) ? "" : ": #{response.status_message}"
    msg = "External link #{href} failed#{status_message}"
    add_failure(filenames, msg, response_code)
    @cache.add_external(href, filenames, response_code, msg, false)
  end
end
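
For reference, a quick standalone probe (the URL is a placeholder) showing the Typhoeus response fields this handler branches on:

response = Typhoeus.get("https://example.invalid", connecttimeout: 2)
response.code           # => 0 for connection-level failures, otherwise the HTTP status
response.timed_out?     # => true when libcurl hit the timeout
response.status_message # => nil, or the reason phrase sent by the server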

def run_external_link_checker(external_urls)

Proofer runs faster if we pull out all the external URLs and run the checks
at the end. Otherwise, we're halting the consuming process for every file during
`process_files`.

In addition, sorting the list lets libcurl keep connections to the same hosts alive.

Finally, we'll first make a HEAD request, rather than GETing all the contents.
If the HEAD fails, we'll fall back to GET, as some servers are not configured
for HEAD. If we've decided to check for hashes, we must do a GET--HEAD is
not available as an option.
def run_external_link_checker(external_urls)
  # Route log from Typhoeus/Ethon to our own logger
  Ethon.logger = @logger
  external_urls.each_pair do |external_url, metadata|
    url = Attribute::Url.new(@runner, external_url, base_url: nil)
    unless url.valid?
      add_failure(metadata, "#{url} is an invalid URL", 0)
      next
    end
    next unless new_url_query_values?(url)
    method = if @runner.options[:check_external_hash] && url.hash?
      :get
    else
      :head
    end
    queue_request(method, url, metadata)
  end
  @hydra.run
end
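
The expected shape of `external_urls` (values invented here), as consumed by `each_pair` above and by `add_failure`: a hash mapping each external URL to the metadata for every place it occurs.

external_urls = {
  "https://example.com/docs#install" => [
    { filename: "site/index.html", line: 12 },
    { filename: "site/faq.html", line: 3 },
  ],
}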

def validate

def validate
  urls_to_check = @cache.external_enabled? ? @runner.load_external_cache : @external_urls
  urls_detected = pluralize(urls_to_check.count, "external link", "external links")
  @logger.log(:info, "Checking #{urls_detected}")
  run_external_link_checker(urls_to_check)
  @failed_checks
end
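
Putting it together, a minimal usage sketch, assuming a configured runner and that Failure exposes readers for the fields passed to Failure.new above:

checker = HTMLProofer::UrlValidator::External.new(runner, external_urls)
failures = checker.validate
failures.each do |failure|
  # field names assumed from the Failure.new calls in add_failure
  puts "#{failure.path}:#{failure.line} #{failure.description} (#{failure.status})"
end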