# lib/html_proofer/attribute/url.rb



# frozen_string_literal: true

module HTMLProofer
  class Attribute
    # A URL taken from an HTML attribute. Exposes the cleaned-up URL string,
    # its parsed components, classification helpers (remote / internal / hash
    # link ...) and resolution of internal links to files on disk.
    #
    # NOTE(review): `@runner`, `@raw_attribute` and `blank?` are not defined in
    # this class; presumably they are provided by HTMLProofer::Attribute (via
    # `super` in #initialize) — confirm.
    class Url < HTMLProofer::Attribute
      attr_reader :url, :size, :source, :filename

      # Schemes that #remote? treats as remote.
      REMOTE_SCHEMES = ["http", "https"].freeze

      # All arguments are forwarded unchanged to the superclass via bare `super`.
      #
      # base_url:     - when not blank, the URL is joined onto it
      # source:       - stored as-is; consulted by #full_path for root-relative paths
      # filename:     - stored as-is; consulted by #full_path / #absolute_path
      # extract_size: - when true, the value is split on whitespace and the first
      #                 two tokens become the URL and #size
      def initialize(runner, link_attribute, base_url: nil, source: nil, filename: nil, extract_size: false)
        super

        @source = source
        @filename = filename

        if @raw_attribute.nil?
          @url = nil
        else
          # drop zero-width spaces (U+200B) and surrounding whitespace
          @url = @raw_attribute.delete("\u200b").strip
          @url, @size = @url.split(/\s+/) if extract_size
          # Splitting an empty value yields no tokens and leaves @url nil. Fall
          # back to "" *before* joining: Addressable::URI.join raises a
          # TypeError for non-string arguments.
          @url ||= ""
          @url = Addressable::URI.join(base_url, @url).to_s unless blank?(base_url)

          swap_urls!
          clean_url!
        end
      end

      # True when the URL starts with "//".
      def protocol_relative?
        url.start_with?("//")
      end

      # The cleaned URL (nil when the raw attribute was nil).
      def to_s
        @url
      end

      # True for hash links, for paths ending in "/", and for paths whose
      # extension (or, lacking one, the configured :assume_extension) is listed
      # in the runner's :extensions option.
      def known_extension?
        return true if hash_link? || path.end_with?("/")

        ext = File.extname(path)
        # no extension means we use the assumed one
        ext = @runner.options[:assume_extension] if blank?(ext)

        @runner.options[:extensions].include?(ext)
      end

      def unknown_extension?
        !known_extension?
      end

      # True for `javascript:` URLs and for raw attribute values matched by the
      # runner's :ignore_urls option. Always returns true or false.
      def ignore?
        return true if /^javascript:/.match?(@url)

        ignores_pattern?(@runner.options[:ignore_urls])
      end

      # True when the URL could be parsed.
      def valid?
        !parts.nil?
      end

      # True when the parsed URL has both a host and a path; false when the URL
      # cannot be parsed.
      def path?
        parsed = parts
        !parsed.nil? && !parsed.host.nil? && !parsed.path.nil?
      end

      # The parsed URL (memoized), or nil when parsing raises.
      def parts
        @parts ||= Addressable::URI.parse(@url)
      rescue URI::Error, Addressable::URI::InvalidURIError
        @parts = nil
      end

      # The unencoded path component, or nil when the URL cannot be parsed.
      def path
        Addressable::URI.unencode(parts.path) unless parts.nil?
      end

      # The URL fragment, or nil.
      #
      # NOTE(review): this overrides Object#hash, so instances are not safe to
      # use as Hash keys or Set members — confirm nothing relies on that.
      def hash
        parts&.fragment
      end

      # Does the URL have a hash?
      def hash?
        !blank?(hash)
      end

      def scheme
        parts&.scheme
      end

      # True when the scheme is one of REMOTE_SCHEMES.
      def remote?
        REMOTE_SCHEMES.include?(scheme)
      end

      def http?
        scheme == "http"
      end

      def https?
        scheme == "https"
      end

      # True when a scheme is present but it is not one of REMOTE_SCHEMES.
      def non_http_remote?
        !scheme.nil? && !remote?
      end

      def host
        parts&.host
      end

      # Host immediately followed by path; a missing host or path contributes
      # an empty string.
      def domain_path
        "#{host}#{path}"
      end

      def query_values
        parts&.query_values
      end

      # checks if a file exists relative to the current pwd
      def exists?
        return true if base64?

        !resolved_path.nil?
      end

      # The file or directory this URL resolves to on disk, or nil. Results
      # (including nil) are cached in the runner's `resolved_paths`, keyed by
      # #absolute_path.
      def resolved_path
        target = absolute_path
        cache = @runner.resolved_paths

        return cache[target] if cache.key?(target)

        assumed_extension = @runner.options[:assume_extension]
        # extensionless URLs
        with_extension = "#{target}#{assumed_extension}"

        resolved = if assumed_extension && File.file?(with_extension)
          with_extension # existence checked implicitly by File.file?
        # implicit index support; a directory without an index file resolves to nil
        elsif File.directory?(target) && !unslashed_directory?(target)
          with_index = File.join(target, @runner.options[:directory_index_file])
          with_index if File.file?(with_index)
        # explicit file or directory
        elsif File.exist?(target)
          target
        end
        cache[target] = resolved

        resolved
      end

      # True when the raw attribute contains a line starting with "data:image".
      def base64?
        /^data:image/.match?(@raw_attribute)
      end

      # #full_path (or the containing file when the URL has no path), expanded
      # against the current working directory.
      def absolute_path
        File.expand_path(full_path || @filename, Dir.pwd)
      end

      # The URL's path joined onto its base directory, or nil when the path is
      # nil or empty.
      def full_path
        return if path.nil? || path.empty?

        base = if absolute_path?(path) # path relative to root
          # either overwrite with root_dir; or, if source is directory, use that; or, just get the source file's dirname
          @runner.options[:root_dir] || (File.directory?(@source) ? @source : File.dirname(@source))
        else
          # path relative to the file where the link is defined
          File.dirname(@filename)
        end

        File.join(base, path)
      end

      # True when `file` is a directory named without a trailing separator and
      # the runner is not configured to follow redirects.
      def unslashed_directory?(file)
        return false unless File.directory?(file)

        !file.end_with?(File::SEPARATOR) && !follow_location?
      end

      # The runner's typhoeus :followlocation setting (falsy when unset).
      def follow_location?
        typhoeus = @runner.options[:typhoeus]
        typhoeus && typhoeus[:followlocation]
      end

      def absolute_path?(path)
        path.start_with?("/")
      end

      # path is external to the file
      def external?
        !internal?
      end

      def internal?
        relative_link? || internal_absolute_link? || hash_link?
      end

      def internal_absolute_link?
        url.start_with?("/")
      end

      # False for remote URLs; otherwise true for hash links, query-only links,
      # and URLs with a line that starts with a non-whitespace character.
      def relative_link?
        return false if remote?

        hash_link? || param_link? || url.start_with?(".") || url.match?(/^\S/)
      end

      # True when the URL is only a fragment or only a query string, i.e. it
      # targets the page it appears on.
      def link_points_to_same_page?
        hash_link? || param_link?
      end

      def hash_link?
        url.start_with?("#")
      end

      def has_hash?
        url.include?("#")
      end

      def param_link?
        url.start_with?("?")
      end

      # The URL with the first "#<fragment>" occurrence removed. The fragment is
      # matched literally, so regexp metacharacters in it are harmless.
      def without_hash
        @url.to_s.sub("##{hash}", "")
      end

      # catch any obvious issues
      private def clean_url!
        parsed = Addressable::URI.parse(@url)
        # only URLs that carry a scheme are normalized
        normalized = (parsed.scheme.nil? ? parsed : parsed.normalize).to_s

        had_slash = @url.end_with?("/")
        has_slash = normalized.end_with?("/")

        # normalize strips this off, which causes issues with cache
        @url = if had_slash && !has_slash
          "#{normalized}/"
        elsif !had_slash && has_slash
          normalized.chop
        else
          normalized
        end
      rescue Addressable::URI::InvalidURIError # rubocop:disable Lint/SuppressedException -- error will be reported at check time
      end

      # Applies every pattern => replacement pair from the runner's :swap_urls
      # option to the URL, in iteration order.
      private def swap_urls!
        replacements = @runner.options[:swap_urls]
        return @url if blank?(replacements)

        replacements.each do |link, replace|
          @url = @url.gsub(link, replace)
        end
      end

      # True when the raw attribute equals one of the String entries or matches
      # one of the Regexp entries. Anything that is not an Array never matches.
      private def ignores_pattern?(links_to_ignore)
        return false unless links_to_ignore.is_a?(Array)

        links_to_ignore.any? do |link_to_ignore|
          case link_to_ignore
          when String then link_to_ignore == @raw_attribute
          when Regexp then link_to_ignore.match?(@raw_attribute)
          else false
          end
        end
      end
    end
  end
end