lib/nokogiri/html5/document.rb



# coding: utf-8
# frozen_string_literal: true

#
#  Copyright 2013-2021 Sam Ruby, Stephen Checkoway
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#

require_relative "../html4/document"

module Nokogiri
  module HTML5
    # Enum for the HTML5 parser quirks mode values. Values returned by HTML5::Document#quirks_mode
    #
    # See https://dom.spec.whatwg.org/#concept-document-quirks for more information on HTML5 quirks
    # mode.
    #
    # Since v1.14.0
    module QuirksMode
      # The document was parsed in "no-quirks" mode.
      NO_QUIRKS = 0
      # The document was parsed in "quirks" mode.
      QUIRKS = 1
      # The document was parsed in "limited-quirks" mode.
      LIMITED_QUIRKS = 2
    end

    # Since v1.12.0
    #
    # 💡 HTML5 functionality is not available when running JRuby.
    class Document < Nokogiri::HTML4::Document
      # Get the URL for this document, as passed into Document.parse, Document.read_io, or
      # Document.read_memory.
      #
      # This method returns +nil+ for a document created without a parser (e.g.,
      # Nokogiri::HTML5::Document.new).
      attr_reader :url

      # Get the parser's quirks mode value. See HTML5::QuirksMode.
      #
      # This method returns +nil+ if the parser was not invoked (e.g., Nokogiri::HTML5::Document.new).
      #
      # Since v1.14.0
      attr_reader :quirks_mode

      class << self
        # :call-seq:
        #   parse(input) { |options| ... } → HTML5::Document
        #   parse(input, url:, encoding:) { |options| ... } → HTML5::Document
        #   parse(input, **options) → HTML5::Document
        #
        # Parse \HTML input with a parser compliant with the HTML5 spec. This method uses the
        # encoding of +input+ if it can be determined, or else falls back to the +encoding:+
        # parameter.
        #
        # [Required Parameters]
        # - +input+ (String | IO) the \HTML content to be parsed.
        #
        # [Optional Parameters]
        # - +url:+ (String) the base URI of the document.
        #
        # [Optional Keyword Arguments]
        # - +encoding:+ (Encoding) The name of the encoding that should be used when processing the
        #   document. When not provided, the encoding will be determined based on the document
        #   content.
        #
        # - +max_errors:+ (Integer) The maximum number of parse errors to record. (default
        #   +Nokogiri::Gumbo::DEFAULT_MAX_ERRORS+ which is currently 0)
        #
        # - +max_tree_depth:+ (Integer) The maximum depth of the parse tree. (default
        #   +Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH+)
        #
        # - +max_attributes:+ (Integer) The maximum number of attributes allowed on an
        #   element. (default +Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES+)
        #
        # - +parse_noscript_content_as_text:+ (Boolean) Whether to parse the content of +noscript+
        #   elements as text. (default +false+)
        #
        # See rdoc-ref:HTML5@Parsing+options for a complete description of these parsing options.
        #
        # [Yields]
        #   If present, the block will be passed a Hash object to modify with parse options before the
        #   input is parsed. See rdoc-ref:HTML5@Parsing+options for a list of available options.
        #
        #   ⚠ Note that +url:+ and +encoding:+ cannot be set by the configuration block.
        #
        # [Returns] Nokogiri::HTML5::Document
        #
        # *Example:* Parse a string with a specific encoding and custom max errors limit.
        #
        #   Nokogiri::HTML5::Document.parse(socket, encoding: "ISO-8859-1", max_errors: 10)
        #
        # *Example:* Parse a string setting the +:parse_noscript_content_as_text+ option using the
        # configuration block parameter.
        #
        #   Nokogiri::HTML5::Document.parse(input) { |c| c[:parse_noscript_content_as_text] = true }
        #
        def parse(
          string_or_io,
          url_ = nil, encoding_ = nil,
          url: url_, encoding: encoding_,
          **options, &block
        )
          # Let the caller's configuration block adjust the parse options before anything else.
          yield(options) unless block.nil?

          # A nil (or false) input is treated as an empty string.
          string_or_io ||= ""

          # Use the input's own encoding unless it is binary; an explicitly given encoding wins.
          carries_encoding =
            string_or_io.respond_to?(:encoding) && string_or_io.encoding != Encoding::ASCII_8BIT
          encoding ||= string_or_io.encoding.name if carries_encoding

          # Inputs answering both #read and #path supply a default URL from their path.
          file_like = string_or_io.respond_to?(:read) && string_or_io.respond_to?(:path)
          url ||= string_or_io.path if file_like

          # Anything that is neither readable nor string-like is rejected.
          if !string_or_io.respond_to?(:read) && !string_or_io.respond_to?(:to_str)
            raise ArgumentError, "not a string or IO object"
          end

          do_parse(string_or_io, url, encoding, **options)
        end

        # Create a new document from an IO object.
        #
        # 💡 Most users should prefer Document.parse to this method.
        def read_io(io, url_ = nil, encoding_ = nil, url: url_, encoding: encoding_, **options)
          # Only objects that can be read from are accepted; everything else is an argument error.
          readable = io.respond_to?(:read)
          raise ArgumentError, "io object doesn't respond to :read" unless readable

          do_parse(io, url, encoding, **options)
        end

        # Create a new document from a String.
        #
        # 💡 Most users should prefer Document.parse to this method.
        def read_memory(string, url_ = nil, encoding_ = nil, url: url_, encoding: encoding_, **options)
          # Only string-like objects (those answering #to_str) are accepted.
          string_like = string.respond_to?(:to_str)
          raise ArgumentError, "string object doesn't respond to :to_str" unless string_like

          do_parse(string, url, encoding, **options)
        end

        private

        def do_parse(string_or_io, url, encoding, **options)
          html = HTML5.read_and_encode(string_or_io, encoding)

          # Fill in every limit the caller left unset (or set to nil/false) with its default.
          options[:max_attributes] ||= Nokogiri::Gumbo::DEFAULT_MAX_ATTRIBUTES
          unless options[:max_errors]
            # +max_parse_errors+ is honoured as a fallback spelling; it is removed from the options
            # only when +max_errors+ itself was not supplied.
            options[:max_errors] = options.delete(:max_parse_errors) || Nokogiri::Gumbo::DEFAULT_MAX_ERRORS
          end
          options[:max_tree_depth] ||= Nokogiri::Gumbo::DEFAULT_MAX_TREE_DEPTH

          # The resulting document's encoding is unconditionally set to UTF-8 before it is returned.
          Nokogiri::Gumbo.parse(html, url, self, **options).tap { |doc| doc.encoding = "UTF-8" }
        end
      end

      def initialize(*args) # :nodoc:
        # Bare +super+ forwards the received arguments (and any block) unchanged to the parent
        # class's initializer.
        super
        # Start both reader-backed attributes at +nil+, so #url and #quirks_mode return +nil+
        # until something assigns them.
        @url = nil
        @quirks_mode = nil
      end

      # :call-seq:
      #   fragment() → Nokogiri::HTML5::DocumentFragment
      #   fragment(markup) → Nokogiri::HTML5::DocumentFragment
      #
      # Parse an HTML5 document fragment from +markup+, returning a Nokogiri::HTML5::DocumentFragment.
      #
      # [Properties]
      # - +markup+ (String) The HTML5 markup fragment to be parsed
      #
      # [Returns]
      #   Nokogiri::HTML5::DocumentFragment. This object's children will be empty if +markup+ is not
      #   passed, is empty, or is +nil+.
      #
      def fragment(markup = nil)
        # Hand this document and +markup+ (possibly +nil+) straight to DocumentFragment.new and
        # return whatever it builds.
        DocumentFragment.new(self, markup)
      end

      def to_xml(options = {}, &block) # :nodoc:
        # XML::Document#to_xml does not add XML::Node::SaveOptions::AS_XML the way
        # XML::Node#to_xml does, so fetch the XML::Node implementation explicitly and invoke it
        # on this document instead of going through normal method lookup.
        node_serializer = XML::Node.instance_method(:to_xml)
        node_serializer.bind_call(self, options, &block)
      end

      # :call-seq:
      #   xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
      #
      # [Returns] The document type which determines CSS-to-XPath translation.
      #
      # See CSS::XPathVisitor for more information.
      def xpath_doctype
        # Always the HTML5 doctype configuration; nothing about this document's state is consulted.
        Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML5
      end
    end
  end
end