module Nokogiri::HTML5

# Parse a fragment from +string+. Convenience method for
# {Nokogiri::HTML5::DocumentFragment.parse}.
# Build a DocumentFragment from +string+; thin wrapper that forwards
# straight to DocumentFragment.parse with the same arguments.
def fragment(string, encoding = nil, **options)
  Nokogiri::HTML5::DocumentFragment.parse(string, encoding, options)
end

# Fetch and parse a HTML document from the web, following redirects,
# handling https, and determining the character encoding using HTML5
# rules. +uri+ may be a +String+ or a +URI+. +options+ contains
# http headers and special options. Everything which is not a
# special option is considered a header. Special options include:
# * :follow_limit => number of redirects which are followed
# * :basic_auth => [username, password]
# Deprecated public entry point: emits a deprecation warning attributed to
# the caller (uplevel: 1), then delegates the real work to +get_impl+.
def get(uri, options = {})
  deprecation_message =
    "Nokogiri::HTML5.get is deprecated and will be removed in a future version of Nokogiri."
  warn(deprecation_message, uplevel: 1, category: :deprecated)
  get_impl(uri, options)
end

# Implementation behind Nokogiri::HTML5.get. :nodoc:
# Fetch +uri+ over HTTP(S) and parse the response body as an HTML5 document.
#
# +uri+ may be a String or a URI. +options+ is a mixed bag: recognized
# special keys (:follow_limit, :basic_auth, any Net::HTTP writer name) are
# consumed; everything left over is sent as a request header. On success the
# parsed document is returned with the Net::HTTP response attached as
# +doc.response+. Non-success, non-redirect responses raise via
# Net::HTTPResponse#value.
def get_impl(uri, options = {})
  # Work on a copy so the caller's options hash (reused on redirects) is
  # never mutated by the delete calls below.
  headers = options.clone
  headers = { follow_limit: headers } if Numeric === headers # deprecated
  # Redirect budget: explicit :follow_limit wins, otherwise 10 hops.
  limit = headers[:follow_limit] ? headers.delete(:follow_limit).to_i : 10
  require "net/http"
  uri = URI(uri) unless URI === uri
  http = Net::HTTP.new(uri.host, uri.port)
  # TLS / SSL support
  http.use_ssl = true if uri.scheme == "https"
  # Pass through Net::HTTP override values, which currently include:
  #   :ca_file, :ca_path, :cert, :cert_store, :ciphers,
  #   :close_on_empty_response, :continue_timeout, :key, :open_timeout,
  #   :read_timeout, :ssl_timeout, :ssl_version, :use_ssl,
  #   :verify_callback, :verify_depth, :verify_mode
  # NOTE: we iterate +options+ (the untouched original) while deleting from
  # +headers+, so the hash being iterated is never mutated mid-pass.
  options.each do |key, _value|
    http.send("#{key}=", headers.delete(key)) if http.respond_to?("#{key}=")
  end
  request = Net::HTTP::Get.new(uri.request_uri)
  # basic authentication
  auth = headers.delete(:basic_auth)
  # Fall back to credentials embedded in the URI (https://user:pass@host/).
  auth ||= [uri.user, uri.password] if uri.user && uri.password
  request.basic_auth(auth.first, auth.last) if auth
  # remaining options are treated as headers
  headers.each { |key, value| request[key.to_s] = value.to_s }
  response = http.request(request)
  case response
  when Net::HTTPSuccess
    # Sniff/normalize the body encoding before parsing, then expose the raw
    # response on the document via a dynamically added +response+ reader.
    doc = parse(reencode(response.body, response["content-type"]), options)
    doc.instance_variable_set(:@response, response)
    doc.class.send(:attr_reader, :response)
    doc
  when Net::HTTPRedirection
    # Out of redirect budget: #value raises Net::HTTPError for non-2xx.
    response.value if limit <= 1
    # Resolve possibly-relative Location against the current URI and recurse
    # with a decremented follow budget.
    location = URI.join(uri, response["location"])
    get_impl(location, options.merge(follow_limit: limit - 1))
  else
    # Error responses: #value raises the appropriate Net::HTTP exception.
    response.value
  end
end

# Parse an HTML 5 document. Convenience method for
# {Nokogiri::HTML5::Document.parse}.
# Parse +string+ as an HTML5 document; forwards all arguments (and any
# configuration block) to Document.parse unchanged.
def parse(string, url = nil, encoding = nil, **options, &block)
  Nokogiri::HTML5::Document.parse(string, url, encoding, **options, &block)
end

# :nodoc:
# Normalize +string+ (a String or an IO-like object responding to #read)
# into a UTF-8 String. When +encoding+ is given it names the encoding the
# input is assumed to be in; the result is re-encoded to UTF-8 if needed.
def read_and_encode(string, encoding)
  if string.respond_to?(:read)
    # IO-like input: read it, honoring an explicit encoding when supplied.
    string = encoding.nil? ? string.read : string.read(encoding: encoding)
  else
    # Plain object: stringify, then (on a copy) tag it with the caller's
    # declared encoding without transcoding the bytes.
    string = string.to_s
    string = string.dup.force_encoding(encoding) if encoding
  end
  # Anything not already UTF-8 goes through the charset-sniffing reencoder.
  string = reencode(string) unless string.encoding == Encoding::UTF_8
  string
end

# Charset sniffing is a complex and controversial topic that understandably isn't done _by
# default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
# consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
# the Gumbo parser *only* supports utf-8.
#
# Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
# this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
# the HTML5 standard.
#
# http://bugs.ruby-lang.org/issues/2567
# http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
# Re-encode +body+ to UTF-8. If +body+ arrives as raw bytes (ASCII-8BIT),
# its charset is sniffed in HTML5 priority order — byte-order mark, then the
# Content-Type header (+content_type+), then a <meta> tag within the first
# 1024 bytes — falling back to ISO-8859-1, HTML's official default.
def reencode(body, content_type = nil)
  if body.encoding == Encoding::ASCII_8BIT
    detected = nil
    # Step 1: byte-order mark.
    lead = body[0..2].bytes
    if lead[0..2] == [0xEF, 0xBB, 0xBF]
      detected = Encoding::UTF_8
    elsif lead[0..1] == [0xFE, 0xFF]
      detected = Encoding::UTF_16BE
    elsif lead[0..1] == [0xFF, 0xFE]
      detected = Encoding::UTF_16LE
    end
    # Step 2: charset parameter from the Content-Type header, if provided.
    detected ||= content_type && content_type[/charset=["']?(.*?)($|["';\s])/i, 1]
    # Step 3: <meta charset=...> within the first 1024 bytes, comments stripped.
    if detected.nil?
      head = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
      head.scan(/<meta.*?>/im).each do |tag|
        detected ||= tag[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
      end
    end
    # Step 4: the official default encoding for HTML.
    detected ||= Encoding::ISO_8859_1
    # Tag a copy of the bytes with the detected encoding; an unrecognized
    # charset name falls back to the HTML default.
    body = body.dup
    begin
      body.force_encoding(detected)
    rescue ArgumentError
      body.force_encoding(Encoding::ISO_8859_1)
    end
  end
  body.encode(Encoding::UTF_8)
end