module Nokogiri::HTML5
def fragment(...)
# Convenience entry point for parsing an HTML5 fragment.
#
# Every argument (positional, keyword, and block) is forwarded unchanged to
# DocumentFragment.parse, and whatever that call returns is returned here.
def fragment(...)
  DocumentFragment.parse(...)
end
def parse(...)
# Convenience entry point for parsing a complete HTML5 document.
#
# Every argument (positional, keyword, and block) is forwarded unchanged to
# Document.parse, and whatever that call returns is returned here.
def parse(...)
  Document.parse(...)
end
def read_and_encode(string, encoding)
# Turn +string+ into UTF-8 text.
#
# string::   either an object responding to +read+ (its content is read), or
#            any other object (its +to_s+ form is used).
# encoding:: encoding to read/tag the content with; +nil+ means "leave the
#            content's encoding as it is".
#
# Returns the content as-is when it is already tagged UTF-8, otherwise the
# result of passing it through +reencode+. The caller's object is never
# re-tagged in place: a copy is made before forcing an encoding.
def read_and_encode(string, encoding)
  text =
    if string.respond_to?(:read)
      # Readable source: pass the encoding along only when one was given.
      encoding.nil? ? string.read : string.read(encoding: encoding)
    elsif encoding
      # Plain value with a declared encoding: tag a copy, not the original.
      string.to_s.dup.force_encoding(encoding)
    else
      string.to_s
    end

  # Already UTF-8: nothing to convert.
  return text if text.encoding == Encoding::UTF_8

  reencode(text)
end
def reencode(body, content_type = nil)
http://www.w3.org/TR/html5/syntax.html#determining-the-character-encoding
http://bugs.ruby-lang.org/issues/2567
the HTML5 standard.
this lead, Nokogiri::HTML5 attempts to do likewise, while attempting to more closely follow
Accordingly, Nokogiri::HTML4::Document.parse provides limited encoding detection. Following
the Gumbo parser *only* supports utf-8.
consumers of HTML as the default for HTML is iso-8859-1, most "good" producers use utf-8, and
default_ by the Ruby Net::HTTP library. This being said, it is a very real problem for
Charset sniffing is a complex and controversial topic that understandably isn't done _by
# Transcode +body+ to UTF-8, sniffing the source encoding when it is unknown.
#
# body::         the HTML text. When it is not tagged ASCII-8BIT (binary), its
#                current encoding is trusted and it is simply transcoded.
# content_type:: optional Content-Type header value that may carry a
#                +charset=+ parameter.
#
# For binary input the source encoding is chosen from, in order of priority:
# 1. a Byte Order Mark (UTF-8, UTF-16BE, UTF-16LE),
# 2. the charset named in +content_type+,
# 3. the charset of the first <meta> tag that declares one within the first
#    1024 bytes (HTML comments are stripped before searching),
# 4. ISO-8859-1.
#
# A charset name Ruby rejects (ArgumentError) also falls back to ISO-8859-1.
# The caller's string is not modified; a copy is re-tagged instead.
def reencode(body, content_type = nil)
  # Non-binary input already declares its encoding; just transcode it.
  return body.encode(Encoding::UTF_8) unless body.encoding == Encoding::ASCII_8BIT

  # 1. Byte Order Mark
  leading = body[0, 3].bytes
  detected =
    if leading == [0xEF, 0xBB, 0xBF]
      Encoding::UTF_8
    elsif leading.first(2) == [0xFE, 0xFF]
      Encoding::UTF_16BE
    elsif leading.first(2) == [0xFF, 0xFE]
      Encoding::UTF_16LE
    end

  # 2. charset parameter of the Content-Type header
  detected ||= content_type[/charset=["']?(.*?)($|["';\s])/i, 1] if content_type

  # 3. charset of a <meta> tag near the top of the document
  unless detected
    head = body[0..1023].gsub(/<!--.*?(-->|\Z)/m, "")
    head.scan(/<meta.*?>/im).each do |tag|
      detected = tag[/charset=["']?([^>]*?)($|["'\s>])/im, 1]
      break if detected
    end
  end

  # 4. the default encoding for HTML
  detected ||= Encoding::ISO_8859_1

  # Re-tag a copy; an unrecognized charset name makes force_encoding raise.
  tagged = body.dup
  begin
    tagged.force_encoding(detected)
  rescue ArgumentError
    tagged.force_encoding(Encoding::ISO_8859_1)
  end

  tagged.encode(Encoding::UTF_8)
end