class Nokogiri::HTML::Document::EncodingReader
# :nodoc:
# Best-effort lookup of the character encoding declared inside +chunk+.
#
# Checks, in order:
# 1. a leading XML declaration, whose encoding is obtained by handing just
#    that declaration to Nokogiri.XML;
# 2. on JRuby only, a <meta ... charset=...> tag matched by regexp;
# 3. otherwise, a SAX parse of the chunk using SAXHandler.
#
# chunk:: the text to inspect; it must respond to +match+.
#
# Returns whatever encoding the matching step yields, or +nil+ when a
# StandardError is raised anywhere along the way.
def self.detect_encoding(chunk)
  xml_decl = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)
  return Nokogiri.XML(xml_decl[1]).encoding if xml_decl

  if Nokogiri.jruby?
    meta_tag = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
    return meta_tag[4] if meta_tag
  end

  sax_handler = SAXHandler.new
  sax_parser = Nokogiri::HTML::SAX::Parser.new(sax_handler)
  # SAXHandler is defined elsewhere; presumably it throws :found as soon as
  # it has seen an encoding, which ends the parse early -- verify there.
  catch(:found) do
    sax_parser.parse(chunk)
  end
  sax_handler.encoding
rescue
  # Detection is best-effort: any StandardError means "no encoding found".
  nil
end
# Wraps +io+ so that its first chunk can be inspected before being served.
#
# io:: the underlying source; +read+ calls <tt>io.read(len)</tt> on it.
def initialize(io)
  # No chunk has been read yet; the first call to +read+ fills this in.
  @firstchunk = nil
  @io = io
end
# Reads up to +len+ bytes from the wrapped IO, sniffing the first chunk for
# an encoding declaration before any data is handed out.
#
# On the first call the chunk read from the IO is passed to
# EncodingReader.detect_encoding. If an encoding is reported,
# EncodingFoundException is raised and the chunk stays buffered, so a later
# call to +read+ on this same reader serves it again from the start.
# Otherwise the chunk is served through the normal path below and is
# consumed as it is returned; previously it was returned while still
# buffered, so the following call returned the same bytes a second time.
#
# len:: number of bytes wanted; calling without +len+ is not supported.
#
# Returns a String of at most +len+ bytes, or +nil+ when nothing is left.
# Raises EncodingFoundException (with the encoding as its message) when the
# first chunk declares an encoding.
def read(len)
  unless @firstchunk
    @firstchunk = @io.read(len)
    return nil unless @firstchunk

    # This implementation expects and assumes that the first call from
    # htmlReadIO() is made with a length long enough (~1KB) to achieve
    # further encoding detection that libxml2 does not do.
    encoding = EncodingReader.detect_encoding(@firstchunk)
    raise EncodingFoundException, encoding if encoding
  end

  # Drain the buffered first chunk before touching the underlying IO again.
  ret = @firstchunk.slice!(0, len)
  remaining = len - ret.length
  if remaining > 0
    rest = @io.read(remaining)
    ret << rest if rest
  end

  ret.empty? ? nil : ret
end