class Nokogiri::HTML4::Document::EncodingReader
# :nodoc:
# Best-effort detection of the character encoding declared in +chunk+, the
# leading portion of a document.
#
# Detection order:
# 1. If +chunk+ starts with an XML declaration, that declaration alone is
#    handed to Nokogiri.XML and the resulting document's encoding is
#    returned.
# 2. On JRuby, a <meta ... charset=...> tag is matched with a regexp and the
#    charset token is returned; failing that, the chunk is run through a SAX
#    parser inside a +catch(:encoding_found)+ block.
# 3. Otherwise the chunk is pushed through a push parser and whatever the
#    handler recorded as its encoding is returned.
#
# chunk:: the text to inspect; it must respond to +match+.
#
# Returns the detected encoding, or nil when the JRuby SAX pass finishes
# without throwing. The non-JRuby branch returns +handler.encoding+ as is.
def self.detect_encoding(chunk)
  # An XML declaration at the very start of the chunk takes precedence.
  xml_declaration = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)
  return Nokogiri.XML(xml_declaration[1]).encoding if xml_declaration

  if Nokogiri.jruby?
    meta_tag = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)
    return meta_tag[4] if meta_tag

    # JumpSAXHandler is defined elsewhere; presumably it throws
    # :encoding_found with the encoding once it sees one -- confirm there.
    # When nothing is thrown the block's own value (nil) is the result.
    catch(:encoding_found) do
      Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
      nil
    end
  else
    handler = SAXHandler.new
    parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
    begin
      parser << chunk
    rescue StandardError
      # Deliberately best-effort: this code used to read
      # `parser << chunk rescue Nokogiri::SyntaxError`, and a rescue
      # modifier swallows every StandardError (not only syntax errors).
      # That behaviour is kept; whatever the handler recorded before the
      # failure is still reported below.
    end
    handler.encoding
  end
end
# Wraps +io+ so that the first chunk read from it can be inspected for an
# encoding declaration before being handed on.
#
# io:: the underlying source; only its +read(len)+ method is used by this
#      class.
def initialize(io)
  @io = io
  # First chunk pulled from @io; stays nil until the first successful read.
  @firstchunk = nil
  # Holds the EncodingFound exception most recently raised by #read, if any.
  @encoding_found = nil
end
# Reads up to +len+ units from the wrapped IO, running encoding detection on
# the very first chunk.
#
# len:: number of characters/bytes to read. A call without a length is not
#       supported.
#
# Returns the data read, or nil when nothing is available.
#
# Raises EncodingFound when EncodingReader.detect_encoding reports an
# encoding for the first chunk. The chunk is kept in @firstchunk, so the
# next call serves it again instead of reading from the IO.
def read(len)
  unless @firstchunk
    @firstchunk = @io.read(len)
    return nil unless @firstchunk

    # This implementation expects that the first call from
    # htmlReadIO() is made with a length long enough (~1KB) to
    # achieve advanced encoding detection.
    encoding = EncodingReader.detect_encoding(@firstchunk)
    if encoding
      # The first chunk is stored for the next read in retry.
      @encoding_found = EncodingFound.new(encoding)
      raise @encoding_found
    end
  end
  @encoding_found = nil

  # Serve buffered data first, then top up from the IO if the caller asked
  # for more than the buffer still held.
  ret = @firstchunk.slice!(0, len)
  remaining = len - ret.length
  if remaining > 0
    rest = @io.read(remaining)
    ret << rest if rest
  end

  ret.empty? ? nil : ret
end