# frozen_string_literal: true

require 'pathname'

module Nokogiri
  module HTML
    ###
    # An HTML document. Adds helpers for reading and writing the <meta>
    # encoding declaration and the <title> element, HTML-flavoured
    # serialization defaults, and the Document.parse entry point.
    class Document < Nokogiri::XML::Document
      ###
      # Get the meta tag encoding for this document. If there is no meta tag,
      # then nil is returned.
      #
      # An HTML5 style <meta charset="..."> takes precedence over a
      # <meta http-equiv="Content-Type" content="...; charset=...">.
      def meta_encoding
        if (meta = at('//meta[@charset]'))
          meta[:charset]
        elsif (meta = meta_content_type)
          meta['content'][/charset\s*=\s*([\w-]+)/i, 1]
        end
      end

      ###
      # Set the meta tag encoding for this document.
      #
      # If a meta encoding tag is already present, its content is
      # replaced with the given text.
      #
      # Otherwise, this method tries to create one at an appropriate
      # place supplying head and/or html elements as necessary, which
      # is inside a head element if any, and before any text node or
      # content element (typically <body>) if any.
      #
      # The result when trying to set an encoding that is different
      # from the document encoding is undefined.
      #
      # Beware in CRuby, that libxml2 automatically inserts a meta tag
      # into a head element.
      def meta_encoding=(encoding)
        if (meta = meta_content_type)
          meta['content'] = 'text/html; charset=%s' % encoding
          encoding
        elsif (meta = at('//meta[@charset]'))
          meta['charset'] = encoding
        else
          meta = XML::Node.new('meta', self)
          dtd = internal_subset
          if dtd && dtd.html5_dtd?
            # HTML5 documents get the short <meta charset="..."> form.
            meta['charset'] = encoding
          else
            meta['http-equiv'] = 'Content-Type'
            meta['content'] = 'text/html; charset=%s' % encoding
          end

          if (head = at('//head'))
            head.prepend_child(meta)
          else
            set_metadata_element(meta)
          end
          encoding
        end
      end

      ###
      # Find the first <meta> element that carries a non-empty content
      # attribute and whose http-equiv attribute is "Content-Type"
      # (compared case-insensitively). Returns nil when there is none.
      def meta_content_type
        xpath('//meta[@http-equiv and boolean(@content)]').find do |node|
          node['http-equiv'] =~ /\AContent-Type\z/i
        end
      end
      private :meta_content_type

      ###
      # Get the title string of this document. Return nil if there is
      # no title tag.
      def title
        at('//title')&.inner_text
      end

      ###
      # Set the title string of this document.
      #
      # If a title element is already present, its content is replaced
      # with the given text.
      #
      # Otherwise, this method tries to create one at an appropriate
      # place supplying head and/or html elements as necessary, which
      # is inside a head element if any, right after a meta
      # encoding/charset tag if any, and before any text node or
      # content element (typically <body>) if any.
      def title=(text)
        tnode = XML::Text.new(text, self)
        if (title = at('//title'))
          title.children = tnode
          return text
        end

        title = XML::Node.new('title', self) << tnode
        if (head = at('//head'))
          head << title
        elsif (meta = at('//meta[@charset]') || meta_content_type)
          # better put after charset declaration
          meta.add_next_sibling(title)
        else
          set_metadata_element(title)
        end
        text
      end

      ###
      # Insert +element+ (a metadata node such as <meta> or <title>) into
      # the document head, creating the <head> and/or <html> elements when
      # they are missing.
      def set_metadata_element(element)
        if (head = at('//head'))
          head << element
        elsif (html = at('//html'))
          head = html.prepend_child(XML::Node.new('head', self))
          head.prepend_child(element)
        elsif (first = children.find { |node| XML::Element === node || XML::Text === node })
          # We reach here only if the underlying document model
          # allows <html>/<head> elements to be omitted and does not
          # automatically supply them.
          first.add_previous_sibling(element)
        else
          html = add_child(XML::Node.new('html', self))
          head = html.add_child(XML::Node.new('head', self))
          head.prepend_child(element)
        end
      end
      private :set_metadata_element

      ####
      # Serialize Node using +options+. Save options can also be set using a
      # block. See SaveOptions.
      #
      # These two statements are equivalent:
      #
      #  node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
      #
      # or
      #
      #   node.serialize(:encoding => 'UTF-8') do |config|
      #     config.format.as_xml
      #   end
      #
      # When +options+ has no :save_with entry, it is set to
      # XML::Node::SaveOptions::DEFAULT_HTML in the hash that was passed in.
      def serialize(options = {})
        options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
        super
      end

      ####
      # Create a Nokogiri::XML::DocumentFragment from +tags+
      def fragment(tags = nil)
        DocumentFragment.new(self, tags, root)
      end

      class << self
        ###
        # Parse HTML. +string_or_io+ may be a String, or any object that
        # responds to _read_ and _close_ such as an IO, or StringIO.
        # +url+ is resource where this document is located. +encoding+ is the
        # encoding that should be used when processing the document. +options+
        # is a number that sets options in the parser, such as
        # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
        # Nokogiri::XML::ParseOptions.
        #
        # When a block is given, the ParseOptions object is yielded to it
        # before parsing so that options can be adjusted.
        #
        # A Pathname is opened by this method; the file handle it opens is
        # closed again before the method returns. IO objects supplied by the
        # caller are never closed here.
        def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
          options = Nokogiri::XML::ParseOptions.new(options) if Integer === options
          yield options if block_given?

          url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil

          # Use the encoding of the input itself as a default, unless it is
          # tagged as binary.
          if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != "ASCII-8BIT"
            encoding ||= string_or_io.encoding.name
          end

          if string_or_io.respond_to?(:read)
            opened_file = nil
            begin
              if string_or_io.is_a?(Pathname)
                # resolve the Pathname to the file and open it as an IO object, see #2110
                opened_file = string_or_io.expand_path.open
                string_or_io = opened_file
                url ||= string_or_io.path
              end

              unless encoding
                # Libxml2's parser has poor support for encoding
                # detection. First, it does not recognize the HTML5
                # style meta charset declaration. Secondly, even if it
                # successfully detects an encoding hint, it does not
                # re-decode or re-parse the preceding part which may be
                # garbled.
                #
                # EncodingReader aims to perform advanced encoding
                # detection beyond what Libxml2 does, and to emulate
                # rewinding of a stream and make Libxml2 redo parsing
                # from the start when an encoding hint is found.
                string_or_io = EncodingReader.new(string_or_io)
                begin
                  return read_io(string_or_io, url, encoding, options.to_i)
                rescue EncodingFound => e
                  encoding = e.found_encoding
                end
              end
              return read_io(string_or_io, url, encoding, options.to_i)
            ensure
              # Close only the handle opened above for a Pathname argument.
              opened_file.close if opened_file
            end
          end

          # read_memory pukes on empty docs
          if string_or_io.nil? || string_or_io.empty?
            return encoding ? new.tap { |doc| doc.encoding = encoding } : new
          end

          encoding ||= EncodingReader.detect_encoding(string_or_io)

          read_memory(string_or_io, url, encoding, options.to_i)
        end
      end

      # Raised by EncodingReader#read when an encoding declaration is found
      # in the first chunk of input; carries the detected encoding name.
      class EncodingFound < StandardError # :nodoc:
        attr_reader :found_encoding

        def initialize(encoding)
          @found_encoding = encoding
          super("encoding found: %s" % encoding)
        end
      end

      # IO wrapper that inspects the first chunk read from the underlying
      # IO for an encoding declaration before handing data to the parser.
      class EncodingReader # :nodoc:
        # SAX handler that records the encoding declared by <meta> elements.
        class SAXHandler < Nokogiri::XML::SAX::Document # :nodoc:
          attr_reader :encoding

          def initialize
            @encoding = nil
            super()
          end

          # Records the encoding from <meta charset="..."> or from
          # <meta http-equiv="Content-Type" content="...; charset=...">.
          # When both attributes are present on the same element, the
          # http-equiv/content value is applied last.
          def start_element(name, attrs = [])
            return unless name == 'meta'

            attr = Hash[attrs]
            charset = attr['charset']
            @encoding = charset if charset

            http_equiv = attr['http-equiv']
            return unless http_equiv && http_equiv.match(/\AContent-Type\z/i)

            content = attr['content']
            return unless content

            # "charset" is matched case-insensitively, in line with
            # Document#meta_encoding and the JRuby regexp in
            # EncodingReader.detect_encoding.
            m = content.match(/;\s*charset\s*=\s*([\w-]+)/i)
            @encoding = m[1] if m
          end
        end

        # SAXHandler variant that aborts parsing via +throw+ as soon as an
        # encoding is known, or once one of a few content elements is seen.
        class JumpSAXHandler < SAXHandler
          def initialize(jumptag)
            @jumptag = jumptag
            super()
          end

          def start_element(name, attrs = [])
            super
            throw @jumptag, @encoding if @encoding
            throw @jumptag, nil if name =~ /\A(?:div|h1|img|p|br)\z/
          end
        end

        # Try to determine the encoding declared inside +chunk+ (a String).
        # Returns the encoding name, or nil when none is found.
        def self.detect_encoding(chunk)
          # A leading XML declaration wins over any <meta> tag.
          if (m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/))
            return Nokogiri.XML(m[1]).encoding
          end

          if Nokogiri.jruby?
            if (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i))
              return m[4]
            end

            catch(:encoding_found) do
              Nokogiri::HTML::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
              nil
            end
          else
            handler = SAXHandler.new
            parser = Nokogiri::HTML::SAX::PushParser.new(handler)
            begin
              parser << chunk
            rescue StandardError
              # Detection is best-effort: an error while parsing a partial
              # chunk (e.g. Nokogiri::SyntaxError) is ignored and whatever
              # the handler has collected so far is used.
            end
            handler.encoding
          end
        end

        def initialize(io)
          @io = io
          @firstchunk = nil
          @encoding_found = nil
        end

        # This method is used by the C extension so that
        # Nokogiri::HTML::Document#read_io() does not leak memory when
        # EncodingFound is raised.
        attr_reader :encoding_found

        # Read up to +len+ bytes. On the very first call the chunk is
        # inspected for an encoding declaration; if one is found,
        # EncodingFound is raised and the chunk is kept so that the next
        # call returns it again. Returns nil when there is no more data.
        def read(len)
          # no support for a call without len

          unless @firstchunk
            @firstchunk = @io.read(len)
            return nil unless @firstchunk

            # This implementation expects that the first call from
            # htmlReadIO() is made with a length long enough (~1KB) to
            # achieve advanced encoding detection.
            if (encoding = EncodingReader.detect_encoding(@firstchunk))
              # The first chunk is stored for the next read in retry.
              raise @encoding_found = EncodingFound.new(encoding)
            end
          end
          @encoding_found = nil

          # Serve buffered data first, then top up from the underlying IO.
          ret = @firstchunk.slice!(0, len)
          if (len -= ret.length) > 0
            rest = @io.read(len)
            ret << rest if rest
          end

          ret.empty? ? nil : ret
        end
      end
    end
  end
end