# coding: utf-8
# frozen_string_literal: true

require "pathname"

module Nokogiri
  module HTML4
    class Document < Nokogiri::XML::Document
      ###
      # Get the meta tag encoding for this document. If there is no meta tag,
      # then nil is returned.
      #
      # A <meta charset="..."> element takes precedence over a
      # <meta http-equiv="Content-Type" content="..."> element.
      def meta_encoding
        if (meta = at_xpath("//meta[@charset]"))
          meta[:charset]
        elsif (meta = meta_content_type)
          meta["content"][/charset\s*=\s*([\w-]+)/i, 1]
        end
      end

      ###
      # Set the meta tag encoding for this document.
      #
      # If a meta encoding tag is already present, its content is
      # replaced with the given text.
      #
      # Otherwise, this method tries to create one at an appropriate
      # place supplying head and/or html elements as necessary, which
      # is inside a head element if any, and before any text node or
      # content element (typically <body>) if any.
      #
      # The result when trying to set an encoding that is different
      # from the document encoding is undefined.
      #
      # Beware in CRuby, that libxml2 automatically inserts a meta tag
      # into a head element.
      def meta_encoding=(encoding)
        if (meta = meta_content_type)
          meta["content"] = format("text/html; charset=%s", encoding)
          encoding
        elsif (meta = at_xpath("//meta[@charset]"))
          meta["charset"] = encoding
        else
          meta = XML::Node.new("meta", self)
          # Use the short <meta charset> form only for documents with an HTML5 doctype.
          if (dtd = internal_subset) && dtd.html5_dtd?
            meta["charset"] = encoding
          else
            meta["http-equiv"] = "Content-Type"
            meta["content"] = format("text/html; charset=%s", encoding)
          end

          if (head = at_xpath("//head"))
            head.prepend_child(meta)
          else
            set_metadata_element(meta)
          end
          encoding
        end
      end

      # Find the first <meta> element that has a non-empty "content" attribute
      # and whose "http-equiv" attribute is "Content-Type" (case-insensitive).
      # Returns nil when there is no such element.
      def meta_content_type
        xpath("//meta[@http-equiv and boolean(@content)]").find do |node|
          /\AContent-Type\z/i.match?(node["http-equiv"])
        end
      end
      private :meta_content_type

      ###
      # Get the title string of this document. Return nil if there is
      # no title tag.
      def title
        (title = at_xpath("//title")) && title.inner_text
      end

      ###
      # Set the title string of this document.
      #
      # If a title element is already present, its content is replaced
      # with the given text.
      #
      # Otherwise, this method tries to create one at an appropriate
      # place supplying head and/or html elements as necessary, which
      # is inside a head element if any, right after a meta
      # encoding/charset tag if any, and before any text node or
      # content element (typically <body>) if any.
      def title=(text)
        tnode = XML::Text.new(text, self)
        if (title = at_xpath("//title"))
          title.children = tnode
          return text
        end

        title = XML::Node.new("title", self) << tnode
        if (head = at_xpath("//head"))
          head << title
        elsif (meta = (at_xpath("//meta[@charset]") || meta_content_type))
          # better put after charset declaration
          meta.add_next_sibling(title)
        else
          set_metadata_element(title)
        end
      end

      # Insert +element+ (a metadata element such as <meta> or <title>) into
      # the document, creating <head> and/or <html> elements when they are
      # missing.
      def set_metadata_element(element) # rubocop:disable Naming/AccessorMethodName
        if (head = at_xpath("//head"))
          head << element
        elsif (html = at_xpath("//html"))
          head = html.prepend_child(XML::Node.new("head", self))
          head.prepend_child(element)
        elsif (first = children.find do |node|
                 case node
                 when XML::Element, XML::Text
                   true
                 end
               end)
          # We reach here only if the underlying document model
          # allows <html>/<head> elements to be omitted and does not
          # automatically supply them.
          first.add_previous_sibling(element)
        else
          html = add_child(XML::Node.new("html", self))
          head = html.add_child(XML::Node.new("head", self))
          head.prepend_child(element)
        end
      end
      private :set_metadata_element

      ####
      # Serialize Node using +options+. Save options can also be set using a
      # block. See SaveOptions.
      #
      # These two statements are equivalent:
      #
      #  node.serialize(:encoding => 'UTF-8', :save_with => FORMAT | AS_XML)
      #
      # or
      #
      #   node.serialize(:encoding => 'UTF-8') do |config|
      #     config.format.as_xml
      #   end
      #
      # When +options+ has no truthy :save_with entry,
      # XML::Node::SaveOptions::DEFAULT_HTML is used. The hash passed by the
      # caller is not modified by this method.
      def serialize(options = {})
        # Work on a copy so the caller's (possibly frozen) hash is left alone.
        # The bare `super` below forwards the reassigned local.
        options = options.dup
        options[:save_with] ||= XML::Node::SaveOptions::DEFAULT_HTML
        super
      end

      ####
      # Create a Nokogiri::XML::DocumentFragment from +tags+
      def fragment(tags = nil)
        DocumentFragment.new(self, tags, root)
      end

      # :call-seq:
      #   xpath_doctype() → Nokogiri::CSS::XPathVisitor::DoctypeConfig
      #
      # [Returns] The document type which determines CSS-to-XPath translation.
      #
      # See XPathVisitor for more information.
      def xpath_doctype
        Nokogiri::CSS::XPathVisitor::DoctypeConfig::HTML4
      end

      class << self
        ###
        # Parse HTML. +string_or_io+ may be a String, or any object that
        # responds to _read_ and _close_ such as an IO, or StringIO.
        # +url+ is resource where this document is located. +encoding+ is the
        # encoding that should be used when processing the document. +options+
        # is a number that sets options in the parser, such as
        # Nokogiri::XML::ParseOptions::RECOVER. See the constants in
        # Nokogiri::XML::ParseOptions.
        #
        # If a block is given, it is yielded the ParseOptions object before
        # parsing so that options can be adjusted.
        #
        # A Pathname is opened for reading and the resulting file is closed
        # again once parsing has finished. IO objects supplied by the caller
        # are never closed by this method.
        def parse(string_or_io, url = nil, encoding = nil, options = XML::ParseOptions::DEFAULT_HTML)
          options = Nokogiri::XML::ParseOptions.new(options) if options.is_a?(Integer)
          yield options if block_given?

          url ||= string_or_io.respond_to?(:path) ? string_or_io.path : nil

          # Binary-tagged input carries no usable encoding hint.
          if string_or_io.respond_to?(:encoding) && string_or_io.encoding.name != "ASCII-8BIT"
            encoding ||= string_or_io.encoding.name
          end

          if string_or_io.respond_to?(:read)
            opened_file = nil
            if string_or_io.is_a?(Pathname)
              # resolve the Pathname to the file and open it as an IO object, see #2110
              opened_file = string_or_io.expand_path.open
              string_or_io = opened_file
              url ||= string_or_io.path
            end

            begin
              unless encoding
                # Libxml2's parser has poor support for encoding
                # detection. First, it does not recognize the HTML5
                # style meta charset declaration. Secondly, even if it
                # successfully detects an encoding hint, it does not
                # re-decode or re-parse the preceding part which may be
                # garbled.
                #
                # EncodingReader aims to perform advanced encoding
                # detection beyond what Libxml2 does, and to emulate
                # rewinding of a stream and make Libxml2 redo parsing
                # from the start when an encoding hint is found.
                string_or_io = EncodingReader.new(string_or_io)
                begin
                  return read_io(string_or_io, url, encoding, options.to_i)
                rescue EncodingFound => e
                  encoding = e.found_encoding
                end
              end
              return read_io(string_or_io, url, encoding, options.to_i)
            ensure
              # Only close what this method opened itself.
              opened_file&.close
            end
          end

          # read_memory pukes on empty docs
          if string_or_io.nil? || string_or_io.empty?
            return encoding ? new.tap { |i| i.encoding = encoding } : new
          end

          encoding ||= EncodingReader.detect_encoding(string_or_io)

          read_memory(string_or_io, url, encoding, options.to_i)
        end
      end

      # Raised by EncodingReader#read when an encoding declaration is detected
      # in the first chunk read; carries the detected encoding name so that
      # the caller can restart parsing with it.
      class EncodingFound < StandardError # :nodoc: all
        attr_reader :found_encoding

        def initialize(encoding)
          @found_encoding = encoding
          super(format("encoding found: %s", encoding))
        end
      end

      # :nodoc: all
      #
      # IO wrapper that inspects the first chunk read from the underlying IO
      # for an encoding declaration and raises EncodingFound when one is seen.
      class EncodingReader
        # SAX handler that records the encoding declared by <meta> elements.
        class SAXHandler < Nokogiri::XML::SAX::Document
          attr_reader :encoding

          def initialize
            @encoding = nil
            super()
          end

          # Record the encoding from <meta charset="..."> or from
          # <meta http-equiv="Content-Type" content="...; charset=...">.
          # When a single element carries both, the http-equiv value wins.
          def start_element(name, attrs = [])
            return unless name == "meta"

            attr = attrs.to_h
            if (charset = attr["charset"])
              @encoding = charset
            end

            http_equiv = attr["http-equiv"]
            content = attr["content"]
            return unless http_equiv && content && /\AContent-Type\z/i.match?(http_equiv)

            # "charset" is matched case-insensitively, consistent with
            # Document#meta_encoding.
            if (m = content.match(/;\s*charset\s*=\s*([\w-]+)/i))
              @encoding = m[1]
            end
          end
        end

        # SAX handler that aborts parsing via +throw+ as soon as an encoding
        # has been found, or once a content element is reached.
        class JumpSAXHandler < SAXHandler
          def initialize(jumptag)
            @jumptag = jumptag
            super()
          end

          def start_element(name, attrs = [])
            super
            throw(@jumptag, @encoding) if @encoding
            # Stop looking once one of these content elements starts.
            throw(@jumptag, nil) if /\A(?:div|h1|img|p|br)\z/.match?(name)
          end
        end

        # Detect the encoding declared in +chunk+ (a String holding the start
        # of a document). Returns the encoding name, or nil if none is found.
        def self.detect_encoding(chunk)
          # A leading XML declaration takes precedence over any <meta> tag.
          (m = chunk.match(/\A(<\?xml[ \t\r\n]+[^>]*>)/)) &&
            (return Nokogiri.XML(m[1]).encoding)

          if Nokogiri.jruby?
            (m = chunk.match(/(<meta\s)(.*)(charset\s*=\s*([\w-]+))(.*)/i)) &&
              (return m[4])

            catch(:encoding_found) do
              Nokogiri::HTML4::SAX::Parser.new(JumpSAXHandler.new(:encoding_found)).parse(chunk)
              nil
            end
          else
            handler = SAXHandler.new
            parser = Nokogiri::HTML4::SAX::PushParser.new(handler)
            begin
              parser << chunk
            rescue Nokogiri::SyntaxError
              # Syntax errors are ignored here; whatever encoding the handler
              # has recorded so far is still returned below.
            end
            handler.encoding
          end
        end

        def initialize(io)
          @io = io
          @firstchunk = nil
          @encoding_found = nil
        end

        # This method is used by the C extension so that
        # Nokogiri::HTML4::Document#read_io() does not leak memory when
        # EncodingFound is raised.
        attr_reader :encoding_found

        # Read up to +len+ bytes. On the very first call the chunk read is
        # scanned for an encoding declaration; if one is found, EncodingFound
        # is raised and the chunk is kept so that the next call replays it.
        # Returns nil when no more data is available.
        def read(len)
          # no support for a call without len
          unless @firstchunk
            (@firstchunk = @io.read(len)) || (return nil)

            # This implementation expects that the first call from
            # htmlReadIO() is made with a length long enough (~1KB) to
            # achieve advanced encoding detection.
            if (encoding = EncodingReader.detect_encoding(@firstchunk))
              # The first chunk is stored for the next read in retry.
              raise @encoding_found = EncodingFound.new(encoding)
            end
          end
          @encoding_found = nil

          # Serve from the buffered first chunk, then top up from the IO.
          ret = @firstchunk.slice!(0, len)
          if (len -= ret.length) > 0
            (rest = @io.read(len)) && ret << (rest)
          end
          if ret.empty?
            nil
          else
            ret
          end
        end
      end
    end
  end
end