# encoding: utf-8

require 'nokogumbo'
require 'set'

require_relative 'sanitize/version'
require_relative 'sanitize/config'
require_relative 'sanitize/config/default'
require_relative 'sanitize/config/restricted'
require_relative 'sanitize/config/basic'
require_relative 'sanitize/config/relaxed'
require_relative 'sanitize/css'
require_relative 'sanitize/transformers/clean_cdata'
require_relative 'sanitize/transformers/clean_comment'
require_relative 'sanitize/transformers/clean_css'
require_relative 'sanitize/transformers/clean_doctype'
require_relative 'sanitize/transformers/clean_element'

# Sanitizes HTML documents, fragments and Nokogiri nodes by parsing the input
# with `Nokogiri::HTML5`, running a chain of transformers over every node, and
# serializing the result back to an HTML string.
class Sanitize
  attr_reader :config

  # Matches an attribute value that could be treated by a browser as a URL
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
  # or more characters followed by a colon is considered a match, even if the
  # colon is encoded as an entity and even if it's an incomplete entity (which
  # IE6 and Opera will still parse).
  #
  # The entity alternatives cover the decimal (`&#58`) and hexadecimal
  # (`&#x3a`) forms with any number of leading zeros.
  REGEX_PROTOCOL = /\A([^\/#]*?)(?:\:|&#0*58|&#x0*3a)/i

  # Matches Unicode characters that should be stripped from HTML before passing
  # it to the parser.
  #
  # http://www.w3.org/TR/unicode-xml/#Charlist
  REGEX_UNSUITABLE_CHARS = /[\u0340\u0341\u17a3\u17d3\u2028\u2029\u202a-\u202e\u206a-\u206f\ufff9-\ufffb\ufeff\ufffc\u{1d173}-\u{1d17a}\u{e0000}-\u{e007f}]/u

  #--
  # Class Methods
  #++

  # Returns a sanitized copy of the given full _html_ document, using the
  # settings in _config_ if specified.
  #
  # When sanitizing a document, the `<html>` element must be whitelisted or an
  # error will be raised. If this is undesirable, you should probably use
  # {#fragment} instead.
  def self.document(html, config = {})
    Sanitize.new(config).document(html)
  end

  # Returns a sanitized copy of the given _html_ fragment, using the settings in
  # _config_ if specified.
  def self.fragment(html, config = {})
    Sanitize.new(config).fragment(html)
  end

  # Sanitizes the given `Nokogiri::XML::Node` instance and all its children.
  def self.node!(node, config = {})
    Sanitize.new(config).node!(node)
  end

  # Aliases for pre-3.0.0 backcompat.
  class << Sanitize
    # @deprecated Use {.document} instead.
    alias_method :clean_document, :document

    # @deprecated Use {.fragment} instead.
    alias_method :clean, :fragment

    # @deprecated Use {.node!} instead.
    alias_method :clean_node!, :node!
  end

  #--
  # Instance Methods
  #++

  # Returns a new Sanitize object initialized with the settings in _config_.
  #
  # _config_ is merged over `Config::DEFAULT`. Any custom transformers listed
  # under `:transformers` are placed first in the chain; the built-in
  # transformers are appended after them.
  def initialize(config = {})
    @config = Config.merge(Config::DEFAULT, config)

    # Work on a copy so appending the default transformers below never touches
    # the array stored in the config.
    @transformers = Array(@config[:transformers].dup)

    # Default transformers always run at the end of the chain, after any custom
    # transformers.
    @transformers << Transformers::CleanComment unless @config[:allow_comments]
    @transformers << Transformers::CleanDoctype unless @config[:allow_doctype]

    if @config[:elements].include?('style')
      scss = Sanitize::CSS.new(config)
      @transformers << Transformers::CSS::CleanElement.new(scss)
    end

    if @config[:attributes].values.any? { |attr| attr.include?('style') }
      # Reuse the CSS sanitizer created above when there is one.
      scss ||= Sanitize::CSS.new(config)
      @transformers << Transformers::CSS::CleanAttribute.new(scss)
    end

    @transformers <<
      Transformers::CleanCDATA <<
      Transformers::CleanElement.new(@config)
  end

  # Returns a sanitized copy of the given _html_ document. Returns an empty
  # string when _html_ is `nil` or `false`.
  #
  # When sanitizing a document, the `<html>` element must be whitelisted or an
  # error will be raised. If this is undesirable, you should probably use
  # {#fragment} instead.
  def document(html)
    return '' unless html

    doc = Nokogiri::HTML5.parse(preprocess(html))
    node!(doc)
    to_html(doc)
  end

  # @deprecated Use {#document} instead.
  alias_method :clean_document, :document

  # Returns a sanitized copy of the given _html_ fragment. Returns an empty
  # string when _html_ is `nil` or `false`.
  def fragment(html)
    return '' unless html

    html = preprocess(html)
    doc  = Nokogiri::HTML5.parse("<html><body>#{html}")

    # Hack to allow fragments containing <body>. Borrowed from
    # Nokogiri::HTML::DocumentFragment.
    if html =~ /\A<body(?:\s|>)/i
      path = '/html/body'
    else
      path = '/html/body/node()'
    end

    frag = doc.fragment
    doc.xpath(path).each { |node| frag << node }

    node!(frag)
    to_html(frag)
  end

  # @deprecated Use {#fragment} instead.
  alias_method :clean, :fragment

  # Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it
  # in place. Returns _node_.
  #
  # If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
  # whitelisted or an error will be raised.
  #
  # Raises `ArgumentError` when _node_ is not a `Nokogiri::XML::Node`, and
  # `Sanitize::Error` when a document is given but `html` is not in the
  # configured `:elements`.
  def node!(node)
    raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)

    if node.is_a?(Nokogiri::XML::Document)
      unless @config[:elements].include?('html')
        raise Error, 'When sanitizing a document, "<html>" must be whitelisted.'
      end
    end

    # Shared across the whole traversal so a transformer can whitelist nodes
    # that are visited later.
    node_whitelist = Set.new

    traverse(node) do |n|
      transform_node!(n, node_whitelist)
    end

    node
  end

  # @deprecated Use {#node!} instead.
  alias_method :clean_node!, :node!

  private

  # Preprocesses HTML before parsing to remove undesirable Unicode chars.
  #
  # Returns a new UTF-8 string; the argument itself is never modified.
  def preprocess(html)
    # Coerce to a String and work on a private copy: the bang methods below
    # mutate their receiver, which must not be the caller's (possibly frozen)
    # object.
    html = html.to_s.dup

    unless html.encoding.name == 'UTF-8'
      html.encode!('UTF-8',
        :invalid => :replace,
        :undef   => :replace)
    end

    html.gsub!(REGEX_UNSUITABLE_CHARS, '')
    html
  end

  # Serializes _node_ to an HTML string without added formatting. For document
  # nodes, the Content-Type meta tag inserted by the serializer is stripped
  # when appropriate (see below).
  def to_html(node)
    replace_meta = false

    # Hacky workaround for a libxml2 bug that adds an undesired Content-Type
    # meta tag to all serialized HTML documents.
    #
    # https://github.com/sparklemotion/nokogiri/issues/1008
    if node.type == Nokogiri::XML::Node::DOCUMENT_NODE ||
        node.type == Nokogiri::XML::Node::HTML_DOCUMENT_NODE

      regex_meta = %r|(<html[^>]*>\s*<head[^>]*>\s*)<meta http-equiv="Content-Type" content="text/html; charset=utf-8">|i

      # Only replace the content-type meta tag if <meta> isn't whitelisted or
      # the original document didn't actually include a content-type meta tag.
      replace_meta = !@config[:elements].include?('meta') ||
        node.xpath('/html/head/meta[@http-equiv]').none? { |meta|
          meta['http-equiv'].downcase == 'content-type'
        }
    end

    so = Nokogiri::XML::Node::SaveOptions

    # Serialize to HTML without any formatting to prevent Nokogiri from adding
    # newlines after certain tags.
    html = node.to_html(
      :encoding  => 'utf-8',
      :indent    => 0,
      :save_with => so::NO_DECLARATION | so::NO_EMPTY_TAGS | so::AS_HTML
    )

    html.gsub!(regex_meta, '\1') if replace_meta
    html
  end

  # Runs every transformer in the chain against _node_, in order. Each
  # transformer is called with an environment hash; when it returns a Hash
  # whose `:node_whitelist` entry is enumerable, those nodes are added to
  # _node_whitelist_. Returns _node_.
  def transform_node!(node, node_whitelist)
    @transformers.each do |transformer|
      result = transformer.call(
        :config         => @config,
        :is_whitelisted => node_whitelist.include?(node),
        :node           => node,
        :node_name      => node.name.downcase,
        :node_whitelist => node_whitelist
      )

      if result.is_a?(Hash) && result[:node_whitelist].respond_to?(:each)
        node_whitelist.merge(result[:node_whitelist])
      end
    end

    node
  end

  # Performs top-down traversal of the given node, operating first on the node
  # itself, then traversing each child (if any) in order.
  def traverse(node, &block)
    block.call(node)

    child = node.child

    while child do
      # Remember the previous sibling before descending: the block may unlink
      # or move `child`, after which its own sibling links can't be trusted.
      prev = child.previous_sibling
      traverse(child, &block)

      if child.parent != node
        # The child was unlinked or reparented, so traverse the previous node's
        # next sibling, or the parent's first child if there is no previous
        # node.
        child = prev ? prev.next_sibling : node.child
      else
        child = child.next_sibling
      end
    end
  end

  # Raised by {#node!} when a document is sanitized without `<html>` being
  # whitelisted.
  class Error < StandardError; end
end