# encoding: utf-8
#--
# Copyright (c) 2010 Ryan Grove <ryan@wonko.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the 'Software'), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#++

require 'nokogiri'
require 'sanitize/version'
require 'sanitize/config'
require 'sanitize/config/restricted'
require 'sanitize/config/basic'
require 'sanitize/config/relaxed'

class Sanitize
  attr_reader :config

  # Matches an attribute value that could be treated by a browser as a URL
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
  # or more characters followed by a colon is considered a match, even if the
  # colon is encoded as an entity and even if it's an incomplete entity (which
  # IE6 and Opera will still parse).
  #
  # The pattern is anchored with \A rather than ^ so that only the very start
  # of the value is inspected; ^ would also match at the start of any later
  # line of a multi-line value. The encoded-colon alternatives are the decimal
  # (&#58) and hexadecimal (&#x3a) character references, each with optional
  # leading zeros and an optional (i.e. unmatched) trailing semicolon.
  REGEX_PROTOCOL = /\A([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&#0*58|&#x0*3a)/i

  #--
  # Class Methods
  #++

  # Returns a sanitized copy of _html_, using the settings in _config_ if
  # specified.
  def self.clean(html, config = {})
    Sanitize.new(config).clean(html)
  end

  # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
  # were made.
  def self.clean!(html, config = {})
    Sanitize.new(config).clean!(html)
  end

  # Sanitizes the specified Nokogiri::XML::Node and all its children.
  def self.clean_node!(node, config = {})
    Sanitize.new(config).clean_node!(node)
  end

  #--
  # Instance Methods
  #++

  # Returns a new Sanitize object initialized with the settings in _config_.
  # Keys in _config_ override the corresponding keys of Config::DEFAULT.
  def initialize(config = {})
    # Sanitize configuration.
    @config = Config::DEFAULT.merge(config)

    # Coerce first, then copy: this accepts nil, a single callable or an Array,
    # and never shares the caller's Array with this instance.
    @config[:transformers] = Array(@config[:transformers]).dup

    # Convert the list of allowed elements to a Hash for faster lookup.
    @allowed_elements = {}
    @config[:elements].each { |el| @allowed_elements[el] = true }

    # Convert the list of :remove_contents elements to a Hash for faster lookup.
    # A non-Array value is treated as a flag applying to every removed element.
    @remove_all_contents     = false
    @remove_element_contents = {}

    if @config[:remove_contents].is_a?(Array)
      @config[:remove_contents].each { |el| @remove_element_contents[el] = true }
    else
      @remove_all_contents = !!@config[:remove_contents]
    end

    # Specific nodes to whitelist (along with all their attributes). This array
    # is generated at runtime by transformers, and is cleared before and after
    # a fragment is cleaned (so it applies only to a specific fragment).
    @whitelist_nodes = []
  end

  # Returns a sanitized copy of _html_, or +nil+ if _html_ is +nil+ or +false+.
  def clean(html)
    return unless html

    dupe = html.dup
    clean!(dupe) || dupe
  end

  # Performs clean in place, returning _html_, or +nil+ if no changes were
  # made.
  #
  # Raises Sanitize::Error if the configured :output format is neither :xhtml
  # nor :html.
  def clean!(html)
    fragment = Nokogiri::HTML::DocumentFragment.parse(html)
    clean_node!(fragment)

    output_method_params = {:encoding => @config[:output_encoding], :indent => 0}

    case @config[:output]
    when :xhtml
      output_method = fragment.method(:to_xhtml)
      output_method_params[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
    when :html
      output_method = fragment.method(:to_html)
    else
      raise Error, "unsupported output format: #{@config[:output]}"
    end

    result = output_method.call(output_method_params)
    return nil if result == html

    # Overwrite the caller's String in place with the sanitized markup.
    html[0, html.length] = result
  end

  # Sanitizes the specified Nokogiri::XML::Node and all its children, then
  # returns the node.
  #
  # Raises ArgumentError if _node_ is not a Nokogiri::XML::Node.
  def clean_node!(node)
    raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)

    @whitelist_nodes = []

    node.traverse do |child|
      if child.element? || (child.text? && @config[:process_text_nodes])
        clean_element!(child)
      elsif child.comment?
        child.unlink unless @config[:allow_comments]
      elsif child.cdata?
        # Replace CDATA sections with plain text nodes holding the same text.
        child.replace(Nokogiri::XML::Text.new(child.text, child.document))
      end
    end

    @whitelist_nodes = []

    node
  end

  private

  # Applies the transformers and then the element, attribute and protocol
  # whitelists to a single node. Returns the transformer output Hash, or +nil+
  # when the node was dynamically whitelisted or removed.
  def clean_element!(node)
    # Run this node through all configured transformers.
    transform = transform_element!(node)

    # If this node is in the dynamic whitelist array (built at runtime by
    # transformers), let it live with all of its attributes intact.
    return if @whitelist_nodes.include?(node)

    name = node.name.to_s.downcase

    # Delete any element that isn't in the whitelist.
    unless transform[:whitelist] || @allowed_elements[name]
      # Unless contents are to be removed too, hoist the children so that they
      # take the place of the element being dropped.
      unless @remove_all_contents || @remove_element_contents[name]
        node.children.each { |n| node.add_previous_sibling(n) }
      end

      node.unlink
      return
    end

    attr_whitelist = (transform[:attr_whitelist] +
        (@config[:attributes][name] || []) +
        (@config[:attributes][:all] || [])).uniq

    if attr_whitelist.empty?
      # Delete all attributes from elements with no whitelisted attributes.
      node.attribute_nodes.each { |attr| attr.remove }
    else
      # Delete any attribute that isn't in the whitelist for this element.
      node.attribute_nodes.each do |attr|
        attr.unlink unless attr_whitelist.include?(attr.name.downcase)
      end

      # Delete remaining attributes that use unacceptable protocols.
      if @config[:protocols].has_key?(name)
        protocol = @config[:protocols][name]

        node.attribute_nodes.each do |attr|
          attr_name = attr.name.downcase
          next unless protocol.has_key?(attr_name)

          # The value is lowercased before matching, so the captured protocol
          # is already lowercase when compared against the allowed list.
          match = REGEX_PROTOCOL.match(attr.value.to_s.downcase)
          allowed = match ? match[1] : :relative

          attr.unlink unless protocol[attr_name].include?(allowed)
        end
      end
    end

    # Add required attributes.
    if @config[:add_attributes].has_key?(name)
      @config[:add_attributes][name].each do |key, val|
        node[key] = val
      end
    end

    transform
  end

  # Runs _node_ through every configured transformer in order and merges their
  # results into a single Hash with the keys :attr_whitelist, :node and
  # :whitelist. Each transformer receives the node produced by the previous
  # one. If the final node differs from _node_, _node_ is replaced by it.
  #
  # Raises Sanitize::Error if a transformer returns anything other than a Hash
  # or +nil+.
  def transform_element!(node)
    output = {
      :attr_whitelist => [],
      :node           => node,
      :whitelist      => false
    }

    @config[:transformers].inject(node) do |transformer_node, transformer|
      transform = transformer.call({
        :allowed_elements => @allowed_elements,
        :config           => @config,
        :node             => transformer_node,
        :node_name        => transformer_node.name.downcase,
        :whitelist_nodes  => @whitelist_nodes
      })

      if transform.nil?
        # No opinion from this transformer; pass the same node along.
        transformer_node
      elsif transform.is_a?(Hash)
        if transform[:whitelist_nodes].is_a?(Array)
          @whitelist_nodes += transform[:whitelist_nodes]
          @whitelist_nodes.uniq!
        end

        if transform[:attr_whitelist].is_a?(Array)
          output[:attr_whitelist] += transform[:attr_whitelist]
        end

        output[:whitelist] = true if transform[:whitelist]

        # The value of this assignment is what the next transformer receives.
        output[:node] = transform[:node] if transform[:node].is_a?(Nokogiri::XML::Node)
        output[:node]
      else
        raise Error, "transformer output must be a Hash or nil"
      end
    end

    node.replace(output[:node]) if node != output[:node]

    output
  end

  class Error < StandardError; end
end