#--
# Copyright (c) 2009 Ryan Grove <ryan@wonko.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the 'Software'), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#++

# Prepend this file's directory to the load path, then drop any duplicate
# entry left over if it was already there.
$:.unshift(File.dirname(File.expand_path(__FILE__)))
$:.uniq!

require 'rubygems'

gem 'hpricot', '~> 0.6'
gem 'htmlentities', '~> 4.0.0'

require 'hpricot'
require 'htmlentities'
require 'sanitize/config'
require 'sanitize/config/restricted'
require 'sanitize/config/basic'
require 'sanitize/config/relaxed'
require 'sanitize/monkeypatch/hpricot'

# Whitelist-based HTML sanitizer. Parses a string with Hpricot, removes every
# element, attribute and URL protocol that the active configuration does not
# allow, and re-encodes the text nodes as entities.
class Sanitize

  # Matches an attribute value that could be treated by a browser as a URL
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
  # or more characters followed by a colon is considered a match, even if the
  # colon is encoded as an entity and even if it's an incomplete entity (which
  # IE6 and Opera will still parse).
  #
  # The first capture group holds everything before the colon. Because
  # <tt>[^:]*</tt> also matches newlines, a match always starts at the very
  # beginning of the value.
  REGEX_PROTOCOL = /^([^:]*)(?:\:|&#0*58|&#x0*3a)/i

  #--
  # Class Methods
  #++

  # Returns a sanitized copy of _html_, using the settings in _config_ if
  # specified.
  def self.clean(html, config = {})
    new(config).clean(html)
  end

  # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
  # were made.
  def self.clean!(html, config = {})
    new(config).clean!(html)
  end

  #--
  # Instance Methods
  #++

  # Returns a new Sanitize object initialized with the settings in _config_,
  # merged over Config::DEFAULT.
  def initialize(config = {})
    @config = Config::DEFAULT.merge(config)
  end

  # Returns a sanitized copy of _html_. The argument itself is not modified.
  def clean(html)
    dupe = html.dup
    clean!(dupe) || dupe
  end

  # Performs clean in place, returning _html_, or +nil+ if no changes were
  # made.
  def clean!(html)
    fragment = Hpricot(html)

    fragment.search('*') do |node|
      # Nodes that are never allowed, whatever the configuration says.
      if node.bogusetag? || node.doctype? || node.procins? || node.xmldecl?
        node.parent.replace_child(node, '')
        next
      end

      if node.comment?
        node.parent.replace_child(node, '') unless @config[:allow_comments]
      elsif node.elem?
        name = node.name.to_s.downcase

        # Delete any element that isn't in the whitelist, keeping its children
        # in its place.
        unless @config[:elements].include?(name)
          node.parent.replace_child(node, node.children || '')
          next
        end

        sanitize_attributes(node, name)
      end
    end

    # Make one last pass through the fragment and encode all special HTML chars
    # and non-ASCII chars as entities. This eliminates certain types of
    # maliciously-malformed nested tags and also compensates for Hpricot's
    # burning desire to decode all entities.
    coder = HTMLEntities.new

    fragment.traverse_element do |node|
      node.swap(coder.encode(node.inner_text, :named)) if node.text?
    end

    result = fragment.to_s

    # String#replace overwrites the caller's string in place and returns that
    # same object, so the documented "returns _html_" contract holds.
    result == html ? nil : html.replace(result)
  end

  private

  # Applies the attribute whitelist, the protocol whitelist and the
  # +:add_attributes+ setting to the element _node_, whose downcased tag name
  # is _name_.
  def sanitize_attributes(node, name)
    if @config[:attributes].has_key?(name)
      allowed = @config[:attributes][name]

      # NOTE(review): defensive guard; it is not visible here whether Hpricot
      # can return nil for an element without attributes -- confirm.
      node.raw_attributes ||= {}

      # Delete any attribute that isn't in the whitelist for this element.
      node.raw_attributes.delete_if do |key, _value|
        !allowed.include?(key.to_s.downcase)
      end

      # Delete remaining attributes that use unacceptable protocols.
      if @config[:protocols].has_key?(name)
        protocols = @config[:protocols][name]

        node.raw_attributes.delete_if do |key, value|
          # Normalize the attribute name exactly like the whitelist check
          # above, so a mixed-case name cannot skip the protocol check.
          attribute = key.to_s.downcase
          next false unless protocols.has_key?(attribute)

          disallowed_protocol?(protocols[attribute], value)
        end
      end
    else
      # Delete all attributes from elements with no whitelisted attributes.
      node.raw_attributes = {}
    end

    # Add required attributes.
    if @config[:add_attributes].has_key?(name)
      node.raw_attributes ||= {}
      node.raw_attributes.merge!(@config[:add_attributes][name])
    end
  end

  # Returns +true+ if the attribute _value_ must be removed: it is +nil+, it
  # uses a protocol that is not listed in _allowed_, or it has no protocol and
  # _allowed_ does not include +:relative+.
  def disallowed_protocol?(allowed, value)
    return true if value.nil?

    if value.to_s.downcase =~ REGEX_PROTOCOL
      # The value was downcased before matching, so the capture already is.
      !allowed.include?(Regexp.last_match(1))
    else
      !allowed.include?(:relative)
    end
  end

end