# encoding: utf-8
#--
# Copyright (c) 2010 Ryan Grove <ryan@wonko.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the 'Software'), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#++
require 'nokogiri'
require 'sanitize/version'
require 'sanitize/config'
require 'sanitize/config/restricted'
require 'sanitize/config/basic'
require 'sanitize/config/relaxed'
class Sanitize
attr_reader :config
# Matches an attribute value that could be treated by a browser as a URL
# with a protocol prefix, such as "http:" or "javascript:". Any run of zero or
# more protocol-like characters followed by a colon is considered a match, even
# if the colon is encoded as an entity and even if it's an incomplete entity
# (which IE6 and Opera will still parse).
#
# Capture group 1 holds the protocol candidate (everything before the colon).
# The colon may be a literal ":", a decimal character reference ("&#58", with
# any number of leading zeros) or a hexadecimal one ("&#x3a", likewise). The
# reference's trailing ";" is deliberately not required.
REGEX_PROTOCOL = /^([A-Za-z0-9\+\-\.\&\;\#\s]*?)(?:\:|&#0*58|&#x0*3a)/i
#--
# Class Methods
#++
# Convenience wrapper: builds a Sanitize instance from _config_ (an empty Hash
# when omitted) and returns whatever its #clean returns for _html_.
def self.clean(html, config = {})
  Sanitize.new(config).clean(html)
end
# Convenience wrapper: builds a Sanitize instance from _config_ (an empty Hash
# when omitted) and returns whatever its #clean! returns for _html_, i.e. the
# in-place variant of Sanitize.clean.
def self.clean!(html, config = {})
  Sanitize.new(config).clean!(html)
end
# Convenience wrapper: builds a Sanitize instance from _config_ (an empty Hash
# when omitted) and hands _node_ to its #clean_node!, returning that method's
# result.
def self.clean_node!(node, config = {})
  Sanitize.new(config).clean_node!(node)
end
#--
# Instance Methods
#++
# Returns a new Sanitize object initialized with the settings in _config_,
# which are merged over Config::DEFAULT.
def initialize(config = {})
  @config = Config::DEFAULT.merge(config)

  # Work on a copy so the caller's transformer list is not shared, and accept
  # a single transformer as well as an Array of them.
  @config[:transformers] = Array(@config[:transformers].dup)

  # Element lists become Hashes so membership checks are simple key lookups.
  @allowed_elements = {}
  @config[:elements].each do |element_name|
    @allowed_elements[element_name] = true
  end

  @whitespace_elements = {}
  @config[:whitespace_elements].each do |element_name|
    @whitespace_elements[element_name] = true
  end

  # :remove_contents is either an Array of element names (contents removed
  # only for those elements) or a truthy/falsy switch covering every element.
  @remove_element_contents = {}
  contents_setting = @config[:remove_contents]

  if contents_setting.is_a?(Array)
    @remove_all_contents = false
    contents_setting.each do |element_name|
      @remove_element_contents[element_name] = true
    end
  else
    @remove_all_contents = contents_setting ? true : false
  end

  # Specific nodes to whitelist (along with all their attributes). Filled in
  # at runtime by transformers; #clean_node! empties it before and after each
  # traversal, so it only ever applies to one fragment.
  @whitelist_nodes = []
end
# Returns a sanitized copy of _html_, leaving _html_ itself untouched. Returns
# +nil+ when _html_ is +nil+ or +false+.
def clean(html)
  return nil unless html

  copy    = html.dup
  cleaned = clean!(copy)

  # clean! yields nil when nothing changed; the untouched copy is the answer.
  cleaned ? cleaned : copy
end
# Performs clean in place: parses _html_ as a document fragment, sanitizes it
# with #clean_node!, serializes it according to the :output setting and, if
# the serialized text differs from _html_, overwrites the contents of _html_
# with it. Returns the sanitized text, or +nil+ if no changes were made.
#
# Raises Sanitize::Error when :output is neither :xhtml nor :html.
def clean!(html)
  fragment = Nokogiri::HTML::DocumentFragment.parse(html)
  clean_node!(fragment)

  options = {:encoding => @config[:output_encoding], :indent => 0}

  serialized =
    if @config[:output] == :xhtml
      options[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
      fragment.to_xhtml(options)
    elsif @config[:output] == :html
      fragment.to_html(options)
    else
      raise Error, "unsupported output format: #{@config[:output]}"
    end

  return nil if serialized == html

  # Replace the whole of the caller's string so the change is made in place.
  html[0, html.length] = serialized
end
# Sanitizes the specified Nokogiri::XML::Node and all its children, then
# returns _node_. Raises ArgumentError if _node_ is not a Nokogiri::XML::Node.
#
# Elements (and text nodes, when :process_text_nodes is set) go through
# #clean_element!; comments are removed unless :allow_comments is set; CDATA
# sections are replaced by plain text nodes carrying the same text.
def clean_node!(node)
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)

  # The dynamic whitelist only applies to a single traversal.
  @whitelist_nodes = []

  node.traverse do |descendant|
    needs_cleaning = descendant.element? ||
                     (descendant.text? && @config[:process_text_nodes])

    if needs_cleaning
      clean_element!(descendant)
    elsif descendant.comment?
      descendant.unlink unless @config[:allow_comments]
    elsif descendant.cdata?
      text_node = Nokogiri::XML::Text.new(descendant.text, descendant.document)
      descendant.replace(text_node)
    end
  end

  @whitelist_nodes = []
  node
end
private
# Sanitizes a single node in place.
#
# The node is first passed through #transform_element!. A node that ends up
# in the dynamic whitelist is left completely alone. A node that is neither
# whitelisted by a transformer nor listed in the allowed elements is unlinked
# (its children are kept or dropped according to the :remove_contents
# setting). Otherwise its attributes are filtered against the attribute and
# protocol whitelists and any configured :add_attributes are set.
#
# Returns the transformer result Hash when the node was kept and filtered,
# +nil+ when it was skipped or removed.
def clean_element!(node)
  transform = transform_element!(node)

  # Nodes whitelisted at runtime by transformers keep every attribute.
  return if @whitelist_nodes.include?(node)

  name = node.name.to_s.downcase

  if !transform[:whitelist] && !@allowed_elements[name]
    # Elements like br, div, p, etc. are replaced with whitespace so the
    # surrounding text stays readable.
    if @whitespace_elements[name]
      node.add_previous_sibling(' ')
      node.add_next_sibling(' ') unless node.children.empty?
    end

    # Hoist the children in front of the node unless its contents are to be
    # discarded along with it.
    keep_contents = !(@remove_all_contents || @remove_element_contents[name])
    if keep_contents
      node.children.each { |child| node.add_previous_sibling(child) }
    end

    node.unlink
    return
  end

  # Transformer-supplied names, then per-element names, then the :all names.
  allowed_attrs  = transform[:attr_whitelist]
  allowed_attrs += @config[:attributes][name] || []
  allowed_attrs += @config[:attributes][:all] || []
  allowed_attrs  = allowed_attrs.uniq

  if allowed_attrs.empty?
    # Nothing is whitelisted for this element: strip every attribute.
    node.attribute_nodes.each { |attribute| attribute.remove }
  else
    # First pass: drop attributes whose name isn't whitelisted.
    node.attribute_nodes.each do |attribute|
      attribute.unlink unless allowed_attrs.include?(attribute.name.downcase)
    end

    # Second pass: drop surviving attributes that use an unacceptable protocol.
    if @config[:protocols].has_key?(name)
      rules = @config[:protocols][name]

      node.attribute_nodes.each do |attribute|
        attr_name = attribute.name.downcase
        next unless rules.has_key?(attr_name)

        # A value with no protocol prefix is checked as :relative.
        match     = REGEX_PROTOCOL.match(attribute.value.to_s.downcase)
        candidate = match ? match[1].downcase : :relative

        attribute.unlink unless rules[attr_name].include?(candidate)
      end
    end
  end

  # Add required attributes.
  if @config[:add_attributes].has_key?(name)
    @config[:add_attributes][name].each { |key, value| node[key] = value }
  end

  transform
end
# Runs _node_ through every configured transformer, in order, and collects
# what they ask for.
#
# Each transformer is called with a Hash containing :allowed_elements,
# :config, :node, :node_name (the node's name, downcased) and
# :whitelist_nodes, and must return +nil+ (no opinion) or a Hash that may
# contain:
#
# :whitelist_nodes:: Array of nodes added to the dynamic whitelist.
# :attr_whitelist::  Array of extra attribute names to allow on this node.
# :whitelist::       Truthy to allow this node regardless of its name.
# :node::            A Nokogiri::XML::Node to use in place of the current
#                    one; later transformers receive the replacement.
#
# If a replacement was supplied, _node_ is replaced by it in the document.
# Returns a Hash with :attr_whitelist, :node and :whitelist. Raises
# Sanitize::Error when a transformer returns anything other than a Hash or nil.
def transform_element!(node)
  attr_whitelist = []
  whitelisted    = false
  current        = node

  @config[:transformers].each do |transformer|
    result = transformer.call({
      :allowed_elements => @allowed_elements,
      :config           => @config,
      :node             => current,
      :node_name        => current.name.downcase,
      :whitelist_nodes  => @whitelist_nodes
    })

    next if result.nil?

    unless result.is_a?(Hash)
      raise Error, "transformer output must be a Hash or nil"
    end

    extra_nodes = result[:whitelist_nodes]
    if extra_nodes.is_a?(Array)
      @whitelist_nodes = (@whitelist_nodes + extra_nodes).uniq
    end

    extra_attrs = result[:attr_whitelist]
    attr_whitelist += extra_attrs if extra_attrs.is_a?(Array)

    whitelisted = true if result[:whitelist]

    # Anything that isn't a real node is ignored and the current one stays.
    replacement = result[:node]
    current = replacement if replacement.is_a?(Nokogiri::XML::Node)
  end

  node.replace(current) if node != current

  {:attr_whitelist => attr_whitelist, :node => current, :whitelist => whitelisted}
end
# Raised by Sanitize for an unsupported :output format and for a transformer
# that returns something other than a Hash or nil.
class Error < StandardError
end
end