gem.sh

lib/sanitize.rb

# encoding: utf-8
#--
# Copyright (c) 2013 Ryan Grove <ryan@wonko.com>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the 'Software'), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED 'AS IS', WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#++

require 'set'

require 'nokogiri'
require 'sanitize/version'
require 'sanitize/config'
require 'sanitize/config/restricted'
require 'sanitize/config/basic'
require 'sanitize/config/relaxed'
require 'sanitize/transformers/clean_cdata'
require 'sanitize/transformers/clean_comment'
require 'sanitize/transformers/clean_element'

# Whitelist-based HTML sanitizer built on top of Nokogiri.
#
# A Sanitize instance merges the caller's settings over Config::DEFAULT, builds
# two transformer chains (+:breadth+ and +:depth+) and runs them over every
# node of a parsed fragment or document.
class Sanitize
  # Raised by #clean! when the configured +:output+ format is not supported.
  class Error < StandardError; end

  # The effective configuration (Config::DEFAULT merged with the settings
  # passed to ::new).
  attr_reader :config

  # Matches an attribute value that could be treated by a browser as a URL
  # with a protocol prefix, such as "http:" or "javascript:". Any string of zero
  # or more characters followed by a colon is considered a match, even if the
  # colon is encoded as an entity and even if it's an incomplete entity (which
  # IE6 and Opera will still parse).
  REGEX_PROTOCOL = /\A([^\/]*?)(?:\:|&#0*58|&#x0*3a)/i

  #--
  # Class Methods
  #++

  # Returns a sanitized copy of _html_, using the settings in _config_ if
  # specified.
  def self.clean(html, config = {})
    Sanitize.new(config).clean(html)
  end

  # Performs Sanitize#clean in place, returning _html_, or +nil+ if no changes
  # were made.
  def self.clean!(html, config = {})
    Sanitize.new(config).clean!(html)
  end

  # Performs a Sanitize#clean using a full-document HTML parser instead of
  # the default fragment parser. This will add a DOCTYPE and html tag
  # unless they are already present.
  def self.clean_document(html, config = {})
    Sanitize.new(config).clean_document(html)
  end

  # Performs Sanitize#clean_document in place, returning _html_, or +nil+ if no
  # changes were made.
  def self.clean_document!(html, config = {})
    Sanitize.new(config).clean_document!(html)
  end

  # Sanitizes the specified Nokogiri::XML::Node and all its children.
  def self.clean_node!(node, config = {})
    Sanitize.new(config).clean_node!(node)
  end

  #--
  # Instance Methods
  #++

  # Returns a new Sanitize object initialized with the settings in _config_,
  # which are merged over Config::DEFAULT.
  def initialize(config = {})
    @config = Config::DEFAULT.merge(config)

    # Coerce first, copy second: Array() turns nil or a single transformer
    # into a list, and the resulting lists are always new objects so that the
    # chains built here never alias the arrays stored in the configuration.
    @transformers = {
      :breadth => Array(@config[:transformers_breadth]).dup,
      :depth   => Array(@config[:transformers]) + Array(@config[:transformers_depth])
    }

    # Default depth transformers. These always run at the end of the chain,
    # after any custom transformers.
    @transformers[:depth] << Transformers::CleanComment unless @config[:allow_comments]
    @transformers[:depth] << Transformers::CleanCDATA
    @transformers[:depth] << Transformers::CleanElement.new(@config)
  end

  # Returns a sanitized copy of _html_, or +nil+ if _html_ is +nil+ or
  # +false+. The argument itself is never modified.
  def clean(html)
    return nil unless html

    copy = html.dup
    clean!(copy) || copy
  end

  # Performs clean in place, returning _html_, or +nil+ if no changes were
  # made (which includes the case where _html_ is +nil+).
  #
  # _parser_ must respond to +parse+ and return a Nokogiri::XML::Node.
  #
  # Raises Sanitize::Error if the configured +:output+ is neither +:html+ nor
  # +:xhtml+.
  def clean!(html, parser = Nokogiri::HTML::DocumentFragment)
    # Nothing to sanitize and nothing to mutate.
    return nil if html.nil?

    fragment = parser.parse(html)
    clean_node!(fragment)

    save_options = {:encoding => @config[:output_encoding], :indent => 0}

    case @config[:output]
    when :xhtml
      save_options[:save_with] = Nokogiri::XML::Node::SaveOptions::AS_XHTML
      result = fragment.to_xhtml(save_options)
    when :html
      result = fragment.to_html(save_options)
    else
      raise Error, "unsupported output format: #{@config[:output]}"
    end

    return nil if result == html

    # String#replace overwrites the receiver's contents and returns the
    # receiver, so the caller gets back the very object it passed in.
    html.replace(result)
  end

  # Returns a sanitized copy of _html_ using the full-document parser (see
  # #clean_document!), or +nil+ if _html_ is +nil+. Like #clean, the result is
  # always a separate string, even when sanitizing changed nothing.
  def clean_document(html)
    return nil if html.nil?

    copy = html.dup
    clean_document!(copy) || copy
  end

  # Performs #clean! in place using Nokogiri::HTML::Document as the parser,
  # returning _html_, or +nil+ if no changes were made.
  #
  # Raises a RuntimeError unless 'html' is among the configured +:elements+ or
  # +:remove_contents+ is set.
  def clean_document!(html)
    # Without this guard Nokogiri will raise for having multiple root nodes
    # when it moves the html element's children to the root document context.
    if !@config[:elements].include?('html') && !@config[:remove_contents]
      raise 'You must have the HTML element whitelisted to call #clean_document unless remove_contents is set to true'
    end

    clean!(html, Nokogiri::HTML::Document)
  end

  # Sanitizes the specified Nokogiri::XML::Node and all its children, then
  # returns _node_.
  #
  # The +:breadth+ chain (if any) runs over the whole tree first, followed by
  # the +:depth+ chain. Both passes share one node whitelist, so a node
  # whitelisted during the first pass is still whitelisted in the second.
  #
  # Raises ArgumentError if _node_ is not a Nokogiri::XML::Node.
  def clean_node!(node)
    unless node.is_a?(Nokogiri::XML::Node)
      raise ArgumentError, 'node must be a Nokogiri::XML::Node'
    end

    node_whitelist = Set.new

    unless @transformers[:breadth].empty?
      traverse_breadth(node) {|n| transform_node!(n, node_whitelist, :breadth) }
    end

    traverse_depth(node) {|n| transform_node!(n, node_whitelist, :depth) }
    node
  end

  private

  # Runs every transformer registered for _mode_ (+:breadth+ or +:depth+)
  # against _node_, in order, and returns _node_.
  #
  # Each transformer is called with a Hash describing the node. If it returns
  # a Hash whose +:node_whitelist+ entry responds to +each+, those entries are
  # merged into _node_whitelist_ and are therefore visible to every
  # transformer that runs afterwards.
  def transform_node!(node, node_whitelist, mode)
    @transformers[mode].each do |transformer|
      # The environment is rebuilt for every transformer so that
      # :is_whitelisted reflects additions made by earlier transformers.
      env = {
        :config         => @config,
        :is_whitelisted => node_whitelist.include?(node),
        :node           => node,
        :node_name      => node.name.downcase,
        :node_whitelist => node_whitelist,
        :traversal_mode => mode
      }

      result = transformer.call(env)
      next unless result.is_a?(Hash)

      additions = result[:node_whitelist]
      node_whitelist.merge(additions) if additions.respond_to?(:each)
    end

    node
  end

  # Performs a top-down traversal, operating first on _node_ itself and then
  # on each of its children in turn.
  #
  # NOTE: despite the name this is a recursive pre-order walk, not a true
  # breadth-first one: each child's whole subtree is processed before the
  # next sibling is visited.
  def traverse_breadth(node, &block)
    block.call(node)
    node.children.each {|child| traverse_breadth(child, &block) }
  end

  # Performs depth-first traversal, operating first on the deepest nodes in the
  # document, then traversing upwards to the root.
  def traverse_depth(node, &block)
    node.children.each {|child| traverse_depth(child, &block) }
    block.call(node)
  end
end
Modules

Classes

lib/sanitize.rb

Source Files