class Sanitize::Transformers::CleanElement

def call(env)

# Cleans a single element node in place according to the configured
# allowlists.
#
# Reads `env[:node]`, `env[:node_name]` and `env[:is_allowlisted]`. Nodes
# that are not element nodes, or that are flagged as allowlisted, are left
# untouched. Otherwise:
#
# * an element that is not in `@elements` is unlinked (optionally keeping its
#   children and inserting the configured whitespace around it);
# * an allowed element has its attributes filtered against the attribute
#   allowlist and the protocol allowlist, gets the configured
#   `@add_attributes` applied, and is then run through the element-specific
#   special cases for `iframe`, `meta` and `noscript`.
#
# The method is called for its side effects on the node; callers should not
# rely on its return value.
def call(env)
  node = env[:node]
  return if node.type != Nokogiri::XML::Node::ELEMENT_NODE || env[:is_allowlisted]
  name = env[:node_name]
  # Delete any element that isn't in the config allowlist, unless the node
  # has already been deleted from the document.
  #
  # It's important that we not try to reparent the children of a node that
  # has already been deleted, since that seems to trigger a memory leak in
  # Nokogiri.
  unless @elements.include?(name) || node.parent.nil?
    # Elements like br, div, p, etc. need to be replaced with whitespace
    # in order to preserve readability.
    if @whitespace_elements.include?(name)
      node.add_previous_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:before].to_s, node.document))
      unless node.children.empty?
        node.add_next_sibling(Nokogiri::XML::Text.new(@whitespace_elements[name][:after].to_s, node.document))
      end
    end
    # Hoist the children into the parent unless the config asks for the
    # contents of this element (or of every removed element) to be dropped.
    unless node.children.empty?
      unless @remove_all_contents || @remove_element_contents.include?(name)
        node.add_previous_sibling(node.children)
      end
    end
    node.unlink
    return
  end
  # Per-element allowlist if one exists, otherwise the global `:all` list.
  attr_allowlist = @attributes[name] || @attributes[:all]
  if attr_allowlist.nil?
    # Delete all attributes from elements with no allowlisted attributes.
    node.attribute_nodes.each { |attr| attr.unlink }
  else
    allow_data_attributes = attr_allowlist.include?(:data)
    # Delete any attribute that isn't allowed on this element.
    node.attribute_nodes.each do |attr|
      attr_name = attr.name.downcase
      unless attr_allowlist.include?(attr_name)
        # The attribute isn't in the allowlist, but may still be allowed
        # if it's a data attribute.
        unless allow_data_attributes && attr_name.start_with?("data-") && attr_name =~ REGEX_DATA_ATTR
          # Either the attribute isn't a data attribute or arbitrary data
          # attributes aren't allowed. Remove the attribute.
          attr.unlink
          next
        end
      end
      # The attribute is allowed.
      # Remove any attributes that use unacceptable protocols.
      if @protocols.include?(name) && @protocols[name].include?(attr_name)
        attr_protocols = @protocols[name][attr_name]
        if attr.value =~ REGEX_PROTOCOL
          unless attr_protocols.include?($1.downcase)
            attr.unlink
            next
          end
        else
          # No protocol prefix was found, so the value is only acceptable
          # when relative URLs are explicitly allowed.
          unless attr_protocols.include?(:relative)
            attr.unlink
            next
          end
        end
        # Leading and trailing whitespace around URLs is ignored at parse
        # time. Stripping it here prevents it from being escaped by the
        # libxml2 workaround below.
        attr.value = attr.value.strip
      end
      # libxml2 >= 2.9.2 doesn't escape comments within some attributes,
      # in an attempt to preserve server-side includes. This can result in
      # XSS since an unescaped double quote can allow an attacker to
      # inject a non-allowlisted attribute.
      #
      # Sanitize works around this by implementing its own escaping for
      # affected attributes, some of which can exist on any element and
      # some of which can only exist on `<a>` elements.
      #
      # This fix is technically no longer necessary with Nokogumbo >= 2.0
      # since it no longer uses libxml2's serializer, but it's retained to
      # avoid breaking use cases where people might be sanitizing
      # individual Nokogiri nodes and then serializing them manually
      # without Nokogumbo.
      #
      # The relevant libxml2 code is here:
      # <https://github.com/GNOME/libxml2/commit/960f0e275616cadc29671a218d7fb9b69eb35588>
      if UNSAFE_LIBXML_ATTRS_GLOBAL.include?(attr_name) ||
          (name == "a" && UNSAFE_LIBXML_ATTRS_A.include?(attr_name))
        attr.value = attr.value.gsub(UNSAFE_LIBXML_ESCAPE_REGEX, UNSAFE_LIBXML_ESCAPE_CHARS)
      end
    end
  end
  # Add required attributes.
  if @add_attributes.include?(name)
    @add_attributes[name].each { |key, val| node[key] = val }
  end
  # Element-specific special cases.
  case name
  # If this is an allowlisted iframe that has children, remove all its
  # children. The HTML standard says iframes shouldn't have content, but
  # when they do, this content is parsed as text and is serialized
  # verbatim without being escaped, which is unsafe because legacy
  # browsers may still render it and execute `<script>` content. So the
  # safe and correct thing to do is to always remove iframe content.
  when "iframe"
    if !node.children.empty?
      node.children.each do |child|
        child.unlink
      end
    end
  # Prevent the use of `<meta>` elements that set a charset other than
  # UTF-8, since Sanitize's output is always UTF-8.
  when "meta"
    if node.has_attribute?("charset") &&
        node["charset"].downcase != "utf-8"
      node["charset"] = "utf-8"
    end
    if node.has_attribute?("http-equiv") &&
        node.has_attribute?("content") &&
        node["http-equiv"].downcase == "content-type" &&
        node["content"].downcase =~ /;\s*charset\s*=\s*(?!utf-8)/
      # The check above runs against a downcased copy, so the rewrite must
      # be case-insensitive as well; otherwise a value such as
      # "text/html; CHARSET=latin1" would be detected but left unchanged.
      node["content"] = node["content"].gsub(/;\s*charset\s*=.+\z/i, ";charset=utf-8")
    end
  # A `<noscript>` element's content is parsed differently in browsers
  # depending on whether or not scripting is enabled. Since Nokogiri
  # doesn't support scripting, it always parses `<noscript>` elements as
  # if scripting is disabled. This results in edge cases where it's not
  # possible to reliably sanitize the contents of a `<noscript>` element
  # because Nokogiri can't fully replicate the parsing behavior of a
  # scripting-enabled browser. The safest thing to do is to simply remove
  # all `<noscript>` elements.
  when "noscript"
    node.unlink
  end
end

def initialize(config)

# Builds the transformer's working state from a config hash.
#
# Reads the :add_attributes, :attributes, :elements, :protocols,
# :whitespace_elements and :remove_contents entries of +config+. The
# attributes hash is shallow-copied before its per-element entries are
# replaced, so the hash passed in by the caller keeps its original values.
def initialize(config)
  @add_attributes = config[:add_attributes]
  @attributes = config[:attributes].dup
  @elements = config[:elements]
  @protocols = config[:protocols]

  # :remove_contents is either a collection of element names (contents of
  # just those elements are dropped) or a flag applying to every element.
  contents_option = config[:remove_contents]
  selective = contents_option.is_a?(Enumerable)
  @remove_all_contents = selective ? false : !!contents_option
  @remove_element_contents = selective ? Set.new(contents_option.map(&:to_s)) : Set.new

  # Backcompat: a Set of whitespace elements is expanded into a hash that
  # pads each element with a single space on both sides. Any other value
  # is stored as given.
  spacing = config[:whitespace_elements]
  @whitespace_elements =
    if spacing.is_a?(Set)
      spacing.each_with_object({}) do |element, table|
        table[element] = {before: " ", after: " "}
      end
    else
      spacing
    end

  # Fold the global :all attribute list into every per-element list so a
  # single lookup by element name is sufficient later on.
  @attributes.each do |element_name, attrs|
    next if element_name == :all

    @attributes[element_name] = Set.new(attrs).merge(@attributes[:all] || [])
  end
end

Modules

Classes

class Sanitize::Transformers::CleanElement

def call(env)

def initialize(config)

Namespace

Instance Methods

Defined in