class Sanitize

def self.document(html, config = {})

Returns a sanitized copy of the given full _html_ document, using the
settings in _config_ if specified.

When sanitizing a document, the `<html>` element must be allowlisted or an
error will be raised. If this is undesirable, you should probably use
{#fragment} instead.

# Convenience class-level entry point: builds a one-off Sanitize instance
# from _config_ and returns the result of calling #document on it with
# _html_.
#
# The instance-level #node! raises an error for documents when "html" is not
# in the configured elements, so that element must be allowlisted here too.
def self.document(html, config = {})
  Sanitize.new(config).document(html)
end

def self.fragment(html, config = {})

Returns a sanitized copy of the given _html_ fragment, using the settings in
_config_ if specified.

# Convenience class-level entry point: builds a one-off Sanitize instance
# from _config_ and returns the result of calling #fragment on it with
# _html_.
def self.fragment(html, config = {})
  Sanitize.new(config).fragment(html)
end

def self.node!(node, config = {})

Sanitizes the given `Nokogiri::XML::Node` instance and all its children.

# Convenience class-level entry point: builds a one-off Sanitize instance
# from _config_ and calls #node! on it with _node_, which sanitizes the given
# `Nokogiri::XML::Node` and all its children.
def self.node!(node, config = {})
  Sanitize.new(config).node!(node)
end

def document(html)

Returns a sanitized copy of the given _html_ document.

When sanitizing a document, the `<html>` element must be allowlisted or an
error will be raised. If this is undesirable, you should probably use
{#fragment} instead.

# Parses _html_ as a complete HTML5 document, sanitizes the parsed tree in
# place, and returns the serialized result.
#
# Returns an empty string when _html_ is nil or false. The input is run
# through #preprocess before parsing, and `@config[:parser_options]` is
# forwarded to the parser as keyword arguments.
def document(html)
  return '' unless html

  cleaned_source = preprocess(html)
  parsed_document = Nokogiri::HTML5.parse(cleaned_source, **@config[:parser_options])

  node!(parsed_document)
  to_html(parsed_document)
end

def fragment(html)

Returns a sanitized copy of the given _html_ fragment.

# Parses _html_ as an HTML5 fragment, sanitizes the parsed tree in place, and
# returns the serialized result.
#
# Returns an empty string when _html_ is nil or false. The input is run
# through #preprocess before parsing, and `@config[:parser_options]` is
# forwarded to the parser as keyword arguments.
def fragment(html)
  return '' unless html

  cleaned_source = preprocess(html)
  parsed_fragment = Nokogiri::HTML5.fragment(cleaned_source, **@config[:parser_options])

  node!(parsed_fragment)
  to_html(parsed_fragment)
end

def initialize(config = {})

Returns a new Sanitize object initialized with the settings in _config_.

# Builds a sanitizer whose settings are _config_ merged over
# `Config::DEFAULT`.
#
# The transformer chain is assembled in this order:
#
# 1. any custom transformers from the merged config's `:transformers`
# 2. `Transformers::CleanElement`
# 3. `Transformers::CleanComment` (skipped when `:allow_comments` is set)
# 4. `Transformers::CSS::CleanElement` (only if 'style' is an allowed element)
# 5. `Transformers::CSS::CleanAttribute` (only if some attribute list
#    includes 'style')
# 6. `Transformers::CleanDoctype`
# 7. `Transformers::CleanCDATA`
def initialize(config = {})
  @config = Config.merge(Config::DEFAULT, config)

  # Work on a copy so the appends below never modify the array held in the
  # merged config.
  @transformers = Array(@config[:transformers]).dup

  # Built-in transformers always run after any custom transformers.
  @transformers.push(Transformers::CleanElement.new(@config))
  @transformers.push(Transformers::CleanComment) unless @config[:allow_comments]

  # At most one CSS sanitizer is created; both CSS transformers share it.
  css_sanitizer = nil

  if @config[:elements].include?('style')
    css_sanitizer = Sanitize::CSS.new(config)
    @transformers.push(Transformers::CSS::CleanElement.new(css_sanitizer))
  end

  style_attribute_allowed =
    @config[:attributes].values.any? { |names| names.include?('style') }

  if style_attribute_allowed
    css_sanitizer ||= Sanitize::CSS.new(config)
    @transformers.push(Transformers::CSS::CleanAttribute.new(css_sanitizer))
  end

  @transformers.push(Transformers::CleanDoctype, Transformers::CleanCDATA)

  # Reused (and mutated) by #transform_node! for every transformer call.
  @transformer_config = { config: @config }
end

def node!(node)

Sanitizes the given `Nokogiri::XML::Node` and all its children, modifying it
in place.

If _node_ is a `Nokogiri::XML::Document`, the `<html>` element must be
allowlisted or an error will be raised.

# Sanitizes _node_ and every node beneath it in place, then returns _node_.
#
# Raises ArgumentError when _node_ is not a `Nokogiri::XML::Node`. Raises
# Error when _node_ is a `Nokogiri::XML::Document` and 'html' is not among
# the configured elements.
def node!(node)
  raise ArgumentError unless node.is_a?(Nokogiri::XML::Node)

  document_without_html =
    node.is_a?(Nokogiri::XML::Document) && !@config[:elements].include?('html')

  if document_without_html
    raise Error, 'When sanitizing a document, "<html>" must be allowlisted.'
  end

  # Shared across the whole traversal so that a transformer can allowlist
  # nodes that are visited later.
  allowed_nodes = Set.new

  traverse(node) { |current| transform_node!(current, allowed_nodes) }

  node
end

def preprocess(html)

Preprocesses HTML before parsing to remove undesirable Unicode chars.

# Prepares _html_ for parsing and returns the prepared text as a new string;
# the argument itself is never modified.
#
# The input is converted with #to_s, transcoded to UTF-8 when it is in any
# other encoding (invalid and undefined sequences are replaced rather than
# raising), and stripped of everything matching REGEX_UNSUITABLE_CHARS.
def preprocess(html)
  text = html.to_s.dup

  if text.encoding != Encoding::UTF_8
    text.encode!('UTF-8', invalid: :replace, undef: :replace)
  end

  # gsub! returns nil when nothing matched, so return the string explicitly.
  text.gsub!(REGEX_UNSUITABLE_CHARS, '')
  text
end

def to_html(node)

# Serializes _node_ by calling its own #to_html with `preserve_newline: true`
# and returns whatever that call returns.
def to_html(node)
  node.to_html(preserve_newline: true)
end

def transform_node!(node, node_allowlist)

# Runs every transformer in `@transformers`, in order, against _node_ and
# returns _node_.
#
# Each transformer is called with keyword arguments taken from the shared
# `@transformer_config` hash, which is refreshed before every call with:
#
# - `:node` and `:node_name` (the node's name, downcased)
# - `:node_allowlist` / `:node_whitelist` (the same _node_allowlist_ object)
# - `:is_allowlisted` / `:is_whitelisted` (whether _node_ is currently in
#   _node_allowlist_)
#
# When a transformer returns a Hash whose `:node_allowlist` (or, failing
# that, `:node_whitelist`) value responds to `each`, those entries are merged
# into _node_allowlist_, so later transformers see them.
def transform_node!(node, node_allowlist)
  @transformers.each do |transformer|
    # This method may run for thousands of nodes, so the same hash is reused
    # for every call and its keys are assigned directly rather than building
    # a fresh hash through merging each time.
    env = @transformer_config

    env[:is_allowlisted] = env[:is_whitelisted] = node_allowlist.include?(node)
    env[:node] = node
    env[:node_name] = node.name.downcase
    env[:node_allowlist] = env[:node_whitelist] = node_allowlist

    outcome = transformer.call(**env)
    next unless outcome.is_a?(Hash)

    additions = outcome[:node_allowlist] || outcome[:node_whitelist]
    node_allowlist.merge(additions) if additions.respond_to?(:each)
  end

  node
end

def traverse(node, &block)

Performs top-down traversal of the given node, operating first on the node
itself, then traversing each child (if any) in order.

# Walks the tree rooted at _node_ top-down: yields _node_ first, then
# recursively traverses each of its children in order.
#
# The block may unlink or reparent the node it is given; the walk then
# resumes from the sibling that followed the last child still in place.
def traverse(node, &block)
  yield node

  current = node.child

  while current
    # Remember the left-hand neighbour before the block gets a chance to move
    # or remove `current`.
    before = current.previous_sibling

    traverse(current, &block)

    current =
      if current.parent == node
        current.next_sibling
      elsif before
        # `current` is no longer a child of `node`: continue after the
        # neighbour that preceded it.
        before.next_sibling
      else
        # `current` was the first child and is gone: restart from whatever is
        # now the first child.
        node.child
      end
  end
end

Modules

Classes