lib/nokogiri/xml/fragment_handler.rb
module Nokogiri module XML class FragmentHandler < Nokogiri::XML::SAX::Document # :nodoc: QNAME_REGEX = /(.*):(.*)/ def initialize node, original_html @doc_started = false @document = node.document @stack = [node] @html_eh = node.kind_of? HTML::DocumentFragment # the regexes used in start_element() and characters() anchor at # start-of-line, but we really only want them to anchor at # start-of-doc. so let's only save up to the first newline. # # this implementation choice was the result of some benchmarks, if # you're curious: http://gist.github.com/115936 # @original_html = original_html.lstrip newline_index = @original_html.index("\n") @original_html = @original_html[0,newline_index] if newline_index end def start_element name, attrs = [] regex = @html_eh ? %r{^\s*<#{Regexp.escape(name)}}i : %r{^\s*<#{Regexp.escape(name)}} @doc_started = true if @original_html =~ regex return unless @doc_started if match = name.match(QNAME_REGEX) prefix, name = match[1], match[2] ns = @document.root.namespace_definitions.detect { |x| x.prefix == prefix } else ns = nil end node = Element.new(name, @document) attrs << "" unless (attrs.length % 2) == 0 Hash[*attrs].each do |k,v| node[k] = v end node.namespace = ns if ns @stack.last << node @stack << node end def characters string @doc_started = true if @original_html.strip =~ %r{^\s*#{Regexp.escape(string.strip)}} @stack.last << Text.new(string, @document) end def comment string @stack.last << Comment.new(@document, string) end def cdata_block string @stack.last << CDATA.new(@document, string) end def end_element name return unless @stack.last.name == name @stack.pop end end end end