require 'hpricot/elements'
require 'uri'
module Hpricot
module Traverse
# Is this object the enclosing HTML or XML document?
def doc?() Doc::Trav === self end
# Is this object an HTML or XML element?
def elem?() Elem::Trav === self end
# Is this object an HTML text node?
def text?() Text::Trav === self end
# Is this object an XML declaration?
def xmldecl?() XMLDecl::Trav === self end
# Is this object a doctype tag?
def doctype?() DocType::Trav === self end
# Is this object an XML processing instruction?
def procins?() ProcIns::Trav === self end
# Is this object a comment?
def comment?() Comment::Trav === self end
# Is this object a stranded end tag?
def bogusetag?() BogusETag::Trav === self end
# Parses an HTML string, making an HTML fragment based on
# the options used to create the container document.
def make(input = nil, &blk)
if parent and parent.respond_to? :make
parent.make(input, &blk)
else
Hpricot.make(input, &blk).children
end
end
# Builds an HTML string from this node and its contents.
# If you need to write to a stream, try calling <tt>output(io)</tt>
# as a method on this object.
def to_html
output("")
end
alias_method :to_s, :to_html
# Attempts to preserve the original HTML of the document, only
# outputing new tags for elements which have changed.
def to_original_html
output("", :preserve => true)
end
def index(name)
i = 0
return i if name == "*"
children.each do |x|
return i if (x.respond_to?(:name) and name == x.name) or
(x.text? and name == "text()")
i += 1
end if children
-1
end
# Puts together an array of neighboring nodes based on their proximity
# to this node. So, for example, to get the next node, you could use
# <tt>nodes_at(1). Or, to get the previous node, use <tt>nodes_at(1)</tt>.
#
# This method also accepts ranges and sets of numbers.
#
# ele.nodes_at(-3..-1, 1..3) # gets three nodes before and three after
# ele.nodes_at(1, 5, 7) # gets three nodes at offsets below the current node
# ele.nodes_at(0, 5..6) # the current node and two others
def nodes_at(*pos)
sib = parent.children
i, si = 0, sib.index(self)
pos.map! do |r|
if r.is_a?(Range) and r.begin.is_a?(String)
r = Range.new(parent.index(r.begin)-si, parent.index(r.end)-si, r.exclude_end?)
end
r
end
p pos
Elements[*
sib.select do |x|
sel =
case i - si when *pos
true
end
i += 1
sel
end
]
end
# Returns the node neighboring this node to the south: just below it.
# This method includes text nodes and comments and such.
def next
sib = parent.children
sib[sib.index(self) + 1] if parent
end
alias_method :next_node, :next
# Returns to node neighboring this node to the north: just above it.
# This method includes text nodes and comments and such.
def previous
sib = parent.children
x = sib.index(self) - 1
sib[x] if sib and x >= 0
end
alias_method :previous_node, :previous
# Find all preceding nodes.
def preceding
sibs = parent.children
si = sibs.index(self)
return Elements[*sibs[0...si]]
end
# Find all nodes which follow the current one.
def following
sibs = parent.children
si = sibs.index(self) + 1
return Elements[*sibs[si...sibs.length]]
end
# Adds elements immediately after this element, contained in the +html+ string.
def after(html = nil, &blk)
parent.insert_after(make(html, &blk), self)
end
# Adds elements immediately before this element, contained in the +html+ string.
def before(html = nil, &blk)
parent.insert_before(make(html, &blk), self)
end
# Replace this element and its contents with the nodes contained
# in the +html+ string.
def swap(html = nil, &blk)
parent.altered!
parent.replace_child(self, make(html, &blk))
end
def get_subnode(*indexes)
n = self
indexes.each {|index|
n = n.get_subnode_internal(index)
}
n
end
# Builds a string from the text contained in this node. All
# HTML elements are removed.
def to_plain_text
if respond_to?(:children) and children
children.map { |x| x.to_plain_text }.join.strip.gsub(/\n{2,}/, "\n\n")
else
""
end
end
# Builds a string from the text contained in this node. All
# HTML elements are removed.
def inner_text
if respond_to?(:children) and children
children.map { |x| x.inner_text }.join
else
""
end
end
alias_method :innerText, :inner_text
# Builds an HTML string from the contents of this node.
def html(inner = nil, &blk)
if inner or blk
altered!
case inner
when Array
self.children = inner
else
self.children = make(inner, &blk)
end
reparent self.children
else
if respond_to?(:children) and children
children.map { |x| x.output("") }.join
else
""
end
end
end
alias_method :inner_html, :html
alias_method :innerHTML, :inner_html
# Inserts new contents into the current node, based on
# the HTML contained in string +inner+.
def inner_html=(inner)
html(inner || [])
end
alias_method :innerHTML=, :inner_html=
def reparent(nodes)
altered!
[*nodes].each { |e| e.parent = self }
end
private :reparent
def clean_path(path)
path.gsub(/^\s+|\s+$/, '')
end
# Builds a unique XPath string for this node, from the
# root of the document containing it.
def xpath
if elem? and has_attribute? 'id'
"//#{self.name}[@id='#{get_attribute('id')}']"
else
sim, id = 0, 0, 0
parent.children.each do |e|
id = sim if e == self
sim += 1 if e.pathname == self.pathname
end if parent.children
p = File.join(parent.xpath, self.pathname)
p += "[#{id+1}]" if sim >= 2
p
end
end
# Builds a unique CSS string for this node, from the
# root of the document containing it.
def css_path
if elem? and has_attribute? 'id'
"##{get_attribute('id')}"
else
sim, i, id = 0, 0, 0
parent.children.each do |e|
id = sim if e == self
sim += 1 if e.pathname == self.pathname
end if parent.children
p = parent.css_path
p = p ? "#{p} > #{self.pathname}" : self.pathname
p += ":nth(#{id})" if sim >= 2
p
end
end
def node_position
parent.children.index(self)
end
def position
parent.children_of_type(self.pathname).index(self)
end
# Searches this node for all elements matching
# the CSS or XPath +expr+. Returns an Elements array
# containing the matching nodes. If +blk+ is given, it
# is used to iterate through the matching set.
def search(expr, &blk)
if Range === expr
return Elements.expand(at(expr.begin), at(expr.end), expr.exclude_end?)
end
last = nil
nodes = [self]
done = []
expr = expr.to_s
hist = []
until expr.empty?
expr = clean_path(expr)
expr.gsub!(%r!^//!, '')
case expr
when %r!^/?\.\.!
last = expr = $'
nodes.map! { |node| node.parent }
when %r!^[>/]\s*!
last = expr = $'
nodes = Elements[*nodes.map { |node| node.children if node.respond_to? :children }.flatten.compact]
when %r!^\+!
last = expr = $'
nodes.map! do |node|
siblings = node.parent.children
siblings[siblings.index(node)+1]
end
nodes.compact!
when %r!^~!
last = expr = $'
nodes.map! do |node|
siblings = node.parent.children
siblings[(siblings.index(node)+1)..-1]
end
nodes.flatten!
when %r!^[|,]!
last = expr = " #$'"
nodes.shift if nodes.first == self
done += nodes
nodes = [self]
else
m = expr.match(%r!^([#.]?)([a-z0-9\\*_-]*)!i).to_a
after = $'
mt = after[%r!:[a-z0-9\\*_-]+!i, 0]
oop = false
if mt and not (mt == ":not" or Traverse.method_defined? "filter[#{mt}]")
after = $'
m[2] += mt
expr = after
end
if m[1] == '#'
oid = get_element_by_id(m[2])
nodes = oid ? [oid] : []
expr = after
else
m[2] = "*" if after =~ /^\(\)/ || m[2] == "" || m[1] == "."
ret = []
nodes.each do |node|
case m[2]
when '*'
node.traverse_element { |n| ret << n }
else
if node.respond_to? :get_elements_by_tag_name
ret += [*node.get_elements_by_tag_name(m[2])] - [*(node unless last)]
end
end
end
nodes = ret
end
last = nil
end
hist << expr
break if hist[-1] == hist[-2]
nodes, expr = Elements.filter(nodes, expr)
end
nodes = done + nodes.flatten.uniq
if blk
nodes.each(&blk)
self
else
Elements[*nodes]
end
end
alias_method :/, :search
# Find the first matching node for the CSS or XPath
# +expr+ string.
def at(expr)
search(expr).first
end
alias_method :%, :at
# +traverse_element+ traverses elements in the tree.
# It yields elements in depth first order.
#
# If _names_ are empty, it yields all elements.
# If non-empty _names_ are given, it should be list of universal names.
#
# A nested element is yielded in depth first order as follows.
#
# t = Hpricot('<a id=0><b><a id=1 /></b><c id=2 /></a>')
# t.traverse_element("a", "c") {|e| p e}
# # =>
# {elem <a id="0"> {elem <b> {emptyelem <a id="1">} </b>} {emptyelem <c id="2">} </a>}
# {emptyelem <a id="1">}
# {emptyelem <c id="2">}
#
# Universal names are specified as follows.
#
# t = Hpricot(<<'End')
# <html>
# <meta name="robots" content="index,nofollow">
# <meta name="author" content="Who am I?">
# </html>
# End
# t.traverse_element("{http://www.w3.org/1999/xhtml}meta") {|e| p e}
# # =>
# {emptyelem <{http://www.w3.org/1999/xhtml}meta name="robots" content="index,nofollow">}
# {emptyelem <{http://www.w3.org/1999/xhtml}meta name="author" content="Who am I?">}
#
def traverse_element(*names, &block) # :yields: element
if names.empty?
traverse_all_element(&block)
else
name_set = {}
names.each {|n| name_set[n] = true }
traverse_some_element(name_set, &block)
end
nil
end
# Find children of a given +tag_name+.
#
# ele.children_of_type('p')
# #=> [...array of paragraphs...]
#
def children_of_type(tag_name)
if respond_to? :children
children.find_all do |x|
x.respond_to?(:pathname) && x.pathname == tag_name
end
end
end
end
module Container::Trav
# Return all children of this node which can contain other
# nodes. This is a good way to get all HTML elements which
# aren't text, comment, doctype or processing instruction nodes.
def containers
children.grep(Container::Trav)
end
# Returns the container node neighboring this node to the south: just below it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#next_node if you need to hunt out all kinds of nodes.
def next_sibling
sib = parent.containers
sib[sib.index(self) + 1] if parent
end
# Returns the container node neighboring this node to the north: just above it.
# By "container" node, I mean: this method does not find text nodes or comments or cdata or any of that.
# See Hpricot::Traverse#previous_node if you need to hunt out all kinds of nodes.
def previous_sibling
sib = parent.containers
x = sib.index(self) - 1
sib[x] if sib and x >= 0
end
# Find all preceding sibling elements. Like the other "sibling" methods, this weeds
# out text and comment nodes.
def preceding_siblings()
sibs = parent.containers
si = sibs.index(self)
return Elements[*sibs[0...si]]
end
# Find sibling elements which follow the current one. Like the other "sibling" methods, this weeds
# out text and comment nodes.
def following_siblings()
sibs = parent.containers
si = sibs.index(self) + 1
return Elements[*sibs[si...sibs.length]]
end
# Puts together an array of neighboring sibling elements based on their proximity
# to this element.
#
# This method accepts ranges and sets of numbers.
#
# ele.siblings_at(-3..-1, 1..3) # gets three elements before and three after
# ele.siblings_at(1, 5, 7) # gets three elements at offsets below the current element
# ele.siblings_at(0, 5..6) # the current element and two others
#
# Like the other "sibling" methods, this doesn't find text and comment nodes.
# Use nodes_at to include those nodes.
def siblings_at(*pos)
sib = parent.containers
i, si = 0, sib.index(self)
Elements[*
sib.select do |x|
sel = case i - si when *pos
true
end
i += 1
sel
end
]
end
# Replace +old+, a child of the current node, with +new+ node.
def replace_child(old, new)
reparent new
children[children.index(old), 1] = [*new]
end
# Insert +nodes+, an array of HTML elements or a single element,
# before the node +ele+, a child of the current node.
def insert_before(nodes, ele)
case nodes
when Array
nodes.each { |n| insert_before(n, ele) }
else
reparent nodes
children[children.index(ele) || 0, 0] = nodes
end
end
# Insert +nodes+, an array of HTML elements or a single element,
# after the node +ele+, a child of the current node.
def insert_after(nodes, ele)
case nodes
when Array
nodes.reverse_each { |n| insert_after(n, ele) }
else
reparent nodes
idx = children.index(ele)
children[idx ? idx + 1 : children.length, 0] = nodes
end
end
# +each_child+ iterates over each child.
def each_child(&block) # :yields: child_node
children.each(&block) if children
nil
end
# +each_child_with_index+ iterates over each child.
def each_child_with_index(&block) # :yields: child_node, index
children.each_with_index(&block) if children
nil
end
# +find_element+ searches an element which universal name is specified by
# the arguments.
# It returns nil if not found.
def find_element(*names)
traverse_element(*names) {|e| return e }
nil
end
# Returns a list of CSS classes to which this element belongs.
def classes
get_attribute('class').to_s.strip.split(/\s+/)
end
def get_element_by_id(id)
traverse_all_element do |ele|
if ele.elem? and eid = ele.get_attribute('id')
return ele if eid.to_s == id
end
end
nil
end
def get_elements_by_tag_name(*a)
list = Elements[]
a.delete("*")
traverse_element(*a.map { |tag| [tag, "{http://www.w3.org/1999/xhtml}#{tag}"] }.flatten) do |e|
list << e if e.elem?
end
list
end
def each_hyperlink_attribute
traverse_element(
'{http://www.w3.org/1999/xhtml}a',
'{http://www.w3.org/1999/xhtml}area',
'{http://www.w3.org/1999/xhtml}link',
'{http://www.w3.org/1999/xhtml}img',
'{http://www.w3.org/1999/xhtml}object',
'{http://www.w3.org/1999/xhtml}q',
'{http://www.w3.org/1999/xhtml}blockquote',
'{http://www.w3.org/1999/xhtml}ins',
'{http://www.w3.org/1999/xhtml}del',
'{http://www.w3.org/1999/xhtml}form',
'{http://www.w3.org/1999/xhtml}input',
'{http://www.w3.org/1999/xhtml}head',
'{http://www.w3.org/1999/xhtml}base',
'{http://www.w3.org/1999/xhtml}script') {|elem|
case elem.name
when %r{\{http://www.w3.org/1999/xhtml\}(?:base|a|area|link)\z}i
attrs = ['href']
when %r{\{http://www.w3.org/1999/xhtml\}(?:img)\z}i
attrs = ['src', 'longdesc', 'usemap']
when %r{\{http://www.w3.org/1999/xhtml\}(?:object)\z}i
attrs = ['classid', 'codebase', 'data', 'usemap']
when %r{\{http://www.w3.org/1999/xhtml\}(?:q|blockquote|ins|del)\z}i
attrs = ['cite']
when %r{\{http://www.w3.org/1999/xhtml\}(?:form)\z}i
attrs = ['action']
when %r{\{http://www.w3.org/1999/xhtml\}(?:input)\z}i
attrs = ['src', 'usemap']
when %r{\{http://www.w3.org/1999/xhtml\}(?:head)\z}i
attrs = ['profile']
when %r{\{http://www.w3.org/1999/xhtml\}(?:script)\z}i
attrs = ['src', 'for']
end
attrs.each {|attr|
if hyperlink = elem.get_attribute(attr)
yield elem, attr, hyperlink
end
}
}
end
private :each_hyperlink_attribute
# +each_hyperlink_uri+ traverses hyperlinks such as HTML href attribute
# of A element.
#
# It yields Hpricot::Text and URI for each hyperlink.
#
# The URI objects are created with a base URI which is given by
# HTML BASE element or the argument ((|base_uri|)).
# +each_hyperlink_uri+ doesn't yields href of the BASE element.
def each_hyperlink_uri(base_uri=nil) # :yields: hyperlink, uri
base_uri = URI.parse(base_uri) if String === base_uri
links = []
each_hyperlink_attribute {|elem, attr, hyperlink|
if %r{\{http://www.w3.org/1999/xhtml\}(?:base)\z}i =~ elem.name
base_uri = URI.parse(hyperlink.to_s)
else
links << hyperlink
end
}
if base_uri
links.each {|hyperlink| yield hyperlink, base_uri + hyperlink.to_s }
else
links.each {|hyperlink| yield hyperlink, URI.parse(hyperlink.to_s) }
end
end
# +each_hyperlink+ traverses hyperlinks such as HTML href attribute
# of A element.
#
# It yields Hpricot::Text.
#
# Note that +each_hyperlink+ yields HTML href attribute of BASE element.
def each_hyperlink # :yields: text
links = []
each_hyperlink_attribute {|elem, attr, hyperlink|
yield hyperlink
}
end
# +each_uri+ traverses hyperlinks such as HTML href attribute
# of A element.
#
# It yields URI for each hyperlink.
#
# The URI objects are created with a base URI which is given by
# HTML BASE element or the argument ((|base_uri|)).
def each_uri(base_uri=nil) # :yields: URI
each_hyperlink_uri(base_uri) {|hyperlink, uri| yield uri }
end
end
# :stopdoc:
module Doc::Trav
def traverse_all_element(&block)
children.each {|c| c.traverse_all_element(&block) } if children
end
def xpath
"/"
end
def css_path
nil
end
end
module Elem::Trav
def traverse_all_element(&block)
yield self
children.each {|c| c.traverse_all_element(&block) } if children
end
end
module Leaf::Trav
def traverse_all_element
yield self
end
end
module Doc::Trav
def traverse_some_element(name_set, &block)
children.each {|c| c.traverse_some_element(name_set, &block) } if children
end
end
module Elem::Trav
def traverse_some_element(name_set, &block)
yield self if name_set.include? self.name
children.each {|c| c.traverse_some_element(name_set, &block) } if children
end
end
module Leaf::Trav
def traverse_some_element(name_set)
end
end
# :startdoc:
module Traverse
# +traverse_text+ traverses texts in the tree
def traverse_text(&block) # :yields: text
traverse_text_internal(&block)
nil
end
end
# :stopdoc:
module Container::Trav
def traverse_text_internal(&block)
each_child {|c| c.traverse_text_internal(&block) }
end
end
module Leaf::Trav
def traverse_text_internal
end
end
module Text::Trav
def traverse_text_internal
yield self
end
end
# :startdoc:
module Container::Trav
# +filter+ rebuilds the tree without some components.
#
# node.filter {|descendant_node| predicate } -> node
# loc.filter {|descendant_loc| predicate } -> node
#
# +filter+ yields each node except top node.
# If given block returns false, corresponding node is dropped.
# If given block returns true, corresponding node is retained and
# inner nodes are examined.
#
# +filter+ returns an node.
# It doesn't return location object even if self is location object.
#
def filter(&block)
subst = {}
each_child_with_index {|descendant, i|
if yield descendant
if descendant.elem?
subst[i] = descendant.filter(&block)
else
subst[i] = descendant
end
else
subst[i] = nil
end
}
to_node.subst_subnode(subst)
end
end
module Doc::Trav
# +title+ searches title and return it as a text.
# It returns nil if not found.
#
# +title+ searchs following information.
#
# - <title>...</title> in HTML
# - <title>...</title> in RSS
def title
e = find_element('title',
'{http://www.w3.org/1999/xhtml}title',
'{http://purl.org/rss/1.0/}title',
'{http://my.netscape.com/rdf/simple/0.9/}title')
e && e.extract_text
end
# +author+ searches author and return it as a text.
# It returns nil if not found.
#
# +author+ searchs following information.
#
# - <meta name="author" content="author-name"> in HTML
# - <link rev="made" title="author-name"> in HTML
# - <dc:creator>author-name</dc:creator> in RSS
# - <dc:publisher>author-name</dc:publisher> in RSS
def author
traverse_element('meta',
'{http://www.w3.org/1999/xhtml}meta') {|e|
begin
next unless e.fetch_attr('name').downcase == 'author'
author = e.fetch_attribute('content').strip
return author if !author.empty?
rescue IndexError
end
}
traverse_element('link',
'{http://www.w3.org/1999/xhtml}link') {|e|
begin
next unless e.fetch_attr('rev').downcase == 'made'
author = e.fetch_attribute('title').strip
return author if !author.empty?
rescue IndexError
end
}
if channel = find_element('{http://purl.org/rss/1.0/}channel')
channel.traverse_element('{http://purl.org/dc/elements/1.1/}creator') {|e|
begin
author = e.extract_text.strip
return author if !author.empty?
rescue IndexError
end
}
channel.traverse_element('{http://purl.org/dc/elements/1.1/}publisher') {|e|
begin
author = e.extract_text.strip
return author if !author.empty?
rescue IndexError
end
}
end
nil
end
end
module Doc::Trav
def root
es = []
children.each {|c| es << c if c.elem? } if children
raise Hpricot::Error, "no element" if es.empty?
raise Hpricot::Error, "multiple top elements" if 1 < es.length
es[0]
end
end
module Elem::Trav
def has_attribute?(name)
self.raw_attributes && self.raw_attributes.has_key?(name.to_s)
end
def get_attribute(name)
a = self.raw_attributes && self.raw_attributes[name.to_s]
a = Hpricot.uxs(a) if a
a
end
alias_method :[], :get_attribute
def set_attribute(name, val)
altered!
self.raw_attributes ||= {}
self.raw_attributes[name.to_s] = val.fast_xs
end
alias_method :[]=, :set_attribute
def remove_attribute(name)
name = name.to_s
if has_attribute? name
altered!
self.raw_attributes.delete(name)
end
end
end
end