module AlgoliaHTMLExtractor
def self.extract_anchor(node)
Foo
=> anchorFoo
=> anchorFoo
=> anchoreg.
Returns the anchor to the node
def self.extract_anchor(node) anchor = node.attr('name') || node.attr('id') || nil return anchor unless anchor.nil? # No anchor found directly in the header, search on children subelement = node.css('[name],[id]') return extract_anchor(subelement[0]) unless subelement.empty? nil end
def self.extract_html(node)
foo
=>foo
eg.
Returns the outer HTML of a given node
def self.extract_html(node) node.to_s.strip end
def self.extract_tag_name(node)
foo
=> peg
Returns the tag name of a given node
def self.extract_tag_name(node) node.name.downcase end
def self.extract_text(node)
foo
=> fooeg.
Returns the inner HTML of a given node
def self.extract_text(node) node.content end
def self.heading_weight(heading_level)
Get a relative numeric value of the importance of the heading
#
def self.heading_weight(heading_level) weight = 100 return weight if heading_level.nil? weight - ((heading_level + 1) * 10) end
def self.run(input, options: {})
def self.run(input, options: {}) default_options = { css_selector: 'p' } options = default_options.merge(options) heading_selector = 'h1,h2,h3,h4,h5,h6' # We select all nodes that match either the headings or the elements to # extract. This will allow us to loop over it in order it appears in the DOM all_selector = "#{heading_selector},#{options[:css_selector]}" items = [] current_hierarchy = { lvl0: nil, lvl1: nil, lvl2: nil, lvl3: nil, lvl4: nil, lvl5: nil } current_position = 0 # Position of the DOM node in the tree current_lvl = nil # Current closest hierarchy level current_anchor = nil # Current closest anchor dom = Nokogiri::HTML(input) dom.css(all_selector).each do |node| # If it's a heading, we update our current hierarchy if node.matches?(heading_selector) # Which level heading is it? current_lvl = extract_tag_name(node).gsub(/^h/, '').to_i - 1 # Update this level, and set all the following ones to nil current_hierarchy["lvl#{current_lvl}".to_sym] = extract_text(node) (current_lvl + 1..6).each do |lvl| current_hierarchy["lvl#{lvl}".to_sym] = nil end # Update the anchor, if the new heading has one new_anchor = extract_anchor(node) current_anchor = new_anchor if new_anchor end # Stop if node is not to be extracted next unless node.matches?(options[:css_selector]) # Stop if node is empty content = extract_text(node) next if content.empty? item = { html: extract_html(node), content: content, tag_name: extract_tag_name(node), hierarchy: current_hierarchy.clone, anchor: current_anchor, node: node, weight: { position: current_position, heading: heading_weight(current_lvl) } } item[:objectID] = uuid(item) items << item current_position += 1 end items end
def self.uuid(item)
#
def self.uuid(item) # We don't use the objectID as part of the hash algorithm item.delete(:objectID) # We first get all the keys of the object, sorted alphabetically... ordered_keys = item.keys.sort # ...then we build a huge array of "key=value" pairs... ordered_array = ordered_keys.map do |key| value = item[key] # We apply the method recursively on other hashes value = uuid(value) if value.is_a?(Hash) "#{key}=#{value}" end # ...then we build a unique md5 hash of it Digest::MD5.hexdigest(ordered_array.join(',')) end