class AlgoliaHTMLExtractor
def extract
# Extracts one record per non-empty node matching `@options[:css_selector]`,
# each annotated with the headings (h1-h6) that precede it.
#
# Headings and target nodes are fetched with one combined selector so both
# kinds can be visited in a single pass, in the order `@dom.css` returns them.
#
# @return [Array<Hash>] one hash per extracted node, with the keys :html,
#   :text, :tag_name, :hierarchy (a snapshot of :lvl0..:lvl5), :anchor,
#   :node, :weight (`{ position:, heading: }`) and :uuid
def extract
  heading_selector = 'h1,h2,h3,h4,h5,h6'
  target_selector = @options[:css_selector]
  all_selector = "#{heading_selector},#{target_selector}"

  items = []
  current_hierarchy = {
    lvl0: nil,
    lvl1: nil,
    lvl2: nil,
    lvl3: nil,
    lvl4: nil,
    lvl5: nil
  }
  current_position = 0 # Index of the next extracted item
  current_lvl = nil    # Level (0-5) of the closest preceding heading
  current_anchor = nil # Closest preceding anchor

  @dom.css(all_selector).each do |node|
    # A heading updates the running hierarchy before anything else, so a
    # heading that is also a target is recorded under its own title.
    if node.matches?(heading_selector)
      # h1 => 0 ... h6 => 5
      current_lvl = extract_tag_name(node).sub(/\Ah/, '').to_i - 1
      current_hierarchy[:"lvl#{current_lvl}"] = extract_text(node)

      # Reset every deeper level. The range is bounded by the hierarchy's own
      # size so that no extra :lvl6 key is created.
      # NOTE(review): snapshots no longer carry a `lvl6: nil` entry; if `uuid`
      # digests the hierarchy keys, ids of items after a heading will change.
      # Confirm against `uuid` before releasing.
      ((current_lvl + 1)...current_hierarchy.size).each do |lvl|
        current_hierarchy[:"lvl#{lvl}"] = nil
      end

      # Keep the previous anchor when this heading does not provide one
      new_anchor = extract_anchor(node)
      current_anchor = new_anchor if new_anchor
    end

    # Only nodes matching the target selector produce records
    next unless node.matches?(target_selector)

    # Skip nodes without any text
    text = extract_text(node)
    next if text.empty?

    item = {
      html: extract_html(node),
      text: text,
      tag_name: extract_tag_name(node),
      # Snapshot: the running hash keeps being mutated by later headings
      hierarchy: current_hierarchy.clone,
      anchor: current_anchor,
      node: node,
      weight: {
        position: current_position,
        heading: heading_weight(current_lvl)
      }
    }
    # The id is computed last, from the otherwise complete item
    item[:uuid] = uuid(item)
    items << item

    current_position += 1
  end

  items
end