class RandomWords::HTML2Markdown

Convert HTML to Markdown

def add_link(link)

Parameters:
  • link (Hash) -- The link to add
# Register a link and return its reference number.
#
# When a base URI is set, the href is rewritten in place (in the passed
# hash) to an absolute string: a missing scheme and host are taken from the
# base URI, and a relative path is appended to the base URI's path. An href
# that cannot be parsed is treated as an empty href. Links whose (resolved)
# href is already registered are not added again.
#
# @param link [Hash] The link to add (+:href+ and +:title+ keys)
# @return [Integer] 1-based position of the link in the reference list
def add_link(link)
  if @baseuri
    href = begin
      URI.parse(link[:href])
    rescue StandardError
      # Unparseable (or nil) href: fall back to an empty URI rather than
      # aborting the conversion. StandardError keeps signals/exit intact.
      URI.parse('')
    end
    href.scheme = @baseuri.scheme unless href.scheme
    unless href.opaque
      # Only an href without its own host is relative to the base path;
      # an absolute URL such as "http://other.com" must be left alone.
      relative = href.host.nil?
      href.host = @baseuri.host if relative
      if relative && href.path.to_s[0] != '/'
        # chomp avoids a doubled slash when the base path ends in "/".
        href.path = "#{@baseuri.path.to_s.chomp('/')}/#{href.path}"
      end
    end
    link[:href] = href.to_s
  end

  existing = @links.index { |known| known[:href] == link[:href] }
  return existing + 1 if existing

  @links << link
  @links.length
end

def initialize(str, baseurl = nil)

Parameters:
  • baseurl (String) -- The base URL for resolving relative links
  • str (String) -- The HTML string to convert
# Parse an HTML string and convert it to Markdown.
#
# The converted text is stored in @markdown; links met during conversion
# are collected in @links.
#
# @param str [String] The HTML string to convert
# @param baseurl [String, nil] The base URL for resolving relative links
def initialize(str, baseurl = nil)
  @links = []
  @baseuri = (baseurl ? URI.parse(baseurl) : nil)
  @section_level = 0
  # Flags toggled by output_for while walking <pre> and <table> elements.
  @in_pre = false
  @table_header = false
  @encoding = str.encoding

  root = Nokogiri::HTML(str, baseurl).root
  # NOTE(review): Nokogiri is expected to return a nil root for an empty
  # document; guard so that case yields empty Markdown instead of raising.
  # Runs of three or more newlines are collapsed to exactly three.
  @markdown = root ? output_for(root).gsub(/\n\n\n+/, "\n\n\n") : ''
end

def output_for(node)

Returns:
  • (String) - The converted Markdown string

Parameters:
  • node (Nokogiri::XML::Node) -- The node to convert
# Convert a node, and recursively its descendants, to Markdown.
#
# Dispatches on the node's tag name. Unrecognised elements contribute only
# the output of their children. Links and media are emitted as numbered
# references registered through add_link.
#
# @param node [Nokogiri::XML::Node] The node to convert
# @return [String] The converted Markdown string
def output_for(node)
  case node.name
  when 'head', 'style', 'script'
    # Non-content elements produce no output.
    ''
  when 'br'
    "  \n"
  when 'p', 'div'
    "\n\n#{output_for_children(node)}\n\n"
  when 'section', 'article'
    # Headings inside a section are pushed one level deeper.
    @section_level += 1
    o = "\n\n----\n\n#{output_for_children(node)}\n\n"
    @section_level -= 1
    o
  when /\Ah(\d+)\z/
    # Anchored so only real heading tags (h1, h2, ...) match.
    level = ::Regexp.last_match(1).to_i + @section_level
    "\n\n#{'#' * level} #{output_for_children(node)}\n\n"
  when 'mark'
    "==#{output_for_children(node)}=="
  when 'blockquote'
    @section_level += 1
    # Prefix every line with "> " and squeeze runs of empty quoted lines.
    o = "\n\n> #{output_for_children(node).lstrip.gsub("\n", "\n> ")}\n\n".gsub(/> \n(> \n)+/, "> \n")
    @section_level -= 1
    o
  when 'cite'
    "*#{output_for_children(node)}*"
  when 'ul'
    # NOTE(review): in the regexp below "^" binds only to the tab branch, so
    # four-space runs are replaced anywhere in a line -- confirm intent.
    items = node.children.map do |el|
      next if el.name == 'text'

      "* #{output_for_children(el).gsub(/^(\t)|(    )/, "\t\t").gsub(/^>/, "\t>")}\n"
    end
    "\n\n#{items.join}\n\n"
  when 'ol'
    i = 0
    items = node.children.map do |el|
      next if el.name == 'text'

      i += 1
      "#{i}. #{output_for_children(el).gsub(/^(\t)|(    )/, "\t\t").gsub(/^>/, "\t>")}\n"
    end
    "\n\n#{items.join}\n\n"
  when 'code'
    if @in_pre
      # First <code> inside a <pre>: emit its raw text; the fence is added
      # by the enclosing 'pre' branch.
      @in_pre = false
      node.text.strip
    else
      "`#{output_for_children(node).gsub("\n", ' ')}`"
    end
  when 'pre'
    @in_pre = true
    lang = node['lang'] || node['language']
    lang = '' if lang.nil? || lang.empty?
    body = output_for_children(node).lstrip
    # Reset even when the <pre> held no <code>, so a later inline <code>
    # is not mistaken for preformatted content.
    @in_pre = false
    "\n\n```#{lang}\n#{body}\n```\n\n"
  when 'hr'
    "\n\n----\n\n"
  when 'a', 'link'
    link = { href: node['href'], title: node['title'] }
    "[#{output_for_children(node).gsub("\n", ' ')}][#{add_link(link)}]"
  when 'img'
    link = { href: node['src'], title: node['title'] }
    "![#{node['alt']}][#{add_link(link)}]"
  when 'video', 'audio', 'embed'
    link = { href: node['src'], title: node['title'] }
    "[#{output_for_children(node).gsub("\n", ' ')}][#{add_link(link)}]"
  when 'object'
    link = { href: node['data'], title: node['title'] }
    "[#{output_for_children(node).gsub("\n", ' ')}][#{add_link(link)}]"
  when 'i', 'em', 'u'
    "_#{output_for_children(node)}_"
  when 'b', 'strong'
    "**#{output_for_children(node)}**"
  when 'dl'
    # Children other than dt/dd map to nil, which join renders as "".
    entries = node.children.map do |el|
      case el.name
      when 'dt'
        "#{output_for_children(el)}\n"
      when 'dd'
        ": #{output_for_children(el)}\n\n"
      end
    end
    "\n\n#{entries.join}\n\n"
  # Tables are not part of core Markdown; rows are emitted as pipe-delimited
  # lines, with a dashed separator row after the first row.
  when 'table'
    @table_header = true
    rows = node.children.select { |c| %w[tr tbody thead].include?(c.name) }
    rows.map { |c| output_for(c) }.join
  when 'tr'
    cells = node.children.select { |c| %w[th td].include?(c.name) }
    header = "\n"
    if @table_header
      # Only the first row of a table is followed by the separator line.
      @table_header = false
      header = "\n|#{cells.count.times.map { '-------' }.join('|')}|\n"
    end
    # Adjacent cells each carry their own pipes; merge the doubled ones.
    cells.map { |c| output_for(c) }.join.gsub('||', '|') + header
  when 'th', 'td'
    "| #{output_for_children(node)} |"
  when 'text'
    # Sometimes Nokogiri lies. Force the encoding back to what we know it is
    if (c = node.content.force_encoding(@encoding)) =~ /\S/
      # Collapse whitespace runs to one space, but keep blank-line breaks.
      c.gsub!(/\n\n+/, '<$PreserveDouble$>')
      c.gsub!(/\s+/, ' ')
      c.gsub('<$PreserveDouble$>', "\n\n")
    else
      c
    end
  else
    output_for_children(node)
  end
end

def output_for_children(node)

Parameters:
  • node (Nokogiri::XML::Node) -- The parent node
# Convert every child of a node, in document order, and concatenate the
# results.
#
# @param node [Nokogiri::XML::Node] The parent node
# @return [String] The joined output of output_for for each child
def output_for_children(node)
  converted = node.children.map(&method(:output_for))
  converted.join
end

def to_s

Returns:
  • (String) - The converted Markdown string
# Render the final Markdown document.
#
# The stored Markdown is passed through TableCleanup (the cleaned text is
# written back to @markdown), then followed by a blank line and one
# numbered reference definition per collected link.
#
# @return [String] The converted Markdown string
def to_s
  @markdown = TableCleanup.new(@markdown).clean

  references = @links.each_with_index.map do |link, position|
    title = link[:title] ? %( "#{link[:title]}") : ''
    "[#{position + 1}]: #{link[:href]}#{title}"
  end

  "#{@markdown}\n\n#{references.join("\n")}"
end

def wrap(str)

Returns:
  • (String) - The wrapped string

Parameters:
  • str (String) -- The string to wrap
# Break a single-line string into several lines at word boundaries.
#
# Words (split on runs of spaces/tabs) are collected until the joined line
# is at least 74 characters long, at which point that line is emitted. A
# string that already contains a newline is returned unchanged. A trailing
# space or tab on the input is carried over to the last line.
#
# @param str [String] The string to wrap
# @return [String] The wrapped string
def wrap(str)
  return str if str.include?("\n")

  finished = []
  pending = []
  str.split(/[ \t]+/).each do |word|
    pending << word
    candidate = pending.join(' ')
    next if candidate.length < 74

    finished << candidate
    pending = []
  end

  last_char = str[-1..-1]
  trailer = last_char =~ /[ \t\n]/ ? last_char : ''
  finished << (pending.join(' ') + trailer)
  finished.join("\n")
end