lib/roadie/markup_improver.rb
# frozen_string_literal: true

module Roadie
  # @api private
  # Class that improves the markup of a HTML DOM tree
  #
  # This class will improve the following aspects of the DOM:
  # * A HTML5 doctype will be added if missing, other doctypes will be left as-is.
  # * Basic HTML elements will be added if missing.
  #   * `<html>`
  #   * `<head>`
  #   * `<meta>` declaring charset and content-type (text/html)
  #
  # NOTE(review): earlier documentation also listed `<body>`, but nothing in
  # this class creates one - confirm whether that is handled elsewhere.
  class MarkupImprover
    # Matches the start of a doctype declaration. Doctypes are matched
    # case-insensitively so that e.g. `<!doctype html>` is recognized and an
    # existing doctype is not thrown away and replaced.
    DOCTYPE_PATTERN = /<!DOCTYPE\s/i
    private_constant :DOCTYPE_PATTERN

    # The original HTML must also be passed in in order to handle the doctypes
    # since a +Nokogiri::HTML::Document+ will always have a doctype, no matter if
    # the original source had it or not. Reading the raw HTML is the only way to
    # determine if we want to add a HTML5 doctype or not.
    #
    # @param dom the parsed document; it is mutated by {#improve}
    # @param original_html [String] the raw markup +dom+ was parsed from
    def initialize(dom, original_html)
      @dom = dom
      @html = original_html
    end

    # Runs every improvement step in order: doctype, `<html>`, `<head>` and
    # finally the content-type `<meta>` declaration.
    #
    # @return [nil] passed DOM will be mutated
    def improve
      ensure_doctype_present
      ensure_html_element_present
      head = ensure_head_element_present
      ensure_declared_charset head
      # Return nil explicitly; otherwise the freshly added <meta> node would
      # leak out as the return value whenever one had to be inserted.
      nil
    end

    protected

    attr_reader :dom

    private

    # Replaces the doctype with a HTML5 one, unless the original markup
    # already declared a doctype of its own (in any letter case).
    def ensure_doctype_present
      return if @html.match?(DOCTYPE_PATTERN)

      # Nokogiri adds a "default" doctype to the DOM, which we will remove
      dom.internal_subset&.remove
      dom.create_internal_subset "html", nil, nil
    end

    # Appends a `<html>` element to the document when none can be found.
    def ensure_html_element_present
      return if dom.at_xpath("html")

      html = Nokogiri::XML::Node.new "html", dom
      dom << html
    end

    # @return the existing `<head>` element, or a newly created one
    def ensure_head_element_present
      dom.at_xpath("html/head") || create_head_element(dom.at_xpath("html"))
    end

    # Creates a `<head>` element and places it first inside +parent+.
    #
    # @param parent the element that should contain the new `<head>`
    # @return the newly created `<head>` element
    def create_head_element(parent)
      head = Nokogiri::XML::Node.new "head", dom
      if parent.children.empty?
        parent << head
      else
        # Inserting before the children crashes when no children are present,
        # which is why the empty case is handled separately above.
        parent.children.before head
      end
      head
    end

    # Adds a content-type `<meta>` element to +parent+ when the document does
    # not have one yet.
    #
    # @param parent the element (the `<head>`) receiving the `<meta>`
    def ensure_declared_charset(parent)
      if content_type_meta_element_missing?
        parent.add_child make_content_type_element
      end
    end

    # @return [Boolean] true when no `<meta>` inside `html/head` has a
    #   `http-equiv` attribute equal to "content-type" (compared
    #   case-insensitively)
    def content_type_meta_element_missing?
      dom.xpath("html/head/meta").none? do |meta|
        meta["http-equiv"].to_s.downcase == "content-type"
      end
    end

    # Builds `<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">`.
    #
    # @return the new, not yet attached, `<meta>` element
    def make_content_type_element
      meta = Nokogiri::XML::Node.new("meta", dom)
      meta["http-equiv"] = "Content-Type"
      meta["content"] = "text/html; charset=UTF-8"
      meta
    end
  end
end