lib/roadie/markup_improver.rb



# frozen_string_literal: true

module Roadie
  # @api private
  # Class that improves the markup of a HTML DOM tree
  #
  # This class will improve the following aspects of the DOM:
  # * A HTML5 doctype will be added if missing, other doctypes will be left as-is.
  # * Basic HTML elements will be added if missing.
  #   * +<html>+
  #   * +<head>+
  #   * +<body>+
  #   * +<meta>+ declaring charset and content-type (text/html)
  #
  # @note Due to a Nokogiri bug, the HTML5 doctype cannot be added under JRuby. No doctype is outputted under JRuby.
  #   See https://github.com/sparklemotion/nokogiri/issues/984
  class MarkupImprover
    # The original HTML must also be passed in in order to handle the doctypes
    # since a +Nokogiri::HTML::Document+ will always have a doctype, no matter if
    # the original source had it or not. Reading the raw HTML is the only way to
    # determine if we want to add a HTML5 doctype or not.
    def initialize(dom, original_html)
      @dom = dom
      @html = original_html
    end

    # @return [nil] passed DOM will be mutated
    def improve
      ensure_doctype_present
      ensure_html_element_present
      head = ensure_head_element_present
      ensure_declared_charset head
    end

    protected
    attr_reader :dom

    private
    def ensure_doctype_present
      return if uses_buggy_jruby?
      return if @html.include?('<!DOCTYPE ')
      # Nokogiri adds a "default" doctype to the DOM, which we will remove
      dom.internal_subset.remove unless dom.internal_subset.nil?
      dom.create_internal_subset 'html', nil, nil
    end

    # JRuby up to at least 1.6.0 has a bug where the doctype of a document cannot be changed.
    # See https://github.com/sparklemotion/nokogiri/issues/984
    def uses_buggy_jruby?
      # No reason to check for version yet since no existing version has a fix.
      defined?(JRuby)
    end

    def ensure_html_element_present
      return if dom.at_xpath('html')
      html = Nokogiri::XML::Node.new 'html', dom
      dom << html
    end

    def ensure_head_element_present
      if (head = dom.at_xpath('html/head'))
        head
      else
        create_head_element dom.at_xpath('html')
      end
    end

    def create_head_element(parent)
      head = Nokogiri::XML::Node.new 'head', dom
      unless parent.children.empty?
        # Crashes when no children are present
        parent.children.before head
      else
        parent << head
      end
      head
    end

    def ensure_declared_charset(parent)
      if content_type_meta_element_missing?
        parent.add_child make_content_type_element
      end
    end

    def content_type_meta_element_missing?
      dom.xpath('html/head/meta').none? do |meta|
        meta['http-equiv'].to_s.downcase == 'content-type'
      end
    end

    def make_content_type_element
      meta = Nokogiri::XML::Node.new('meta', dom)
      meta['http-equiv'] = 'Content-Type'
      meta['content'] = 'text/html; charset=UTF-8'
      meta
    end
  end
end