# lib/prawn_html/html_parser.rb



# frozen_string_literal: true

require 'oga'

module PrawnHtml
  # Walks an HTML document parsed with Oga and forwards its tags and text nodes
  # to a document renderer.
  class HtmlParser
    # Matches one CSS rule. The first capture is the selector, taken as a single
    # whitespace-free token right before `{`; the second capture is the
    # declarations found between the braces.
    REGEXP_STYLES = /\s*([^{\s]+)\s*{\s*([^}]*?)\s*}/m.freeze

    # Matches an opening body tag in any letter case, so the check agrees with
    # #init_element, which downcases element names before comparing with :body.
    REGEXP_BODY_TAG = /<body\b/i.freeze

    # Init the HtmlParser
    #
    # @param renderer [DocumentRenderer] document renderer
    # @param ignore_content_tags [Array] array of tags (symbols) to skip their contents while preparing the PDF document
    def initialize(renderer, ignore_content_tags: %i[script style])
      @processing = false
      @ignore = false
      @ignore_content_tags = ignore_content_tags
      @renderer = renderer
      @raw_styles = {}
    end

    # Processes HTML and renders it
    #
    # @param html [String] The HTML content to process
    #
    # @return [Object] whatever the renderer's +flush+ returns
    def process(html)
      @styles = {}
      # When the markup has a body tag, rendering starts only once that tag is
      # reached; otherwise every node is rendered from the beginning.
      @processing = !html.match?(REGEXP_BODY_TAG)
      @document = Oga.parse_html(html)
      process_styles # apply previously loaded styles
      traverse_nodes(document.children)
      renderer.flush
    end

    # Parses CSS styles, replacing the rules loaded so far
    #
    # @param text_styles [String] The CSS styles to evaluate
    #
    # @return [Hash] selector => declarations, as captured by REGEXP_STYLES
    def parse_styles(text_styles)
      @raw_styles = text_styles.scan(REGEXP_STYLES).to_h
    end

    private

    attr_reader :document, :ignore, :processing, :renderer, :styles

    # Visits the nodes depth-first: opens a node, visits its children, then
    # closes it when opening produced an element. Comments are skipped.
    #
    # @param nodes [Enumerable] nodes to visit
    def traverse_nodes(nodes)
      nodes.each do |node|
        next if node.is_a?(Oga::XML::Comment)

        element = node_open(node)
        traverse_nodes(node.children) if node.children.any?
        node_close(element) if element
      end
    end

    # Notifies the renderer about a node being entered.
    #
    # @param node [Object] the node being visited
    #
    # @return [Object, nil] nil while not processing; an IgnoredTag while content
    #   is being ignored; otherwise the renderer's return value for the text
    #   node or for the opened tag
    def node_open(node)
      # tag is false for non-element nodes, otherwise the downcased tag symbol
      tag = node.is_a?(Oga::XML::Element) && init_element(node)
      return unless processing
      return IgnoredTag.new(tag) if ignore
      return renderer.on_text_node(node.text) unless tag

      renderer.on_tag_open(tag, attributes: prepare_attributes(node), element_styles: styles[node])
    end

    # Updates the parser state for an element and loads the rules of style tags.
    #
    # @param node [Oga::XML::Element] the element being opened
    #
    # @return [Symbol] the downcased tag name
    def init_element(node)
      node.name.downcase.to_sym.tap do |tag_name|
        @processing = true if tag_name == :body
        @ignore = true if @processing && @ignore_content_tags.include?(tag_name)
        process_styles(node.text) if tag_name == :style
      end
    end

    # Associates the loaded CSS rules with the document nodes they select.
    # When several selectors match the same node, the rule applied last
    # replaces the previous one.
    #
    # @param text_styles [String, nil] CSS to parse first (it replaces the
    #   loaded rules); when nil the already loaded rules are applied
    def process_styles(text_styles = nil)
      parse_styles(text_styles) if text_styles
      @raw_styles.each do |selector, rule|
        document.css(selector).each do |node|
          styles[node] = rule
        end
      end
    end

    # Collects the attributes of a node.
    #
    # @param node [Oga::XML::Element] the element to read
    #
    # @return [Hash] attribute name => attribute value
    def prepare_attributes(node)
      node.attributes.each_with_object({}) do |attr, res|
        res[attr.name] = attr.value
      end
    end

    # Notifies the renderer about an element being left and restores the state.
    #
    # @param element [Object] the value returned by #node_open; must respond to +tag+
    def node_close(element)
      if processing
        renderer.on_tag_close(element) unless ignore
        # leaving the tag that started the ignored section ends it
        @ignore = false if ignore && @ignore_content_tags.include?(element.tag)
      end
      @processing = false if element.tag == :body
    end
  end

  # Lightweight stand-in for an element whose content is not rendered; it only
  # remembers the tag it was created with.
  class IgnoredTag
    attr_reader :tag
    attr_writer :tag

    # @param tag_name [Object] the tag identifier to keep, exposed through #tag
    def initialize(tag_name)
      self.tag = tag_name
    end
  end

  # Exposes HtmlParser under the additional name HtmlHandler
  # (presumably kept for backward compatibility — TODO confirm).
  HtmlHandler = HtmlParser
end