lib/nokogiri/xml/sax/parser.rb



# frozen_string_literal: true

module Nokogiri
  module XML
    module SAX
      ###
      # This parser is a SAX style parser that reads its input as it deems necessary. The parser
      # takes a Nokogiri::XML::SAX::Document, an optional encoding, then given an XML input, sends
      # messages to the Nokogiri::XML::SAX::Document.
      #
      # Here is an example of using this parser:
      #
      #   # Create a subclass of Nokogiri::XML::SAX::Document and implement
      #   # the events we care about:
      #   class MyHandler < Nokogiri::XML::SAX::Document
      #     def start_element name, attrs = []
      #       puts "starting: #{name}"
      #     end
      #
      #     def end_element name
      #       puts "ending: #{name}"
      #     end
      #   end
      #
      #   parser = Nokogiri::XML::SAX::Parser.new(MyHandler.new)
      #
      #   # Hand an IO object to the parser, which will read the XML from the IO.
      #   File.open(path_to_xml) do |f|
      #     parser.parse(f)
      #   end
      #
      # For more information about \SAX parsers, see Nokogiri::XML::SAX.
      #
      # Also see Nokogiri::XML::SAX::Document for the available events.
      #
      # For \HTML documents, use the subclass Nokogiri::HTML4::SAX::Parser.
      #
      class Parser
        # to dynamically resolve ParserContext in inherited methods
        include Nokogiri::ClassResolver

        # Structure used for marshalling attributes for some callbacks in XML::SAX::Document.
        class Attribute < Struct.new(:localname, :prefix, :uri, :value)
        end

        # Maps encoding names to integer codes. Deprecated (see deprecate_constant below) and
        # frozen so the shared table cannot be mutated by accident.
        # NOTE(review): the integer values presumably mirror libxml2's character-encoding
        # enumeration -- confirm against the native extension before changing any of them.
        ENCODINGS = { # :nodoc:
          "NONE" => 0, # No char encoding detected
          "UTF-8" => 1, # UTF-8
          "UTF16LE" => 2, # UTF-16 little endian
          "UTF16BE" => 3, # UTF-16 big endian
          "UCS4LE" => 4, # UCS-4 little endian
          "UCS4BE" => 5, # UCS-4 big endian
          "EBCDIC" => 6, # EBCDIC uh!
          "UCS4-2143" => 7, # UCS-4 unusual ordering
          "UCS4-3412" => 8, # UCS-4 unusual ordering
          "UCS2" => 9, # UCS-2
          "ISO-8859-1" => 10, # ISO-8859-1 ISO Latin 1
          "ISO-8859-2" => 11, # ISO-8859-2 ISO Latin 2
          "ISO-8859-3" => 12, # ISO-8859-3
          "ISO-8859-4" => 13, # ISO-8859-4
          "ISO-8859-5" => 14, # ISO-8859-5
          "ISO-8859-6" => 15, # ISO-8859-6
          "ISO-8859-7" => 16, # ISO-8859-7
          "ISO-8859-8" => 17, # ISO-8859-8
          "ISO-8859-9" => 18, # ISO-8859-9
          "ISO-2022-JP" => 19, # ISO-2022-JP
          "SHIFT-JIS" => 20, # Shift_JIS
          "EUC-JP" => 21, # EUC-JP
          "ASCII" => 22, # pure ASCII
        }.freeze
        # Inverse lookup (integer code => encoding name). Built before ENCODINGS is marked
        # deprecated so that loading this file does not itself emit a deprecation warning.
        REVERSE_ENCODINGS = ENCODINGS.invert.freeze # :nodoc:
        deprecate_constant :ENCODINGS

        # The Nokogiri::XML::SAX::Document where events will be sent.
        attr_accessor :document

        # The encoding being used for this document.
        attr_accessor :encoding

        ###
        # :call-seq:
        #   new ⇒ SAX::Parser
        #   new(handler) ⇒ SAX::Parser
        #   new(handler, encoding) ⇒ SAX::Parser
        #
        # Create a new Parser.
        #
        # [Parameters]
        # - +handler+ (optional Nokogiri::XML::SAX::Document) The document that will receive
        #   events. Will create a new Nokogiri::XML::SAX::Document if not given, which is accessible
        #   through the #document attribute.
        # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
        #   parsing the input. (default +nil+ for auto-detection)
        #
        def initialize(doc = Nokogiri::XML::SAX::Document.new, encoding = nil)
          @encoding = encoding
          @document = doc
          # NOTE(review): @warned is not read anywhere in this file; it is kept because native
          # code or subclasses may depend on it -- confirm before removing.
          @warned   = false

          # initialize_native is not defined in this file; presumably it is supplied by the
          # native extension, which is why it is skipped on JRuby.
          initialize_native unless Nokogiri.jruby?
        end

        ###
        # :call-seq:
        #   parse(input) { |parser_context| ... }
        #
        # Parse the input, sending events to the SAX::Document at #document.
        #
        # [Parameters]
        # - +input+ (String, IO) The input to parse.
        #
        # If +input+ quacks like a readable IO object, this method forwards to Parser.parse_io,
        # otherwise it forwards to Parser.parse_memory.
        #
        # [Yields]
        # If a block is given, the underlying ParserContext object will be yielded. This can be used
        # to set options on the parser context before parsing begins.
        #
        def parse(input, &block)
          # Duck-type check: anything that can be both read and closed is treated as an IO.
          if input.respond_to?(:read) && input.respond_to?(:close)
            parse_io(input, &block)
          else
            parse_memory(input, &block)
          end
        end

        ###
        # :call-seq:
        #   parse_io(io) { |parser_context| ... }
        #   parse_io(io, encoding) { |parser_context| ... }
        #
        # Parse an input stream.
        #
        # [Parameters]
        # - +io+ (IO) The readable IO object from which to read input
        # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
        #   parsing the input, or +nil+ for auto-detection. (default #encoding)
        #
        # [Yields]
        # If a block is given, the underlying ParserContext object will be yielded. This can be used
        # to set options on the parser context before parsing begins.
        #
        def parse_io(io, encoding = @encoding)
          ctx = related_class("ParserContext").io(io, encoding)
          yield ctx if block_given?
          ctx.parse_with(self)
        end

        ###
        # :call-seq:
        #   parse_memory(input) { |parser_context| ... }
        #   parse_memory(input, encoding) { |parser_context| ... }
        #
        # Parse an input string.
        #
        # [Parameters]
        # - +input+ (String) The input string to be parsed.
        # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
        #   parsing the input, or +nil+ for auto-detection. (default #encoding)
        #
        # [Yields]
        # If a block is given, the underlying ParserContext object will be yielded. This can be used
        # to set options on the parser context before parsing begins.
        #
        def parse_memory(input, encoding = @encoding)
          ctx = related_class("ParserContext").memory(input, encoding)
          yield ctx if block_given?
          ctx.parse_with(self)
        end

        ###
        # :call-seq:
        #   parse_file(filename) { |parser_context| ... }
        #   parse_file(filename, encoding) { |parser_context| ... }
        #
        # Parse a file.
        #
        # [Parameters]
        # - +filename+ (String) The path to the file to be parsed.
        # - +encoding+ (optional Encoding, String, nil) An Encoding or encoding name to use when
        #   parsing the input, or +nil+ for auto-detection. (default #encoding)
        #
        # [Yields]
        # If a block is given, the underlying ParserContext object will be yielded. This can be used
        # to set options on the parser context before parsing begins.
        #
        # [Raises]
        # - ArgumentError if +filename+ is +nil+ or +false+.
        # - Errno::ENOENT if no file exists at +filename+.
        # - Errno::EISDIR if +filename+ refers to a directory.
        #
        def parse_file(filename, encoding = @encoding)
          raise ArgumentError, "no filename provided" unless filename

          # Pass the path along so the error message names the offending file. +to_s+ is used
          # because SystemCallError expects a String and path-like objects (e.g. Pathname) do
          # not convert implicitly.
          raise Errno::ENOENT, filename.to_s unless File.exist?(filename)
          raise Errno::EISDIR, filename.to_s if File.directory?(filename)

          ctx = related_class("ParserContext").file(filename, encoding)
          yield ctx if block_given?
          ctx.parse_with(self)
        end
      end
    end
  end
end