lib/pdf/reader/object_hash.rb



# coding: utf-8
# typed: true
# frozen_string_literal: true

require 'tempfile'

class PDF::Reader
  # Provides low level access to the objects in a PDF file via a hash-like
  # object.
  #
  # A PDF file can be viewed as a large hash map. It is a series of objects
  # stored at precise byte offsets, and a table that maps object IDs to byte
  # offsets. Given an object ID, looking up an object is an O(1) operation.
  #
  # Each PDF object can be mapped to a ruby object, so by passing an object
  # ID to the [] method, a ruby representation of that object will be
  # retrieved.
  #
  # The class behaves much like a standard Ruby hash, including the use of
  # the Enumerable mixin. The key difference is no []= method - the hash
  # is read only.
  #
  # == Basic Usage
  #
  #     h = PDF::Reader::ObjectHash.new("somefile.pdf")
  #     h[1]
  #     => 3469
  #
  #     h[PDF::Reader::Reference.new(1,0)]
  #     => 3469
  #
  class ObjectHash
    include Enumerable

    attr_accessor :default
    attr_reader :trailer, :pdf_version
    attr_reader :sec_handler

    # Creates a new ObjectHash object. Input can be a string with a valid filename
    # or an IO-like object.
    #
    # Valid options:
    #
    #   :password - the user password to decrypt the source PDF
    #   :cache    - an object used to memoise parsed objects. When omitted, a new
    #               PDF::Reader::ObjectCache is created.
    #
    def initialize(input, opts = {})
      @io          = extract_io_from(input)
      @xref        = PDF::Reader::XRef.new(@io)
      @pdf_version = read_version
      @trailer     = @xref.trailer
      @cache       = opts[:cache] || PDF::Reader::ObjectCache.new
      # A placeholder handler is installed first: the deref calls below go through
      # #[] and, for objects parsed from an xref byte offset, through #decrypt,
      # which calls sec_handler before the real handler has been built.
      @sec_handler = NullSecurityHandler.new
      @sec_handler = SecurityHandlerFactory.build(
        deref(trailer[:Encrypt]),
        deref(trailer[:ID]),
        opts[:password]
      )
    end

    # Returns the class name of the object that ref points to, as a Symbol
    # (e.g. :Hash or :"PDF::Reader::Stream").
    #
    # Any StandardError raised while looking the object up is swallowed and
    # nil is returned instead.
    def obj_type(ref)
      begin
        target = self[ref]
        target.class.to_s.to_sym
      rescue StandardError
        nil
      end
    end

    # Returns true if the supplied reference points to an object that exists in
    # the file and is a PDF::Reader::Stream, false otherwise.
    def stream?(ref)
      return false unless has_key?(ref)

      self[ref].is_a?(PDF::Reader::Stream)
    end

    # Access an object from the PDF. key can be an int or a
    # PDF::Reader::Reference object.
    #
    # An int is treated as an object ID with a generation number of 0. A
    # PDF::Reader::Reference allows the exact ID and generation number to be
    # specified.
    #
    # Returns the value of #default when key.to_i is zero or negative, or when
    # the lookup raises InvalidObjectError. Successful lookups are memoised in
    # the cache.
    #
    def [](key)
      return default if key.to_i <= 0

      ref = if key.is_a?(PDF::Reader::Reference)
              key
            else
              PDF::Reader::Reference.new(key.to_i, 0)
            end

      # try a traditional object first, then fall back to an object stream
      @cache[ref] ||= (fetch_object(ref) || fetch_object_stream(ref))
    rescue InvalidObjectError
      default
    end

    # Resolves key when it is a PDF::Reader::Reference by looking the
    # referenced object up in the PDF. Any other value is returned untouched.
    #
    # Also available as #deref.
    #
    def object(key)
      return key unless key.is_a?(PDF::Reader::Reference)

      self[key]
    end
    alias :deref :object

    # Dereferences key (see #deref) and insists that the result is an Array.
    #
    # Returns the Array, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError for any other type. Useful when expecting an
    # Array and no other type will do.
    def deref_array(key)
      value = deref(key)
      return value if value.nil?

      unless value.is_a?(Array)
        raise MalformedPDFError, "expected object to be an Array or nil"
      end

      value
    end

    # Dereferences key (see #deref) and insists that the result is an Array of
    # numbers.
    #
    # Returns a new Array of Numerics, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError when the value is not an Array, or when an element
    # cannot be converted to a number.
    #
    # Non-numeric elements are cast with to_f when available, otherwise with to_i.
    def deref_array_of_numbers(key)
      values = deref(key)
      return values if values.nil?

      unless values.is_a?(Array)
        raise MalformedPDFError, "expected object to be an Array"
      end

      values.map do |element|
        next element if element.is_a?(Numeric)
        next element.to_f if element.respond_to?(:to_f)
        next element.to_i if element.respond_to?(:to_i)

        raise MalformedPDFError, "expected object to be a number"
      end
    end

    # Dereferences key (see #deref) and insists that the result is a Hash.
    #
    # Returns the Hash, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError for any other type. Useful when expecting a
    # Hash and no other type will do.
    def deref_hash(key)
      value = deref(key)
      return value if value.nil?

      unless value.is_a?(Hash)
        raise MalformedPDFError, "expected object to be a Hash or nil"
      end

      value
    end

    # Dereferences key (see #deref) and insists that the result is a PDF name.
    #
    # Returns a Symbol, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError when the value is not a Symbol and cannot be
    # converted to one. Useful when expecting a Name and no other type will do.
    #
    # Non-symbol values are cast with to_sym when they respond to it.
    def deref_name(key)
      value = deref(key)
      return value if value.nil? || value.is_a?(Symbol)

      unless value.respond_to?(:to_sym)
        raise MalformedPDFError, "expected object to be a Name"
      end

      value.to_sym
    end

    # Dereferences key (see #deref) and insists that the result is an Integer.
    #
    # Returns an Integer, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError when the value is not an Integer and cannot be
    # converted to one. Useful when expecting an Integer and no other type
    # will do.
    #
    # Non-integer values are cast with to_i when they respond to it.
    def deref_integer(key)
      value = deref(key)
      return value if value.nil? || value.is_a?(Integer)

      unless value.respond_to?(:to_i)
        raise MalformedPDFError, "expected object to be an Integer"
      end

      value.to_i
    end

    # Dereferences key (see #deref) and insists that the result is a number.
    #
    # Returns a Numeric, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError when the value is not a Numeric and cannot be
    # converted to one. Useful when expecting a number and no other type
    # will do.
    #
    # Non-numeric values are cast with to_f when available, otherwise with to_i.
    def deref_number(key)
      obj = deref(key)

      return obj if obj.nil? || obj.is_a?(Numeric)

      if obj.respond_to?(:to_f)
        obj.to_f
      elsif obj.respond_to?(:to_i)
        # the result of the cast must be returned, not discarded, otherwise the
        # original non-numeric object leaks out to the caller
        obj.to_i
      else
        raise MalformedPDFError, "expected object to be a number"
      end
    end

    # Dereferences key (see #deref) and insists that the result is a
    # PDF::Reader::Stream.
    #
    # Returns the Stream, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError for any other type. Useful when expecting a
    # stream and no other type will do.
    def deref_stream(key)
      value = deref(key)
      return value if value.nil?

      unless value.is_a?(PDF::Reader::Stream)
        raise MalformedPDFError, "expected object to be a Stream or nil"
      end

      value
    end

    # Dereferences key (see #deref) and insists that the result is a String.
    #
    # Returns a String, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError when the value is not a String and does not
    # respond to to_s. Useful when expecting a string and no other type will do.
    #
    # Non-string values are cast with to_s when they respond to it.
    def deref_string(key)
      value = deref(key)
      return value if value.nil? || value.is_a?(String)

      unless value.respond_to?(:to_s)
        raise MalformedPDFError, "expected object to be a string"
      end

      value.to_s
    end

    # Dereferences key (see #deref) and insists that the result is either a
    # PDF Name (Symbol) or an Array.
    #
    # Returns the Symbol or Array, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError for any other type. Useful when expecting a
    # Name or Array and no other type will do.
    def deref_name_or_array(key)
      value = deref(key)
      return value if value.nil?

      unless value.is_a?(Symbol) || value.is_a?(Array)
        raise MalformedPDFError, "expected object to be an Array or Name"
      end

      value
    end

    # Dereferences key (see #deref) and insists that the result is either a
    # PDF::Reader::Stream or an Array.
    #
    # Returns the Stream or Array, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError for any other type. Useful when expecting a
    # stream or Array and no other type will do.
    def deref_stream_or_array(key)
      value = deref(key)
      return value if value.nil?

      unless value.is_a?(PDF::Reader::Stream) || value.is_a?(Array)
        raise MalformedPDFError, "expected object to be an Array or Stream"
      end

      value
    end

    # Recursively dereferences the object referred to by +key+. If +key+ is not
    # a PDF::Reader::Reference, the key itself is walked for nested references.
    #
    # Each call starts with a fresh, empty record of already-visited objects.
    #
    def deref!(key)
      visited = {}
      deref_internal!(key, visited)
    end

    # Recursively dereferences key (see #deref!) and insists that the result is
    # an Array.
    #
    # Returns the Array, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError for any other type.
    def deref_array!(key)
      resolved = deref!(key)

      unless resolved.nil? || resolved.is_a?(Array)
        raise MalformedPDFError, "expected object (#{resolved.inspect}) to be an Array or nil"
      end

      resolved
    end

    # Recursively dereferences key (see #deref!) and insists that the result is
    # a Hash.
    #
    # Returns the Hash, or nil when the dereferenced value is nil.
    # Raises MalformedPDFError for any other type.
    def deref_hash!(key)
      resolved = deref!(key)

      unless resolved.nil? || resolved.is_a?(Hash)
        raise MalformedPDFError, "expected object (#{resolved.inspect}) to be a Hash or nil"
      end

      resolved
    end

    # Access an object from the PDF. key can be an int or a
    # PDF::Reader::Reference object.
    #
    # An int is treated as an object ID with a generation number of 0. A
    # PDF::Reader::Reference allows the exact ID and generation number to be
    # specified.
    #
    # local_default is returned when the lookup yields nothing and
    # local_default is truthy.
    #
    # Raises IndexError only when nothing was found, no local_default was given
    # and key.to_i is zero or negative. A positive key that is simply missing
    # returns nil.
    #
    def fetch(key, local_default = nil)
      found = self[key]
      return found if found
      return local_default if local_default

      raise IndexError, "#{key} is invalid" if key.to_i <= 0
    end

    # Iterate over each key/value pair, just like a ruby hash. The key is the
    # reference yielded by the xref table and the value is the object it
    # points to, loaded via #[].
    #
    # Also available as #each_pair.
    #
    def each(&block)
      @xref.each { |ref| yield(ref, self[ref]) }
    end
    alias :each_pair :each

    # Iterate over each key, just like a ruby hash. Built on #each, so the
    # value for every key is still loaded even though it is not yielded.
    #
    def each_key(&block)
      each { |ref, _obj| yield ref }
    end

    # Iterate over each value, just like a ruby hash. Built on #each.
    #
    def each_value(&block)
      each { |_ref, obj| yield obj }
    end

    # return the number of objects in the file. An object with multiple generations
    # is counted once.
    #
    # The count is delegated to the xref table's own #size. Also available as
    # #length.
    def size
      xref.size
    end
    alias :length :size

    # Returns true if there are no objects in this file, false otherwise.
    #
    def empty?
      size == 0
    end

    # Returns true if the specified key exists in the file. key can be an int
    # or a PDF::Reader::Reference.
    #
    # A Reference must match an xref entry exactly; any other value is
    # compared, via to_i, against the object ID of each entry.
    #
    # Also available as #include?, #key? and #member?.
    #
    def has_key?(check_key)
      # TODO update from O(n) to O(1)
      by_reference = check_key.kind_of?(PDF::Reader::Reference)

      # Walk the xref table directly rather than via each_key: each_key is built
      # on each, which loads and parses every object just to hand over its key.
      xref.each do |ref|
        matched = by_reference ? check_key == ref : check_key.to_i == ref.id
        return true if matched
      end

      false
    end
    alias :include? :has_key?
    alias :key? :has_key?
    alias :member? :has_key?

    # Returns true if the specified value exists in the file. Every object is
    # loaded and compared with ==, so this is an O(n) scan.
    #
    # Also available as #value?.
    #
    def has_value?(value)
      each_value do |obj|
        return true if obj == value
      end
      false
    end
    # value? must mirror has_value?, matching Hash#value?; aliasing it to
    # has_key? made it test keys instead of values.
    alias :value? :has_value?

    # Returns a short, human readable summary that includes the object count.
    def to_s
      count = size
      "<PDF::Reader::ObjectHash size: #{count}>"
    end

    # Returns an Array of all keys in the file, in the order #each_key yields
    # them.
    #
    def keys
      [].tap { |collected| each_key { |ref| collected << ref } }
    end

    # Returns an Array of all values in the file, in the order #each_value
    # yields them.
    #
    def values
      [].tap { |collected| each_value { |obj| collected << obj } }
    end

    # Returns an Array holding the value for each of the specified keys, in the
    # order the keys were given. Each key is looked up with #[].
    #
    def values_at(*ids)
      ids.each_with_object([]) { |id, found| found << self[id] }
    end

    # Returns an Array of two-element Arrays, each holding a key and the value
    # it points to.
    #
    def to_a
      pairs = []
      each { |ref, obj| pairs.push([ref, obj]) }
      pairs
    end

    # Returns an Array of PDF::Reader::References. Each reference in the
    # array points to a Page object, one for each page in the PDF. The first
    # reference is page 1, the second reference is page 2, etc.
    #
    # Useful for apps that want to extract data from specific pages.
    #
    # The result is memoised. The document root is only looked up while the
    # list is being built, so later calls do no work.
    #
    def page_references
      @page_references ||= begin
                             root       = fetch(trailer[:Root])
                             pages_root = deref_hash(root[:Pages]) || {}
                             get_page_objects(pages_root)
                           end
    end

    # returns true if the file trailer contains an :Encrypt entry
    def encrypted?
      trailer.has_key?(:Encrypt)
    end

    # Returns true if a security handler is set (sec_handler is truthy), false
    # otherwise.
    def sec_handler?
      sec_handler ? true : false
    end

    private

    # Parse a traditional object from the PDF, starting from the byte offset
    # that the xref table holds for key.
    #
    # Returns the parsed and decrypted object, or nil when the xref entry for
    # key is not an Integer offset.
    #
    def fetch_object(key)
      offset = xref[key]
      return nil unless offset.is_a?(Integer)

      buffer = new_buffer(offset)
      parsed = Parser.new(buffer, self).object(key.id, key.gen)
      decrypt(key, parsed)
    end

    # Parse an object that's embedded in an object stream in the PDF.
    #
    # The xref entry for such an object is a PDF::Reader::Reference to the
    # containing stream. Returns nil when the xref entry is anything else.
    # Raises MalformedPDFError when the containing stream dereferences to nil.
    # Each containing stream is wrapped in an ObjectStream once and reused.
    #
    def fetch_object_stream(key)
      container_key = xref[key]
      return nil unless container_key.is_a?(PDF::Reader::Reference)

      stream = deref_stream(container_key)
      raise MalformedPDFError, "Object Stream cannot be nil" if stream.nil?

      container = (object_streams[container_key] ||= PDF::Reader::ObjectStream.new(stream))
      container[key.id]
    end

    # Private implementation of deref!, which exists to ensure the `seen` argument
    # isn't publicly available. It's used to avoid endless loops in the recursion, and
    # doesn't need to be part of the public API.
    #
    # key  - a PDF::Reader::Reference, or any already-resolved value
    # seen - a Hash mapping each visited key to the dereferenced copy built for it
    #
    # Returns a new Hash, Array or PDF::Reader::Stream with nested values
    # dereferenced. Any other kind of object is returned as is.
    #
    def deref_internal!(key, seen)
      # References are tracked by value; inline (non-reference) values are tracked
      # by their object_id.
      seen_key = key.is_a?(PDF::Reader::Reference) ? key : key.object_id

      # Already visited (or currently being built further up the call stack):
      # hand back the existing copy rather than recursing again.
      return seen[seen_key] if seen.key?(seen_key)

      case object = deref(key)
      when Hash
        # The empty container is registered in `seen` *before* its children are
        # walked, so a child that points back at this object finds the entry above.
        seen[seen_key] ||= {}
        object.each do |k, value|
          seen[seen_key][k] = deref_internal!(value, seen)
        end
        seen[seen_key]
      when PDF::Reader::Stream
        # Build a fresh Stream around the same data, then fill in its dictionary
        # with dereferenced values.
        seen[seen_key] ||= PDF::Reader::Stream.new({}, object.data)
        object.hash.each do |k,value|
          seen[seen_key].hash[k] = deref_internal!(value, seen)
        end
        seen[seen_key]
      when Array
        seen[seen_key] ||= []
        object.each do |value|
          seen[seen_key] << deref_internal!(value, seen)
        end
        seen[seen_key]
      else
        # Scalars and anything else are not copied or recorded in `seen`.
        object
      end
    end

    # Decrypts obj using the current security handler.
    #
    # ref - the reference of the top level object that obj belongs to; passed
    #       to the security handler along with the data to decrypt
    # obj - the parsed object to decrypt
    #
    # Streams have their data replaced in place and are returned. Hashes and
    # Arrays are rebuilt with every nested value decrypted. Strings are
    # decrypted directly. Anything else is returned untouched.
    def decrypt(ref, obj)
      if obj.is_a?(PDF::Reader::Stream)
        # PDF 32000-1:2008 7.5.8.2: "The cross-reference stream shall not be encrypted [...]."
        # Therefore we shouldn't try to decrypt it.
        unless obj.hash[:Type] == :XRef
          obj.data = sec_handler.decrypt(obj.data, ref)
        end
        obj
      elsif obj.is_a?(Hash)
        obj.each_with_object({}) do |(name, value), decrypted|
          decrypted[name] = decrypt(ref, value)
        end
      elsif obj.is_a?(Array)
        obj.map { |item| decrypt(ref, item) }
      elsif obj.is_a?(String)
        sec_handler.decrypt(obj, ref)
      else
        obj
      end
    end

    # Builds a new PDF::Reader::Buffer over the underlying IO, asking it to
    # seek to the given byte offset (0 by default).
    def new_buffer(offset = 0)
      PDF::Reader::Buffer.new(@io, seek: offset)
    end

    # private reader for the PDF::Reader::XRef table built in #initialize
    def xref
      @xref
    end

    # lazily created Hash used by fetch_object_stream to hold one
    # PDF::Reader::ObjectStream per containing stream reference
    def object_streams
      @object_streams ||= {}
    end

    # Returns an Array of object references for all pages below obj in the page
    # tree. The ordering of the Array is significant and matches the page
    # ordering of the document.
    #
    # obj - a Page or Pages dictionary, or a reference to one
    #
    # Raises MalformedPDFError when obj dereferences to nil, or to a Hash that
    # is neither a Page nor has a :Kids entry.
    #
    def get_page_objects(obj)
      node = deref_hash(obj)

      raise MalformedPDFError, "Expected Page or Pages object, got nil" if node.nil?

      # a leaf: return the original (possibly still a reference) object
      return [obj] if node[:Type] == :Page

      raise MalformedPDFError, "Expected Page or Pages object" unless node[:Kids]

      children = deref_array(node[:Kids]) || []
      children.map { |child| get_page_objects(child) }.flatten
    end

    # Reads the PDF version from the file header.
    #
    # Looks for "PDF-<digit>.<digit>" in the first 10 bytes of the IO and
    # returns the version as a Float (e.g. 1.4). Returns 0.0 when no version
    # marker is found. The IO is rewound to the start before and after reading.
    def read_version
      @io.seek(0)
      header = @io.read(10).to_s # read returns nil for an empty IO
      @io.seek(0)
      # The dot is escaped so that only a literal "." separates the two digits;
      # unescaped it would accept any character (e.g. "PDF-1x4").
      header[/PDF-(\d\.\d)/, 1].to_f
    end

    # Turns the constructor input into an IO-like object.
    #
    # An IO, StringIO or Tempfile is returned as is. Otherwise input.to_s is
    # treated as a filename: the file is read in binary mode and wrapped in a
    # StringIO.
    #
    # Raises ArgumentError when input is neither IO-like nor the path of an
    # existing file.
    def extract_io_from(input)
      case input
      when IO, StringIO, Tempfile
        return input
      end

      unless File.file?(input.to_s)
        raise ArgumentError, "input must be an IO-like object or a filename (#{input.class})"
      end

      StringIO.new read_as_binary(input.to_s)
    end

    # Reads the whole file at the given path in binary mode and returns its
    # contents as a String. Uses File.binread when available, falling back to
    # opening the file with mode "rb".
    def read_as_binary(input)
      path = input.to_s
      return File.binread(path) if File.respond_to?(:binread)

      File.open(path, "rb") { |f| f.read }
    end

  end
end