lib/pdf/reader/xref.rb



# coding: utf-8
# typed: true
# frozen_string_literal: true

################################################################################
#
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################

class PDF::Reader
  ################################################################################
  # An internal PDF::Reader class that represents the XRef table in a PDF file as a
  # hash-like object.
  #
  # An Xref table is a map of object identifiers and byte offsets. Any time a particular
  # object needs to be found, the Xref table is used to find where it is stored in the
  # file.
  #
  # Hash keys are object ids, values are either:
  #
  # * a byte offset where the object starts (regular PDF objects)
  # * a PDF::Reader::Reference instance that points to a stream that contains the
  #   desired object (PDF objects embedded in an object stream)
  #
  # The class behaves much like a standard Ruby hash, including the use of
  # the Enumerable mixin. The key difference is no []= method - the hash
  # is read only.
  #
  class XRef
    include Enumerable
    attr_reader :trailer

    ################################################################################
    # create a new Xref table based on the contents of the supplied io object
    #
    # io - must be an IO object, generally either a file or a StringIO
    #
    def initialize(io)
      # The lookup table and io handle must be in place before load_offsets runs,
      # because loading the offsets reads from @io and writes into @xref.
      @xref = {}
      @io = io

      # calc_junk_offset returns nil when no header marker is detected; treat
      # that the same as "no junk before the PDF data".
      detected_junk = calc_junk_offset(io)
      @junk_offset = detected_junk || 0

      # load_offsets populates @xref as a side effect and returns the trailer.
      @trailer = load_offsets
    end

    ################################################################################
    # return the number of objects in this file. Objects with multiple generations are
    # only counted once.
    def size
      # one top-level key per object ID, regardless of how many generations it has
      @xref.length
    end
    ################################################################################
    # returns the byte offset for the specified PDF object.
    #
    # ref - a PDF::Reader::Reference object containing an object ID and revision number
    def [](ref)
      # First level is keyed by object ID, second level by generation number.
      generations = @xref.fetch(ref.id, {})
      generations.fetch(ref.gen)
    rescue StandardError
      # Any lookup failure (unknown ID or unknown generation) is reported uniformly.
      raise InvalidObjectError, "Object #{ref.id}, Generation #{ref.gen} is invalid"
    end
    ################################################################################
    # iterate over each object in the xref table
    def each(&block)
      # Without a block, hand back an Enumerator instead of failing inside yield.
      return enum_for(:each) unless block_given?

      # Yields one PDF::Reader::Reference per object ID, in ascending ID order.
      @xref.keys.sort.each do |id|
        # only the highest generation number recorded for this ID is reported
        newest_gen = @xref.fetch(id, {}).keys.max
        yield PDF::Reader::Reference.new(id, newest_gen.to_i)
      end
    end
    ################################################################################
    private
    ################################################################################
    # Read a xref table from the underlying buffer.
    #
    # If offset is specified the table will be loaded from there, otherwise the
    # default offset will be located and used.
    #
    # After seeking to the offset, processing is handed off to either load_xref_table()
    # or load_xref_stream() based on what we find there.
    #
    def load_offsets(offset = nil)
      offset ||= new_buffer.find_first_xref_offset
      offset += @junk_offset

      # Guard against circular /Prev or /XRefStm chains in malformed files, which
      # would otherwise recurse until the stack overflows. Skipping a repeat visit
      # loses nothing: store() never overwrites an existing entry, so re-reading a
      # section could not add anything new. An empty hash is returned for a repeat.
      @loaded_offsets ||= {}
      return {} if @loaded_offsets.key?(offset)
      @loaded_offsets[offset] = true

      buf = new_buffer(offset)
      tok_one = buf.token

      # we have a traditional xref table
      # NOTE(review): "ref" is accepted as well as "xref" - presumably to tolerate an
      # offset that lands one byte into the keyword; confirm against real-world files
      return load_xref_table(buf) if tok_one == "xref" || tok_one == "ref"

      tok_two   = buf.token
      tok_three = buf.token

      # we have an XRef stream, introduced by "<id> <gen> obj"
      if tok_one.to_i >= 0 && tok_two.to_i >= 0 && tok_three == "obj"
        # re-open the buffer at the same offset so the parser sees the whole object
        buf = new_buffer(offset)
        # Maybe we should be parsing the ObjectHash second argument to the Parser here,
        # to handle the case where an XRef Stream has the Length specified via an
        # indirect object
        stream = PDF::Reader::Parser.new(buf).object(tok_one.to_i, tok_two.to_i)
        return load_xref_stream(stream)
      end

      raise PDF::Reader::MalformedPDFError,
        "xref table not found at offset #{offset} (#{tok_one} != xref)"
    end
    ################################################################################
    # Assumes the underlying buffer is positioned at the start of a traditional
    # Xref table and processes it into memory.
    def load_xref_table(buf)
      pending = []

      # Collect tokens until the "trailer" keyword is seen or the buffer runs dry
      # (a nil token).
      until pending.include?("trailer") || pending.include?(nil)
        if pending.size == 2
          # Two pending tokens form a subsection header: first object ID + entry count.
          first_id, entry_count = pending
          unless first_id.to_s.match(/\A\d+\z/)
            raise MalformedPDFError, "invalid xref table, expected object ID"
          end

          objid = first_id.to_i
          entry_count.to_i.times do
            offset     = buf.token.to_i
            generation = buf.token.to_i
            state      = buf.token

            # Some PDF writers start numbering at 1 instead of 0. Fix up the number.
            # TODO should this fix be logged?
            misnumbered_head = objid == 1 && offset == 0 && generation == 65535 && state == 'f'
            objid = 0 if misnumbered_head

            # only in-use ("n") entries with a non-zero offset are recorded
            store(objid, generation, offset + @junk_offset) if state == "n" && offset > 0
            objid += 1

            # NOTE: the header is only cleared once an entry has been read, so a
            # zero-count subsection leaves it pending and every following token
            # is simply accumulated until the trailer.
            pending.clear
          end
        end
        pending << buf.token
      end

      trailer = Parser.new(buf).parse_token

      unless trailer.is_a?(Hash)
        raise MalformedPDFError, "PDF malformed, trailer should be a dictionary"
      end

      load_offsets(trailer[:XRefStm]) if trailer.key?(:XRefStm)

      # Some PDF creators seem to use '/Prev 0' in trailer if there is no previous xref
      # It's not possible for an xref to appear at offset 0, so can safely skip the ref
      if trailer.key?(:Prev)
        previous_offset = trailer[:Prev].to_i
        load_offsets(previous_offset) unless previous_offset == 0
      end

      trailer
    end

    ################################################################################
    # Read an XRef stream from the underlying buffer instead of a traditional xref table.
    #
    def load_xref_stream(stream)
      unless stream.is_a?(PDF::Reader::Stream) && stream.hash[:Type] == :XRef
        raise PDF::Reader::MalformedPDFError, "xref stream not found when expected"
      end

      # The stream dictionary doubles as the trailer; keep only the trailer keys.
      trailer = Hash[stream.hash.select { |key, value|
        [:Size, :Prev, :Root, :Encrypt, :Info, :ID].include?(key)
      }]

      widths = stream.hash[:W]

      PDF::Reader::Error.validate_type_as_malformed(widths, "xref stream widths", Array)

      # Normalise the three field widths once, treating anything that is not an
      # Integer (including a missing element) as zero. The same rule is used for
      # the total entry length, so slicing below can never see a non-integer width.
      type_width, second_width, third_width = (0..2).map { |i|
        widths[i].is_a?(Integer) ? widths[i] : 0
      }
      entry_length = widths.inject(0) { |sum, w| w.is_a?(Integer) ? sum + w : sum }

      raw_data = StringIO.new(stream.unfiltered_data)

      # /Index lists (first id, count) pairs; without it the stream covers 0...Size.
      index = stream.hash[:Index] || [0, stream.hash[:Size]]

      index.each_slice(2) do |start_id, size|
        unless start_id.is_a?(Integer) && size.is_a?(Integer)
          raise PDF::Reader::MalformedPDFError, "xref stream has an invalid Index or Size entry"
        end

        (start_id..(start_id + (size - 1))).each do |objid|
          # a short read yields empty fields, which unpack to zero below
          entry = raw_data.read(entry_length) || ""

          # Per the PDF spec, a zero-width type field means the type is absent and
          # defaults to 1 (an in-use, uncompressed object).
          f1 = type_width == 0 ? 1 : unpack_bytes(entry[0, type_width])
          f2 = unpack_bytes(entry[type_width, second_width])
          f3 = unpack_bytes(entry[type_width + second_width, third_width])

          if f1 == 1 && f2 > 0
            # type 1: f2 is the byte offset, f3 the generation number
            store(objid, f3, f2 + @junk_offset)
          elsif f1 == 2 && f2 > 0
            # type 2: the object lives inside object stream number f2
            store(objid, 0, PDF::Reader::Reference.new(f2, 0))
          end
        end
      end

      # Some PDF creators use '/Prev 0' when there is no previous xref section.
      # Nothing can live at offset 0, so skip it (same rule as traditional tables).
      if trailer.has_key?(:Prev)
        previous_offset = trailer[:Prev].to_i
        load_offsets(previous_offset) unless previous_offset == 0
      end

      trailer
    end
    ################################################################################
    # XRef streams pack info into integers 1-N bytes wide. Depending on the number of
    # bytes they need to be converted to an int in different ways.
    #
    def unpack_bytes(bytes)
      # Interpret the field as an unsigned big-endian integer of any width by
      # folding it one byte at a time. This gives the same results as the fixed
      # 1, 2, 3, 4 and 8 byte unpack formats and also covers other widths such
      # as 5, 6 or 7 bytes. nil or an empty string yields 0.
      bytes.to_s.each_byte.inject(0) { |value, byte| (value << 8) | byte }
    end
    ################################################################################
    # Wrap the io stream we're working with in a buffer that can tokenise it for us.
    #
    # We create multiple buffers so we can be tokenising multiple sections of the file
    # at the same time without worrying about clearing the buffer's contents.
    #
    def new_buffer(offset = 0)
      # every call builds an independent buffer over the shared io, starting at +offset+
      PDF::Reader::Buffer.new(@io, seek: offset)
    end
    ################################################################################
    # Stores an offset value for a particular PDF object ID and revision number
    #
    def store(id, gen, offset)
      # lazily create the per-object generation table
      @xref[id] = {} unless @xref[id]
      by_generation = @xref[id]

      # first write wins: an existing entry for this generation is never replaced
      by_generation[gen] || (by_generation[gen] = offset)
    end
    ################################################################################
    # Returns the offset of the PDF document in the +stream+. In theory this
    # should always be 0, but all sort of crazy junk is prefixed to PDF files
    # in the real world.
    #
    # Checks up to 1024 chars into the file,
    # returns nil if no PDF data detected.
    # Adobe PDF 1.4 spec (3.4.1) 12. Acrobat viewers require only that the
    # header appear somewhere within the first 1024 bytes of the file
    #
    def calc_junk_offset(io)
      io.rewind
      # Reading with an explicit length returns raw bytes, so the index found below
      # is a byte offset even when the io is in a multibyte text encoding (a UTF-8
      # BOM, for example, counts as three bytes rather than one character).
      header_window = io.read(1024)
      # Leave the io at the start whether or not a marker was found.
      io.rewind
      return nil if header_window.nil?

      # Position of the first '%' within the first 1024 bytes, or nil if none.
      header_window.b.index("%")
    rescue EOFError
      nil
    end
  end
  ################################################################################
end
################################################################################