class HexaPDF::Parser
Parses an IO stream according to PDF2.0 to get at the contained objects.
This class also contains higher-level methods for getting indirect objects and revisions.
See: PDF2.0 s7
def file_header_version
Returns the PDF version number that is stored in the file header.
def file_header_version
  unless @header_version
    raise_malformed("PDF file header is missing or corrupt", pos: 0)
  end
  @header_version
end
def initialize(io, document)
Creates a new parser for the given IO object; indirect references are resolved through the given PDF document.
def initialize(io, document)
  @io = io
  on_correctable_error = document.config['parser.on_correctable_error'].curry[document]
  @tokenizer = Tokenizer.new(io, on_correctable_error: on_correctable_error)
  @document = document
  @object_stream_data = {}
  @reconstructed_revision = nil
  @in_reconstruct_revision = false
  retrieve_pdf_header_offset_and_version
end
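A minimal usage sketch, assuming an existing PDF file at 'input.pdf' and that the parser is driven by hand instead of through the usual HexaPDF::Document.open interface:

  require 'hexapdf'

  doc = HexaPDF::Document.new
  io = File.open('input.pdf', 'rb')
  parser = HexaPDF::Parser.new(io, doc)

  parser.file_header_version  # => e.g. "1.7"
  parser.startxref_offset     # => byte offset of the main cross-reference section/stream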
def linearized?
def linearized?
  @linearized ||= begin
      @tokenizer.pos = @header_offset
      3.times { @tokenizer.next_token } # parse: oid gen obj
      obj = @tokenizer.next_object
      obj.kind_of?(Hash) && obj.key?(:Linearized)
    rescue MalformedPDFError
      false
    end
end
def load_compressed_object(xref_entry)
def load_compressed_object(xref_entry)
  unless @object_stream_data.key?(xref_entry.objstm)
    obj = @document.object(xref_entry.objstm)
    unless obj.respond_to?(:parse_stream)
      raise_malformed("Object with oid=#{xref_entry.objstm} is not an object stream")
    end
    @object_stream_data[xref_entry.objstm] = obj.parse_stream
  end
  [*@object_stream_data[xref_entry.objstm].object_by_index(xref_entry.pos), xref_entry.gen, nil]
end
def load_object(xref_entry)
Loads the indirect (potentially compressed) object specified by the given cross-reference entry.
For information about the +xref_entry+ argument, have a look at HexaPDF::XRefSection and HexaPDF::XRefSection::Entry.
def load_object(xref_entry)
  obj, oid, gen, stream =
    case xref_entry.type
    when :in_use
      if xref_entry.pos == 0 && xref_entry.oid != 0
        # Handle seen-in-the-wild objects with invalid offset 0
        maybe_raise("Indirect object (#{xref_entry.oid},#{xref_entry.gen}) has offset 0", pos: 0)
        [nil, xref_entry.oid, xref_entry.gen, nil]
      else
        parse_indirect_object(xref_entry.pos)
      end
    when :free
      [nil, xref_entry.oid, xref_entry.gen, nil]
    when :compressed
      load_compressed_object(xref_entry)
    else
      raise_malformed("Invalid cross-reference type '#{xref_entry.type}' encountered")
    end

  if xref_entry.oid != 0 && (oid != xref_entry.oid || gen != xref_entry.gen)
    raise_malformed("The oid,gen (#{oid},#{gen}) values of the indirect object don't match " \
                    "the values (#{xref_entry.oid},#{xref_entry.gen}) from the xref")
  end

  @document.wrap(obj, oid: oid, gen: gen, stream: stream)
rescue HexaPDF::MalformedPDFError
  reconstructed_revision.object(xref_entry) ||
    @document.wrap(nil, oid: xref_entry.oid, gen: xref_entry.gen)
end
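A hedged sketch of loading a single object via a manually built cross-reference entry, using the +parser+ from the initialization sketch above (the oid, gen and byte offset are made up; normally the entry comes from a parsed cross-reference section):

  entry = HexaPDF::XRefSection.in_use_entry(4, 0, 1234)
  obj = parser.load_object(entry)
  obj.oid    # => 4
  obj.value  # => the parsed value, or nil if the object could not be loaded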
def load_revision(pos)
Loads a single revision whose cross-reference section/stream is located at the given position.
def load_revision(pos)
  if xref_section?(pos)
    xref_section, trailer = parse_xref_section_and_trailer(pos)
  else
    obj = load_object(XRefSection.in_use_entry(0, 0, pos))
    unless obj.respond_to?(:xref_section)
      raise_malformed("Object is not a cross-reference stream", pos: pos)
    end
    begin
      xref_section = obj.xref_section
    rescue MalformedPDFError => e
      e.pos = pos
      raise
    end
    trailer = obj.trailer
    unless xref_section.entry?(obj.oid, obj.gen)
      maybe_raise("Cross-reference stream doesn't contain entry for itself", pos: pos)
      xref_section.add_in_use_entry(obj.oid, obj.gen, pos)
    end
  end
  xref_section.delete(0)
  [xref_section, trailer]
end
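Continuing the sketch above, the most recent revision can be loaded from the offset reported by #startxref_offset (error handling omitted, output illustrative):

  xref_section, trailer = parser.load_revision(parser.startxref_offset)
  trailer[:Root]  # => reference to the document catalog
  xref_section.each {|oid, _gen, entry| puts "#{oid}: #{entry.type}" }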
def maybe_raise(msg, pos:, force: false)
Calls the block stored in the config option +parser.on_correctable_error+ with the document, the given message and the position.
If the returned value is +true+, raises a HexaPDF::MalformedPDFError. Otherwise the error is corrected and parsing continues.
def maybe_raise(msg, pos:, force: false)
  if force || @document.config['parser.on_correctable_error'].call(@document, msg, pos)
    error = HexaPDF::MalformedPDFError.new(msg, pos: pos)
    error.set_backtrace(caller(1))
    raise error
  end
end
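A sketch of customizing the correctable-error hook on a document; the block signature follows the description above, and the warning output is purely illustrative:

  doc = HexaPDF::Document.new
  doc.config['parser.on_correctable_error'] = lambda do |_document, msg, pos|
    warn("correctable PDF error at byte #{pos}: #{msg}")
    false  # returning true would raise HexaPDF::MalformedPDFError instead
  end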
def parse_indirect_object(offset = nil)
Parses the indirect object at the specified offset.
This method is used by a PDF Document to load objects. It should **not** be used by any other object because invalid object positions lead to errors.
Returns an array containing [object, oid, gen, stream].
def parse_indirect_object(offset = nil)
  @tokenizer.pos = offset + @header_offset if offset
  oid = @tokenizer.next_token
  gen = @tokenizer.next_token
  tok = @tokenizer.next_token
  unless oid.kind_of?(Integer) && gen.kind_of?(Integer) &&
      tok.kind_of?(Tokenizer::Token) && tok == 'obj'
    raise_malformed("No valid object found", pos: offset)
  end

  if (tok = @tokenizer.peek_token) && tok.kind_of?(Tokenizer::Token) && tok == 'endobj'
    maybe_raise("No indirect object value between 'obj' and 'endobj'", pos: @tokenizer.pos)
    object = nil
  else
    begin
      object = @tokenizer.next_object
    rescue MalformedPDFError
      if tok.kind_of?(Tokenizer::Token) && tok =~ /\A\d+endobj\z/
        # Handle often found invalid indirect object with missing whitespace after number
        maybe_raise("Missing whitespace after number", pos: @tokenizer.pos)
        object = tok.to_i
        @tokenizer.pos -= 6
      else
        maybe_raise("Invalid value after '#{oid} #{gen} obj', treating as null", pos: @tokenizer.pos)
        return [nil, oid, gen, nil]
      end
    end
  end

  tok = @tokenizer.next_token

  if tok.kind_of?(Tokenizer::Token) && tok == 'stream'
    unless object.kind_of?(Hash)
      raise_malformed("A stream needs a dictionary, not a(n) #{object.class}", pos: offset)
    end

    tok1 = @tokenizer.next_byte
    if tok1 == 32 # space
      maybe_raise("Keyword stream followed by space instead of LF or CR/LF", pos: @tokenizer.pos)
      tok1 = @tokenizer.next_byte
    end
    tok2 = @tokenizer.next_byte if tok1 == 13 # CR
    if tok1 != 10 && tok1 != 13
      raise_malformed("Keyword stream must be followed by LF or CR/LF", pos: @tokenizer.pos)
    elsif tok1 == 13 && tok2 != 10
      maybe_raise("Keyword stream must be followed by LF or CR/LF, not CR alone",
                  pos: @tokenizer.pos)
      @tokenizer.pos -= 1
    end

    # Note that getting :Length might move the IO pointer (when resolving references)
    pos = @tokenizer.pos
    length = if object[:Length].kind_of?(Integer)
               object[:Length]
             elsif object[:Length].kind_of?(Reference)
               @document.deref(object[:Length]).value
             else
               0
             end
    @tokenizer.pos = pos + length rescue pos

    tok = @tokenizer.next_token rescue nil
    unless tok.kind_of?(Tokenizer::Token) && tok == 'endstream'
      maybe_raise("Invalid stream length, keyword endstream not found", pos: @tokenizer.pos)
      @tokenizer.pos = pos
      if @tokenizer.scan_until(/(?=\n?endstream)/)
        length = @tokenizer.pos - pos
        tok = @tokenizer.next_token
      else
        raise_malformed("Stream content must be followed by keyword endstream", pos: @tokenizer.pos)
      end
    end
    tok = @tokenizer.next_token

    object[:Length] = length
    stream = StreamData.new(@tokenizer.io, offset: pos, length: length,
                            filter: @document.unwrap(object[:Filter]),
                            decode_parms: @document.unwrap(object[:DecodeParms]))
  end

  unless tok.kind_of?(Tokenizer::Token) && tok == 'endobj'
    maybe_raise("Indirect object must be followed by keyword endobj", pos: @tokenizer.pos)
  end

  [object, oid, gen, stream]
end
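For context, this is the kind of construct the method parses, shown as an illustrative PDF snippet (object number, stream content and length are made up):

  4 0 obj
  << /Length 11 >>
  stream
  Hello World
  endstream
  endobj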
def parse_xref_section_and_trailer(offset)
Parses the cross-reference section at the given position and the following trailer and returns them as an array consisting of a HexaPDF::XRefSection instance and a hash.
This method can only parse cross-reference sections, not cross-reference streams!
def parse_xref_section_and_trailer(offset)
  @tokenizer.pos = offset + @header_offset
  token = @tokenizer.next_token
  unless token.kind_of?(Tokenizer::Token) && token == 'xref'
    raise_malformed("Xref section doesn't start with keyword xref", pos: @tokenizer.pos)
  end

  xref = XRefSection.new
  start = @tokenizer.next_token
  while start.kind_of?(Integer)
    number_of_entries = @tokenizer.next_token
    unless number_of_entries.kind_of?(Integer)
      raise_malformed("Invalid cross-reference subsection start", pos: @tokenizer.pos)
    end

    @tokenizer.skip_whitespace
    start.upto(start + number_of_entries - 1) do |oid|
      pos, gen, type = @tokenizer.next_xref_entry do |recoverable|
        maybe_raise("Invalid cross-reference entry", pos: @tokenizer.pos, force: !recoverable)
      end
      if xref.entry?(oid)
        next
      elsif type == 'n'
        if pos == 0 || gen > 65535
          maybe_raise("Invalid in use cross-reference entry", pos: @tokenizer.pos)
          xref.add_free_entry(oid, gen)
        else
          xref.add_in_use_entry(oid, gen, pos)
        end
      else
        xref.add_free_entry(oid, gen)
      end
    end
    start = @tokenizer.next_token
  end

  unless start.kind_of?(Tokenizer::Token) && start == 'trailer'
    raise_malformed("Trailer doesn't start with keyword trailer", pos: @tokenizer.pos)
  end

  trailer = @tokenizer.next_object
  unless trailer.kind_of?(Hash)
    raise_malformed("Trailer is #{trailer.class} instead of dictionary ", pos: @tokenizer.pos)
  end

  unless trailer[:Prev] || xref.max_oid == 0 || xref.entry?(0)
    first_entry = xref[xref.oids[0]]
    test_entry = xref[xref.oids[-1]]
    @tokenizer.pos = test_entry.pos + @header_offset
    test_oid = @tokenizer.next_token
    first_oid = first_entry.oid

    force_failure = !first_entry.free? || first_entry.gen != 65535 ||
      !test_oid.kind_of?(Integer) || xref.oids[-1] - test_oid != first_oid
    maybe_raise("Main cross-reference section has invalid numbering",
                pos: offset + @header_offset, force: force_failure)

    new_xref = XRefSection.new
    xref.oids.each do |oid|
      entry = xref[oid]
      entry.oid -= first_oid
      new_xref.send(:[]=, entry.oid, entry.gen, entry)
    end
    xref = new_xref
  end

  [xref, trailer]
end
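For reference, a minimal cross-reference section followed by its trailer looks roughly like this (offsets made up; each entry is a fixed-width line of the form "offset generation type"):

  xref
  0 3
  0000000000 65535 f
  0000000017 00000 n
  0000000081 00000 n
  trailer
  << /Size 3 /Root 1 0 R >>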
def raise_malformed(msg, pos: nil)
def raise_malformed(msg, pos: nil)
  raise HexaPDF::MalformedPDFError.new(msg, pos: pos)
end
def reconstruct_revision
Tries to reconstruct the PDF document's main cross-reference table by serially parsing the file and returning a Revision object for loading the found objects.
If the file contains multiple cross-reference sections, all objects will be put into a single revision.
def reconstruct_revision
  return if @in_reconstruct_revision
  @in_reconstruct_revision = true
  @header_offset = 0

  raise unless @document.config['parser.try_xref_reconstruction']
  msg = "#{$!} - trying cross-reference table reconstruction"
  @document.config['parser.on_correctable_error'].call(@document, msg, @tokenizer.pos)

  xref = XRefSection.new
  @tokenizer.pos = 0
  linearized = nil
  while true
    @tokenizer.skip_whitespace
    pos = @tokenizer.pos
    @tokenizer.scan_until(/(\n|\r\n?)+|\z/)
    next_new_line_pos = @tokenizer.pos
    @tokenizer.pos = pos

    token = @tokenizer.next_integer_or_keyword rescue nil
    if token.kind_of?(Integer)
      gen = @tokenizer.next_integer_or_keyword rescue nil
      tok = @tokenizer.next_integer_or_keyword rescue nil
      if @tokenizer.pos > next_new_line_pos
        @tokenizer.pos = next_new_line_pos
      elsif gen.kind_of?(Integer) && tok.kind_of?(Tokenizer::Token) && tok == 'obj'
        xref.add_in_use_entry(token, gen, pos)
        if linearized.nil?
          pos = @tokenizer.pos
          obj = @tokenizer.next_object rescue nil
          linearized = obj.kind_of?(Hash) && obj.key?(:Linearized)
          @tokenizer.pos = pos
        end
        @tokenizer.scan_until(/\bendobj\b/)
      end
    elsif token.kind_of?(Tokenizer::Token) && token == 'trailer'
      obj = @tokenizer.next_object rescue nil
      # Use last trailer found in case of multiple revisions but use first trailer in case of
      # linearized file.
      trailer = obj if obj.kind_of?(Hash) && (!linearized || trailer.nil?)
    elsif token == Tokenizer::NO_MORE_TOKENS
      break
    else
      @tokenizer.pos = next_new_line_pos
    end
  end

  if !trailer || trailer.empty?
    _, trailer = load_revision(startxref_offset) rescue nil
    unless trailer
      xref.each do |_oid, _gen, xref_entry|
        obj, * = parse_indirect_object(xref_entry.pos) rescue nil
        if obj.kind_of?(Hash) && obj[:Type] == :Catalog
          trailer = {Root: HexaPDF::Reference.new(xref_entry.oid, xref_entry.gen)}
          break
        end
      end
    end
    unless trailer
      @in_reconstruct_revision = false
      raise_malformed("Could not reconstruct malformed PDF because trailer was not found", pos: 0)
    end
  end
  trailer&.delete(:Prev) # no need for this and may wreak havoc

  loader = lambda do |xref_entry|
    obj, oid, gen, stream = parse_indirect_object(xref_entry.pos)
    obj = @document.wrap(obj, oid: oid, gen: gen, stream: stream)
    @document.security_handler ? @document.security_handler.decrypt(obj) : obj
  end

  @in_reconstruct_revision = false
  Revision.new(@document.wrap(trailer, type: :XXTrailer), xref_section: xref, loader: loader)
end
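A hedged sketch of disabling reconstruction so that a damaged cross-reference table raises immediately instead of triggering a full-file scan, assuming the standard HexaPDF::Document.open entry point passes the config hash through (the file name is hypothetical):

  config = {'parser.try_xref_reconstruction' => false}
  doc = HexaPDF::Document.open('broken.pdf', config: config)
  # a broken cross-reference table now raises HexaPDF::MalformedPDFError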
def reconstructed?
def reconstructed?
  !@reconstructed_revision.nil?
end
def reconstructed_revision
def reconstructed_revision
  @reconstructed_revision ||= reconstruct_revision
end
def retrieve_pdf_header_offset_and_version
Retrieves the offset of the PDF header and the PDF version number in it.
The PDF header should normally appear on the first line. However, Adobe relaxes this restriction so that the header may appear in the first 1024 bytes. We follow the Adobe convention.
def retrieve_pdf_header_offset_and_version
  @io.seek(0)
  @header_offset = (@io.read(1024) || '').index(/%PDF-(\d\.\d)/) || 0
  @header_version = $1
end
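For example, a file with a few bytes of garbage before the header is still accepted; the index of "%PDF-" becomes the header offset that all other byte positions are relative to (illustrative data):

  data = "garbage\n%PDF-1.7\n..."
  data.index(/%PDF-(\d\.\d)/)  # => 8, so the header offset is 8 and $1 == "1.7"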
def startxref_offset
Returns the offset of the main cross-reference section/stream.
Implementation note: Normally, the %%EOF marker has to be on the last line; however, Adobe viewers relax this restriction and so do we.
If strict parsing is disabled, the whole file is searched for the offset.
def startxref_offset
  return @startxref_offset if defined?(@startxref_offset)

  @io.seek(0, IO::SEEK_END)
  step_size = 1024
  pos = @io.pos
  eof_not_found = pos == 0
  startxref_missing = startxref_mangled = false
  startxref_offset = nil

  while pos != 0
    @io.pos = [pos - step_size, 0].max
    pos = @io.pos
    lines = @io.read(step_size + 40).split(/[\r\n]+/)

    eof_index = lines.rindex {|l| l.strip == '%%EOF' }
    if !eof_index
      eof_not_found = true
    elsif lines[eof_index - 1].strip =~ /\Astartxref\s(\d+)\z/
      startxref_offset = $1.to_i
      startxref_mangled = true
      break # we found it even if the syntax is not entirely correct
    elsif eof_index < 2 || lines[eof_index - 2].strip != "startxref"
      startxref_missing = true
    else
      startxref_offset = lines[eof_index - 1].to_i
      break # we found it
    end
  end

  if eof_not_found
    maybe_raise("PDF file trailer with end-of-file marker not found", pos: pos,
                force: !eof_index)
  elsif startxref_mangled
    maybe_raise("PDF file trailer keyword startxref on same line as value", pos: pos)
  elsif startxref_missing
    maybe_raise("PDF file trailer is missing startxref keyword", pos: pos,
                force: eof_index < 2 || lines[eof_index - 2].strip != "startxref")
  end

  @startxref_offset = startxref_offset
end
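For reference, the end of a well-formed PDF file looks like this; the number after the startxref keyword is the byte offset that this method returns (value made up):

  startxref
  116
  %%EOF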
def xref_section?(offset)
Looks at the given offset and returns +true+ if there is a cross-reference section at that position.
def xref_section?(offset)
  @tokenizer.pos = offset + @header_offset
  token = @tokenizer.peek_token
  token.kind_of?(Tokenizer::Token) && token == 'xref'
end