lib/pdf/reader/buffer.rb



# coding: ASCII-8BIT
# typed: strict
# frozen_string_literal: true

################################################################################
#
# Copyright (C) 2010 James Healy (jimmy@deefa.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################

class PDF::Reader

  # A string tokeniser that recognises PDF grammar. When passed an IO stream or a
  # string, repeated calls to token() will return the next token from the source.
  #
  # This is very low level, and getting the raw tokens is not very useful in itself.
  #
  # This will usually be used in conjunction with PDF::Reader::Parser, which converts
  # the raw tokens into objects we can work with (strings, ints, arrays, etc)
  #
  class Buffer
    # Byte values that end a token when met in regular tokenising mode.
    TOKEN_WHITESPACE = [0x00, 0x09, 0x0A, 0x0C, 0x0D, 0x20].freeze

    # Byte values of the delimiter characters: % < > ( [ { ) ] } /
    TOKEN_DELIMITER = [0x25, 0x3C, 0x3E, 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D, 0x2F].freeze

    # some strings for comparisons. Declaring them here avoids creating new
    # strings that need GC over and over
    LEFT_PAREN = "("
    LESS_THAN = "<"
    STREAM = "stream"
    ID = "ID"
    FWD_SLASH = "/"
    NULL_BYTE = "\x00"
    CR = "\r"
    LF = "\n"
    CRLF = "\r\n"

    # Characters accepted around the EI marker that closes inline image data.
    WHITE_SPACE = ["\n", "\r", ' '].freeze

    # Quite a few PDFs have trailing junk.
    # This can be several k of nuls in some cases
    # Allow for this here: it is the number of bytes read from the end of the
    # source when looking for the %%EOF marker.
    TRAILING_BYTECOUNT = 5000

    # must match whole tokens
    DIGITS_ONLY = %r{\A\d+\z}

    attr_reader :pos

    # Creates a new buffer.
    #
    # Params:
    #
    #   io - an IO stream (usually a StringIO) with the raw data to tokenise
    #
    # options:
    #
    #   :seek - a byte offset to seek to before starting to tokenise
    #   :content_stream - set to true if buffer will be tokenising a
    #                     content stream. Defaults to false
    #
    def initialize(io, opts = {})
      @io = io
      @tokens = []
      @in_content_stream = opts[:content_stream]

      @io.seek(opts[:seek]) if opts[:seek]
      @pos = @io.pos
    end

    # Reports whether the token queue is exhausted. The queue is topped up
    # first whenever it holds fewer than three tokens, so true means nothing
    # was queued even after trying to tokenise more of the source.
    #
    def empty?
      prepare_tokens unless @tokens.size >= 3

      @tokens.empty?
    end

    # Reads raw bytes straight from the underlying IO stream, bypassing the
    # tokeniser. The cursor is first restored to the position this buffer last
    # recorded, and the new position is recorded after the read.
    #
    #   bytes - the number of bytes to read
    #
    # options:
    #
    #   :skip_eol - if true, the two bytes starting one byte before the cursor
    #               are inspected. A CRLF pair, or a lone LF or CR in the
    #               first of the two bytes, is skipped. If neither is found
    #               the cursor is left on the first inspected byte.
    #   Note:
    #   Skipping a bare CR is not spec-compliant.
    #   This is because the data may start with LF.
    #   However we check for CRLF first, so the ambiguity is avoided.
    #
    # Returns the bytes read, or nil when :skip_eol is set and nothing could
    # be read while inspecting the EOL bytes (the saved position is left
    # unchanged in that case).
    def read(bytes, opts = {})
      reset_pos

      if opts[:skip_eol]
        @io.seek(-1, IO::SEEK_CUR)
        marker = @io.read(2)
        return nil if marker.nil?

        # CRLF MUST be recognised before a lone CR; it needs no rewind at all
        unless marker == CRLF
          lead = marker[0, 1]
          step_back = (lead == LF || lead == CR) ? -1 : -2
          @io.seek(step_back, IO::SEEK_CUR)
        end
      end

      data = @io.read(bytes)
      save_pos
      data
    end

    # Hands out the next token from the source: a String, or the object that
    # merge_indirect_reference substituted for an indirect reference. Returns
    # nil if there are no tokens left.
    #
    def token
      reset_pos
      prepare_tokens unless @tokens.size >= 3
      merge_indirect_reference
      # top the queue up again once the merge step has run
      prepare_tokens unless @tokens.size >= 3

      @tokens.shift
    end

    # return the byte offset where the first XRef table in the source can be found.
    #
    # Only the last TRAILING_BYTECOUNT bytes of the source are examined. They
    # are split into lines, and the line before the last one that starts with
    # %%EOF (ignoring surrounding whitespace) is converted with to_i and
    # returned.
    #
    # Raises MalformedPDFError when the source is empty, when no %%EOF marker
    # is found, when no line precedes the marker, or when the offset is negative.
    #
    def find_first_xref_offset
      check_size_is_non_zero

      # Start reading TRAILING_BYTECOUNT bytes before the end, or at the very
      # beginning when the source is shorter than that. Clamping the position
      # explicitly avoids using a blanket rescue to detect short sources.
      @io.seek(0, IO::SEEK_END)
      tail_start = @io.pos - TRAILING_BYTECOUNT
      @io.seek(tail_start > 0 ? tail_start : 0)
      data = @io.read(TRAILING_BYTECOUNT)

      raise MalformedPDFError, "PDF does not contain EOF marker" if data.nil?

      # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r, \n, or both.
      # Lines are reversed so that the search finds the LAST marker in the source.
      lines = data.split(/[\n\r]+/).reverse
      eof_index = lines.index { |l| l.strip.start_with?("%%EOF") }

      raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
      raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size-1
      offset = lines[eof_index+1].to_i

      # a byte offset < 0 doesn't make much sense. This is unlikely to happen, but in theory some
      # corrupted PDFs might have a line that looks like a negative int preceding the `%%EOF`
      raise MalformedPDFError, "invalid xref offset" if offset < 0
      offset
    end

    private

    # Raises MalformedPDFError when the source is empty, detected by seeking
    # to one byte before the end (which fails with Errno::EINVAL on an empty
    # source). On success the cursor is left at offset 0.
    #
    def check_size_is_non_zero
      begin
        @io.seek(-1, IO::SEEK_END)
        @io.seek(0)
      rescue Errno::EINVAL
        raise MalformedPDFError, "PDF file is empty"
      end
    end

    # Tells whether this buffer was created with a truthy :content_stream
    # option. Always answers with a strict true or false.
    #
    def in_content_stream?
      !!@in_content_stream
    end

    # Other code sharing the IO stream may have moved its cursor. Put it back
    # on the position this buffer last recorded, seeking only when it differs.
    #
    def reset_pos
      return if @io.pos == @pos

      @io.seek(@pos)
    end

    # Record the current position of the source IO stream in @pos. If someone
    # else (like another buffer) moves the cursor afterwards, reset_pos can
    # use the recorded value to restore it.
    #
    def save_pos
      @pos = @io.pos
    end

    # Attempt to prime the buffer with the next few tokens: ten times over,
    # look up the current state and run the matching tokenising step, then
    # record the io position. A state with no step of its own (such as
    # :stream) adds nothing.
    #
    def prepare_tokens
      10.times do
        current = state
        if current == :regular
          prepare_regular_token
        elsif current == :literal_string
          prepare_literal_token
        elsif current == :hex_string
          prepare_hex_token
        elsif current == :inline
          prepare_inline_token
        end
      end

      save_pos
    end

    # Tokenising behaves slightly differently depending on context. The
    # context is derived from the most recently queued token:
    #
    #   "("      - :literal_string
    #   "<"      - :hex_string
    #   "stream" - :stream
    #   "ID"     - :inline, but only inside a content stream and when the
    #              token before it is not "/"
    #
    # Everything else (including an empty queue) is :regular.
    #
    def state
      last = @tokens.last

      if LEFT_PAREN == last
        :literal_string
      elsif LESS_THAN == last
        :hex_string
      elsif STREAM == last
        :stream
      elsif ID == last && in_content_stream? && @tokens[-2] != FWD_SLASH
        :inline
      else
        :regular
      end
    end

    # Detect a series of 3 tokens at the head of the queue that make up an
    # indirect reference (two all-digit strings followed by "R"). If found,
    # the three tokens are replaced with one PDF::Reader::Reference instance.
    #
    # Merging them into a single string was another option, but that would mean
    # code further up the stack would need to check every token to see if it
    # looks like an indirect object. For optimisation reasons that extra check
    # is avoided.
    #
    # It's incredibly likely that the next 3 tokens in the buffer are NOT an
    # indirect reference, so the cheap tests run first and the relatively
    # expensive regexp checks are reached only when they pass.
    #
    def merge_indirect_reference
      return if @tokens.size < 3
      return if @tokens[2] != "R"

      object_id_token, generation_token = @tokens[0], @tokens[1]
      return unless object_id_token.is_a?(String) && generation_token.is_a?(String)
      return unless object_id_token.match(DIGITS_ONLY) && generation_token.match(DIGITS_ONLY)

      @tokens[0] = PDF::Reader::Reference.new(object_id_token.to_i, generation_token.to_i)
      # remove from the back so the remaining index stays valid
      @tokens.delete_at(2)
      @tokens.delete_at(1)
    end

    # Extract data between ID and EI and queue it as a single frozen token.
    #
    # If the EI follows white-space, that white-space is dropped from the data.
    # The EI must be followed by white-space or the end of the buffer.
    # This is to reduce the chance of accidentally matching an embedded EI.
    #
    # After the scan the io cursor is left just past the data (that is, before
    # the terminator), so the EI itself is tokenised by a later step.
    #
    # Raises MalformedPDFError when no terminator is found.
    def prepare_inline_token
      idstart = @io.pos
      prevchr = ''
      eisize = 0 # how many chars in the end marker
      seeking = 'E' # what are we looking for now? ('' means: the byte after EI)
      loop do
        chr = @io.read(1)
        break if chr.nil?

        case seeking
        when 'I'
          seeking = chr == 'I' ? '' : 'E'
        when ''
          if WHITE_SPACE.include?(chr)
            eisize += 1 # Drop trailer
            break
          end
          seeking = 'E'
        end

        # This check runs after the fall-backs above on purpose: the byte that
        # just ruled out one candidate may itself be the 'E' that starts the
        # real terminator (e.g. data ending in 'E' directly followed by EI).
        if seeking == 'E' && chr == 'E'
          seeking = 'I'
          # white-space before the E belongs to the delimiter, i.e. is dropped
          # from the data; otherwise assume the EI immediately follows the data
          eisize = WHITE_SPACE.include?(prevchr) ? 3 : 2
        end

        prevchr = chr
      end
      unless seeking == ''
        raise MalformedPDFError, "EI terminator not found"
      end
      eiend = @io.pos
      @io.seek(idstart, IO::SEEK_SET)
      str = @io.read(eiend - eisize - idstart) # get the ID content
      @tokens << str.freeze if str
    end

    # While inside a hex string, collect characters until something ends it.
    #
    # Digits and ASCII letters (0-9, A-Z, a-z) are collected; any other byte
    # with a value of 32 or less is skipped. The first remaining byte ends the
    # string: the collected characters are queued (if any), followed by a ">"
    # token and, when the ending byte was not itself ">", that byte as a token
    # of its own. Reaching the end of the source queues nothing.
    #
    def prepare_hex_token
      hex = "".dup

      while (byte = @io.getbyte)
        case byte
        when 48..57, 65..90, 97..122
          hex << byte
        when 0..32
          # white-space and control bytes between nibbles are ignored
        else
          @tokens << hex unless hex.empty?
          @tokens << ">" unless byte == 0x3E # '>'
          @tokens << byte.chr
          break
        end
      end
    end

    # if we're currently inside a literal string we more or less just read bytes until
    # we find the closing ) delimiter. Lots of bytes that would otherwise indicate the
    # start of a new token in regular mode are left untouched when inside a literal
    # string.
    #
    # Nested, balanced parentheses are kept as part of the string, and a backslash
    # is copied together with the byte that follows it, so an escaped parenthesis
    # does not affect the nesting count.
    #
    # The entire literal string will be returned as a single token (omitted when
    # empty), followed by a ")" token. It will need further processing to fix things
    # like escaped new lines, but that's someone else's problem.
    #
    # If the source ends before the string is closed, whatever was collected is
    # queued and the ")" token is still added.
    #
    def prepare_literal_token
      str = "".dup
      depth = 1 # number of currently unclosed "("

      while depth > 0
        byte = @io.getbyte

        case byte
        when nil
          depth = 0 # unbalanced parens
        when 0x5C # "\"
          str << byte
          escaped = @io.getbyte
          # the source may end straight after the backslash; in that case the
          # next iteration sees the end of the source and finishes the string
          str << escaped if escaped
        when 0x28 # "("
          str << "("
          depth += 1
        when 0x29 # ")"
          depth -= 1
          str << ")" unless depth == 0
        else
          str << byte
        end
      end

      @tokens << str if str.size > 0
      @tokens << ")"
    end

    # Extract the next regular token and stock it in our buffer, ready to be returned.
    #
    # What each byte means is complex, check out section "3.1.1 Character Set" of the 1.7 spec
    # to read up on it.
    #
    # In short, one call consumes bytes until one of these happens:
    #
    #   * the source ends - the pending token (if any) is queued
    #   * white-space or a comment follows a pending token - the token is queued
    #   * a delimiter is read - the pending token (if any) is queued, followed by
    #     the delimiter as its own token ("<<" and ">>" are recognised as pairs)
    #   * only white-space was read - the whole run is consumed, nothing is queued
    #
    # A "/" is queued as its own token. When it is directly followed by the end
    # of the source, a space, a line feed or a delimiter, an empty string is
    # queued after it to stand for the empty name.
    #
    def prepare_regular_token
      tok = "".dup

      loop do
        byte = @io.getbyte

        case byte
        when nil
          break
        when 0x25
          # comment, ignore everything until the next EOL char
          loop do
            commentbyte = @io.getbyte
            break if commentbyte.nil? || commentbyte == 0x0A || commentbyte == 0x0D
          end
          # a comment counts as white-space, so it ends a pending token. With
          # nothing pending we keep going and look for a token after it.
          break unless tok.empty?
        when *TOKEN_WHITESPACE
          # white space, token finished. If there was no token, chomp the rest
          # of the whitespace too
          if tok.empty?
            @io.getbyte while TOKEN_WHITESPACE.include?(peek_byte)
          end
          break
        when 0x3C
          # opening delimiter '<', start of new token
          @tokens << tok unless tok.empty?
          if peek_byte == 0x3C # check if token is actually '<<'
            @io.getbyte
            @tokens << "<<"
          else
            @tokens << "<"
          end
          return
        when 0x3E
          # closing delimiter '>', start of new token
          @tokens << tok unless tok.empty?
          if peek_byte == 0x3E # check if token is actually '>>'
            @io.getbyte
            @tokens << ">>"
          else
            @tokens << ">"
          end
          return
        when 0x28, 0x5B, 0x7B, 0x29, 0x5D, 0x7D
          # single byte opening or closing delimiter: ( [ { ) ] }
          @tokens << tok unless tok.empty?
          @tokens << byte.chr
          return
        when 0x2F
          # PDF name, start of new token
          @tokens << tok unless tok.empty?
          @tokens << byte.chr
          following = peek_byte
          if following.nil? || following == 0x20 || following == 0x0A ||
             TOKEN_DELIMITER.include?(following)
            @tokens << "" # the empty name
          end
          return
        else
          tok << byte
        end
      end

      @tokens << tok unless tok.empty?
    end

    # Look at the next byte in the io stream without consuming it: the byte is
    # read and the cursor is then stepped back by one. Returns the byte as an
    # Integer, or nil at the end of the stream (no seek happens in that case).
    #
    def peek_byte
      @io.getbyte.tap do |upcoming|
        @io.seek(-1, IO::SEEK_CUR) unless upcoming.nil?
      end
    end

  end
end