################################################################################
#
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################
class PDF::Reader
################################################################################
# An internal PDF::Reader class that reads objects from the PDF file and converts
# them into useable ruby objects (hash's, arrays, true, false, etc)
class Parser
################################################################################
# Create a new parser around a PDF::Reader::Buffer object
#
# buffer - a PDF::Reader::Buffer object that contains PDF data
# xref - a PDF::Reader::XRef object that represents the document's object offsets
def initialize (buffer, xref)
@buffer = buffer
@xref = xref
end
################################################################################
# Reads the next token from the underlying buffer and convets it to an appropriate
# object
#
# operators - a hash of supported operators to read from the underlying buffer.
def parse_token (operators={})
ref = Reference.from_buffer(@buffer) and return ref
token = @buffer.token
case token
when nil then return nil
when "/" then return @buffer.token.to_sym
when "<<" then return dictionary()
when "[" then return array()
when "(" then return string()
when "<" then return hex_string()
when "true" then return true
when "false" then return false
when "null" then return nil
when "obj", "endobj" then return Token.new(token)
when "stream", "endstream" then return Token.new(token)
when ">>", "]", ">" then return Token.new(token)
else
if operators.has_key?(token) then return Token.new(token)
elsif token =~ /\d*\.\d/ then return token.to_f
else return token.to_i
end
end
end
################################################################################
# reads a PDF dict from the buffer and converts it to a Ruby Hash.
def dictionary
dict = {}
loop do
key = parse_token
break if key.kind_of?(Token) and key == ">>"
raise MalformedPDFError, "Dictionary key (#{key.inspect}) is not a name" unless key.kind_of?(Symbol)
value = parse_token
value.kind_of?(Token) and Error.str_assert_not(value, ">>")
dict[key] = value
end
dict
end
################################################################################
# reads a PDF array from the buffer and converts it to a Ruby Array.
def array
a = []
loop do
item = parse_token
break if item.kind_of?(Token) and item == "]"
a << item
end
a
end
################################################################################
# Reads a PDF hex string from the buffer and converts it to a Ruby String
def hex_string
str = ""
loop do
token = @buffer.token
break if token == ">"
str << token
end
# add a missing digit if required, as required by the spec
str << "0" unless str.size % 2 == 0
str.scan(/../).map {|i| i.hex.chr}.join
end
################################################################################
# Reads a PDF String from the buffer and converts it to a Ruby String
def string
str = ""
count = 1
while count != 0
@buffer.ready_token(false, false)
# find the first occurance of ( ) [ \ or ]
#
# I originally just used the regexp form of index(), but it seems to be
# buggy on some OSX systems (returns nil when there is a match). This
# version is more reliable and was suggested by Andrès Koetsier.
#
i = nil
@buffer.raw.unpack("C*").each_with_index do |charint, idx|
if [40, 41, 92].include?(charint)
i = idx
break
end
end
if i.nil?
str << @buffer.raw + "\n"
@buffer.raw.replace("")
# if a content stream opens a string, but never closes it, we'll
# hit the end of the stream and still be appending stuff to the
# string. bad! This check prevents a hard loop.
raise MalformedPDFError, 'unterminated string in content stream' if @buffer.eof?
next
end
str << @buffer.head(i, false)
to_remove = 1
case @buffer.raw[0, 1]
when "("
str << "("
count += 1
when ")"
count -= 1
str << ")" unless count == 0
when "\\"
to_remove = 2
case @buffer.raw[1, 1]
when "" then to_remove = 1
when "n" then str << "\n"
when "r" then str << "\r"
when "t" then str << "\t"
when "b" then str << "\b"
when "f" then str << "\f"
when "(" then str << "("
when ")" then str << ")"
when "\\" then str << "\\"
else
if m = @buffer.raw.match(/^\\(\d{1,3})/)
to_remove = m[0].size
str << m[1].oct.chr
end
end
end
@buffer.head(to_remove, false)
end
str
end
################################################################################
# Reads an entire PDF object from the buffer and returns it as a Ruby String.
# If the object is a content stream, returns both the stream and the dictionary
# that describes it
#
# id - the object ID to return
# gen - the object revision number to return
def object (id, gen)
Error.assert_equal(parse_token, id)
Error.assert_equal(parse_token, gen)
Error.str_assert(parse_token, "obj")
obj = parse_token
post_obj = parse_token
case post_obj
when "endobj" then return obj
when "stream" then return stream(obj)
else raise MalformedPDFError, "PDF malformed, unexpected token #{post_obj}"
end
end
################################################################################
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
def stream (dict)
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
data = @buffer.read(@xref.object(dict[:Length]))
Error.str_assert(parse_token, "endstream")
Error.str_assert(parse_token, "endobj")
PDF::Reader::Stream.new(dict, data)
end
################################################################################
end
################################################################################
end
################################################################################