# coding: utf-8
# typed: true
# frozen_string_literal: true
################################################################################
#
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################
class PDF::Reader
################################################################################
# An internal PDF::Reader class that reads objects from the PDF file and converts
# them into useable ruby objects (hash's, arrays, true, false, etc)
class Parser
TOKEN_STRATEGY = proc { |parser, token| Token.new(token) }
STRATEGIES = {
"/" => proc { |parser, token| parser.send(:pdf_name) },
"<<" => proc { |parser, token| parser.send(:dictionary) },
"[" => proc { |parser, token| parser.send(:array) },
"(" => proc { |parser, token| parser.send(:string) },
"<" => proc { |parser, token| parser.send(:hex_string) },
nil => proc { nil },
"true" => proc { true },
"false" => proc { false },
"null" => proc { nil },
"obj" => TOKEN_STRATEGY,
"endobj" => TOKEN_STRATEGY,
"stream" => TOKEN_STRATEGY,
"endstream" => TOKEN_STRATEGY,
">>" => TOKEN_STRATEGY,
"]" => TOKEN_STRATEGY,
">" => TOKEN_STRATEGY,
")" => TOKEN_STRATEGY
}
################################################################################
# Create a new parser around a PDF::Reader::Buffer object
#
# buffer - a PDF::Reader::Buffer object that contains PDF data
# objects - a PDF::Reader::ObjectHash object that can return objects from the PDF file
def initialize(buffer, objects=nil)
@buffer = buffer
@objects = objects
end
################################################################################
# Reads the next token from the underlying buffer and convets it to an appropriate
# object
#
# operators - a hash of supported operators to read from the underlying buffer.
def parse_token(operators={})
token = @buffer.token
if STRATEGIES.has_key? token
STRATEGIES[token].call(self, token)
elsif token.is_a? PDF::Reader::Reference
token
elsif operators.has_key? token
Token.new(token)
elsif token.frozen?
token
elsif token =~ /\d*\.\d/
token.to_f
else
token.to_i
end
end
################################################################################
# Reads an entire PDF object from the buffer and returns it as a Ruby String.
# If the object is a content stream, returns both the stream and the dictionary
# that describes it
#
# id - the object ID to return
# gen - the object revision number to return
def object(id, gen)
idCheck = parse_token
# Sometimes the xref table is corrupt and points to an offset slightly too early in the file.
# check the next token, maybe we can find the start of the object we're looking for
if idCheck != id
Error.assert_equal(parse_token, id)
end
Error.assert_equal(parse_token, gen)
Error.str_assert(parse_token, "obj")
obj = parse_token
post_obj = parse_token
if obj.is_a?(Hash) && post_obj == "stream"
stream(obj)
else
obj
end
end
private
################################################################################
# reads a PDF dict from the buffer and converts it to a Ruby Hash.
def dictionary
dict = {}
loop do
key = parse_token
break if key.kind_of?(Token) and key == ">>"
raise MalformedPDFError, "unterminated dict" if @buffer.empty?
PDF::Reader::Error.validate_type_as_malformed(key, "Dictionary key", Symbol)
value = parse_token
value.kind_of?(Token) and Error.str_assert_not(value, ">>")
dict[key] = value
end
dict
end
################################################################################
# reads a PDF name from the buffer and converts it to a Ruby Symbol
def pdf_name
tok = @buffer.token
tok = tok.dup.gsub(/#([A-Fa-f0-9]{2})/) do |match|
match[1, 2].hex.chr
end
tok.to_sym
end
################################################################################
# reads a PDF array from the buffer and converts it to a Ruby Array.
def array
a = []
loop do
item = parse_token
break if item.kind_of?(Token) and item == "]"
raise MalformedPDFError, "unterminated array" if @buffer.empty?
a << item
end
a
end
################################################################################
# Reads a PDF hex string from the buffer and converts it to a Ruby String
def hex_string
str = "".dup
loop do
token = @buffer.token
break if token == ">"
raise MalformedPDFError, "unterminated hex string" if @buffer.empty?
str << token
end
# add a missing digit if required, as required by the spec
str << "0" unless str.size % 2 == 0
[str].pack('H*')
end
################################################################################
# Reads a PDF String from the buffer and converts it to a Ruby String
def string
str = @buffer.token
return "".dup.force_encoding("binary") if str == ")"
Error.assert_equal(parse_token, ")")
str.gsub!(/\\(\r\n|[nrtbf()\\\n\r]|([0-7]{1,3}))?|\r\n?/m) do |match|
if $2.nil? # not octal digits
MAPPING[match] || "".dup
else # must be octal digits
($2.oct & 0xff).chr # ignore high level overflow
end
end
str.force_encoding("binary")
end
MAPPING = {
"\r" => "\n",
"\r\n" => "\n",
"\\n" => "\n",
"\\r" => "\r",
"\\t" => "\t",
"\\b" => "\b",
"\\f" => "\f",
"\\(" => "(",
"\\)" => ")",
"\\\\" => "\\",
"\\\n" => "",
"\\\r" => "",
"\\\r\n" => "",
}
################################################################################
# Decodes the contents of a PDF Stream and returns it as a Ruby String.
def stream(dict)
raise MalformedPDFError, "PDF malformed, missing stream length" unless dict.has_key?(:Length)
if @objects
length = @objects.deref_integer(dict[:Length])
if dict[:Filter]
dict[:Filter] = @objects.deref_name_or_array(dict[:Filter])
end
else
length = dict[:Length] || 0
end
PDF::Reader::Error.validate_type_as_malformed(length, "length", Numeric)
data = @buffer.read(length, :skip_eol => true)
Error.str_assert(parse_token, "endstream")
# We used to assert that the stream had the correct closing token, but it doesn't *really*
# matter if it's missing, and other readers seems to handle its absence just fine
# Error.str_assert(parse_token, "endobj")
PDF::Reader::Stream.new(dict, data)
end
################################################################################
end
################################################################################
end
################################################################################