################################################################################
#
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################

class PDF::Reader
  ################################################################################
  # An internal PDF::Reader class that mediates access to the underlying PDF
  # File or IO stream.
  #
  # Two kinds of access are offered on top of the wrapped IO object:
  #
  # * token access (#token), which pulls one line at a time from the IO into
  #   an internal string buffer and hands out one token per call
  # * byte access (#read, #read_until, #seek, #pos)
  #
  # #read drains the internal buffer before touching the IO, while #read_until,
  # #pos and #find_first_xref_offset talk to the IO directly.
  class Buffer
    ################################################################################
    # Creates a new buffer around the specified IO object.
    #
    # io - the object to read from. The methods of this class call seek, read,
    #      readline, eof? and pos on it.
    def initialize(io)
      @io = io
      @buffer = nil
    end

    ################################################################################
    # Seek to the requested byte in the IO stream.
    #
    # offset - absolute position, counted from the start of the stream.
    #
    # Any line currently held in the internal buffer is discarded. Returns self
    # so that calls can be chained.
    def seek(offset)
      @io.seek(offset, IO::SEEK_SET)
      @buffer = nil
      self
    end

    ################################################################################
    # Reads the requested number of bytes from the underlying IO stream.
    #
    # length - should be a positive integer.
    #
    # Content left over in the internal buffer is consumed first, and only the
    # shortfall is read from the IO. Returns the data as a String, which is
    # shorter than requested (possibly empty) when the stream ends early.
    #
    # NOTE: the buffered part is taken with #head and its default stripping, so
    # whitespace that directly follows the consumed characters in the buffer is
    # dropped.
    def read(length)
      # String.new gives a string that is always safe to append to.
      out = String.new("")

      if @buffer && !@buffer.empty?
        out << head(length)
        length -= out.length
      end

      if length > 0
        # A sized read answers nil at end of stream, which cannot be appended.
        chunk = @io.read(length)
        out << chunk if chunk
      end

      out
    end

    ################################################################################
    # Reads from the IO stream until the specified token is found, or the end
    # of the stream is reached.
    #
    # bytes - the bytes to search for.
    #
    # Returns everything read before the token. When the token is found the
    # stream is repositioned to the first byte of the token (through #seek,
    # which also clears the internal buffer). When the stream ends first,
    # whatever was read is returned.
    #
    # Data is taken straight from the IO one byte at a time; content already
    # sitting in the internal buffer is not consulted.
    def read_until(bytes)
      out = String.new("")
      size = bytes.size

      loop do
        byte = @io.read(1)
        break if byte.nil? # end of stream reached before the token turned up

        out << byte
        if out[-1 * size, size].eql?(bytes)
          out = out[0, out.size - size]
          seek(pos - size)
          break
        end
      end

      out
    end

    ################################################################################
    # Returns true if the underlying IO object is at end and the internal buffer
    # is empty (or has not been filled yet).
    def eof?
      (@buffer.nil? || @buffer.empty?) && @io.eof?
    end

    ################################################################################
    # Returns the current position of the underlying IO object. Content held in
    # the internal buffer is not taken into account.
    def pos
      @io.pos
    end

    ################################################################################
    # PDF files are processed by tokenising the content into a series of objects
    # and commands. This prepares the buffer for use by reading the next line of
    # tokens into memory.
    #
    # with_strip  - when true, leading whitespace is removed from the buffer.
    # skip_blanks - when true, lines that are empty once their line ending has
    #               been removed are skipped; when false at most one line is
    #               read.
    #
    # Nothing is read while the buffer still holds content. Each line read is
    # tagged as BINARY where the string supports encodings. Reading relies on
    # @io.readline, so running out of input surfaces as whatever that call
    # raises (EOFError for a standard IO).
    def ready_token(with_strip = true, skip_blanks = true)
      while @buffer.nil? || @buffer.empty?
        @buffer = @io.readline
        @buffer.force_encoding("BINARY") if @buffer.respond_to?(:force_encoding)
        @buffer.chomp!
        break unless skip_blanks
      end

      @buffer.lstrip! if with_strip
    end

    ################################################################################
    # Returns the next token from the underlying IO stream.
    #
    # A token is one of:
    #
    # * "<<" or ">>"
    # * a single delimiter character: [ ] ( ) < > { } /
    # * a run of characters up to the next delimiter or whitespace
    #
    # A token starting with "%" marks a comment: the rest of that line is thrown
    # away and the search continues on the following line.
    #
    # After a "(" token the buffer is left unstripped, so whitespace directly
    # after the parenthesis stays available through #raw and #head.
    #
    # NOTE: a line made up of whitespace only produces an empty-string token.
    def token
      loop do
        ready_token

        i = @buffer.index(/[\[\]()<>{}\s\/]/) || @buffer.size

        token_chars =
          if i.zero?
            # The buffer starts with a delimiter; "<<" and ">>" count as one token.
            %w[<< >>].include?(@buffer[0, 2]) ? 2 : 1
          else
            i
          end

        strip_space = !(i == 0 && @buffer[0, 1] == '(')
        tok = head(token_chars, strip_space)

        return tok unless tok[0, 1] == "%"

        # Comment: drop the remainder of the line and try again.
        @buffer = ""
      end
    end

    ################################################################################
    # Removes the first +chars+ characters from the internal buffer and returns
    # them.
    #
    # chars      - number of characters to take.
    # with_strip - when true, leading whitespace is stripped from what remains
    #              in the buffer.
    #
    # Expects the buffer to have been filled already (see #ready_token).
    def head(chars, with_strip = true)
      val = @buffer[0, chars]
      @buffer = @buffer[chars..-1] || ""
      @buffer.lstrip! if with_strip
      val
    end

    ################################################################################
    # Returns the internal buffer used by this class when reading from the IO
    # stream. The value is nil until a line has been read, and again after #seek.
    def raw
      @buffer
    end

    ################################################################################
    # The Xref table in a PDF file acts as an aid for finding the location of
    # various objects in the file. This method attempts to locate the byte
    # offset of the xref table in the underlying IO stream.
    #
    # Only the final 1024 bytes of the stream (or the whole stream when it is
    # shorter) are inspected. The value returned is the integer value of the
    # line directly above the last %%EOF marker found there.
    #
    # Raises MalformedPDFError when there is no %%EOF marker, or when no line
    # precedes it within the inspected data.
    def find_first_xref_offset
      begin
        @io.seek(-1024, IO::SEEK_END)
      rescue StandardError
        # Seeking before the start of a short stream fails; scan it from the top.
        seek(0)
      end

      # An empty stream makes a sized read answer nil.
      data = @io.read(1024) || ""

      # the PDF 1.7 spec (section #3.4) says that EOL markers can be either \r,
      # \n, or both. To ensure we find the xref offset correctly, change all
      # possible options to a standard format
      data = data.gsub("\r\n", "\n").gsub("\n\r", "\n").gsub("\r", "\n")

      # Reversed, so that the marker closest to the end of the file wins.
      lines = data.split(/\n/).reverse
      eof_index = lines.index { |line| line =~ /^%%EOF\r?$/ }

      raise MalformedPDFError, "PDF does not contain EOF marker" if eof_index.nil?
      raise MalformedPDFError, "PDF EOF marker does not follow offset" if eof_index >= lines.size - 1

      lines[eof_index + 1].to_i
    end
    ################################################################################
  end
  ################################################################################
end
################################################################################