lib/pdf/reader/content.rb



################################################################################
#
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
# 
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
# 
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################
require 'stringio'
#require 'enumerable'

class PDF::Reader
  ################################################################################
  # Walks the PDF file and calls the appropriate callback methods when something of interest is
  # found. 
  #
  # The callback methods should exist on the receiver object passed into the constructor. Whenever
  # some content is found that will trigger a callback, the receiver is checked to see if the callback 
  # is defined.
  #
  # If it is defined it will be called. If not, processing will continue.
  #
  # = Available Callbacks
  # The following callbacks are available and should be methods defined on your receiver class. Only 
  # implement the ones you need - the rest will be ignored.
  #
  # Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
  # paramters, or where you don't need them, the *params argument can be left off. Some example callback 
  # method definitions are:
  #
  #   def begin_document
  #   def end_page
  #   def show_text(string, *params)
  #   def fill_stroke(*params)
  #
  # You should be able to infer the basic command the callback is reporting based on the name. For
  # further experimentation, define the callback with just a *params parameter, then print out the 
  # contents of the array using something like:
  #
  #   puts params.inspect
  #
  # == Text Callbacks
  #
  # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the 
  # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be careful
  # when doing a comparison on strings returned from PDF::Reader (when doing unit tests for example). The
  # string may not be byte-by-byte identical with the string that was originally written to the PDF.
  #
  # - end_text_object
  # - move_to_start_of_next_line
  # - set_character_spacing
  # - move_text_position
  # - move_text_position_and_set_leading
  # - set_text_font_and_size
  # - show_text
  # - show_text_with_positioning
  # - set_text_leading
  # - set_text_matrix_and_text_line_matrix
  # - set_text_rendering_mode
  # - set_text_rise
  # - set_word_spacing
  # - set_horizontal_text_scaling
  # - move_to_next_line_and_show_text
  # - set_spacing_next_line_show_text
  #
  # == Graphics Callbacks
  # - close_fill_stroke
  # - fill_stroke
  # - close_fill_stroke_with_even_odd
  # - fill_stroke_with_even_odd
  # - begin_marked_content_with_pl
  # - begin_inline_image
  # - begin_marked_content
  # - begin_text_object
  # - append_curved_segment
  # - concatenate_matrix
  # - set_stroke_color_space
  # - set_nonstroke_color_space
  # - set_line_dash
  # - set_glyph_width
  # - set_glyph_width_and_bounding_box
  # - invoke_xobject
  # - define_marked_content_with_pl
  # - end_inline_image
  # - end_marked_content
  # - fill_path_with_nonzero
  # - fill_path_with_nonzero
  # - fill_path_with_even_odd
  # - set_gray_for_stroking
  # - set_gray_for_nonstroking
  # - set_graphics_state_parameters
  # - close_subpath
  # - set_flatness_tolerance
  # - begin_inline_image_data
  # - set_line_join_style
  # - set_line_cap_style
  # - set_cmyk_color_for_stroking,
  # - set_cmyk_color_for_nonstroking
  # - append_line
  # - begin_new_subpath
  # - set_miter_limit
  # - define_marked_content_point
  # - end_path
  # - save_graphics_state
  # - restore_graphics_state
  # - append_rectangle
  # - set_rgb_color_for_stroking
  # - set_rgb_color_for_nonstroking
  # - set_color_rendering_intent
  # - close_and_stroke_path
  # - stroke_path
  # - set_color_for_stroking
  # - set_color_for_nonstroking
  # - set_color_for_stroking_and_special
  # - set_color_for_nonstroking_and_special
  # - paint_area_with_shading_pattern
  # - append_curved_segment_initial_point_replicated
  # - set_line_width
  # - set_clipping_path_with_nonzero
  # - set_clipping_path_with_even_odd
  # - append_curved_segment_final_point_replicated
  #
  # == Misc Callbacks
  # - begin_compatibility_section
  # - end_compatibility_section,
  # - begin_document
  # - end_document
  # - begin_page_container
  # - end_page_container
  # - begin_page
  # - end_page
  # - metadata
  # - xml_metadata
  #
  # == Resource Callbacks
  #
  # Each page and page_container can contain a range of resources required for the page,
  # including things like fonts and images. The following callbacks may appear
  # after begin_page_container and begin_page if the relevant resources exist
  # on a page:
  #
  # In most cases, these callbacks associate a name with each resource, allowing it
  # to be referred to by name in the page content. For example, an XObject can hold an image. 
  # If it gets mapped to the name "IM1", then it can be placed on the page using 
  # invoke_xobject "IM1".
  #
  # - resource_procset
  # - resource_xobject
  # - resource_extgstate
  # - resource_colorspace
  # - resource_pattern
  # - resource_font
  class Content
    OPERATORS = {
      'b'   => :close_fill_stroke,
      'B'   => :fill_stroke,
      'b*'  => :close_fill_stroke_with_even_odd,
      'B*'  => :fill_stroke_with_even_odd,
      'BDC' => :begin_marked_content_with_pl,
      'BI'  => :begin_inline_image,
      'BMC' => :begin_marked_content,
      'BT'  => :begin_text_object,
      'BX'  => :begin_compatibility_section,
      'c'   => :append_curved_segment,
      'cm'  => :concatenate_matrix,
      'CS'  => :set_stroke_color_space,
      'cs'  => :set_nonstroke_color_space,
      'd'   => :set_line_dash,
      'd0'  => :set_glyph_width,
      'd1'  => :set_glyph_width_and_bounding_box,
      'Do'  => :invoke_xobject,
      'DP'  => :define_marked_content_with_pl,
      'EI'  => :end_inline_image,
      'EMC' => :end_marked_content,
      'ET'  => :end_text_object,
      'EX'  => :end_compatibility_section,
      'f'   => :fill_path_with_nonzero,
      'F'   => :fill_path_with_nonzero,
      'f*'  => :fill_path_with_even_odd,
      'G'   => :set_gray_for_stroking,
      'g'   => :set_gray_for_nonstroking,
      'gs'  => :set_graphics_state_parameters,
      'h'   => :close_subpath,
      'i'   => :set_flatness_tolerance,
      'ID'  => :begin_inline_image_data,
      'j'   => :set_line_join_style,
      'J'   => :set_line_cap_style,
      'K'   => :set_cmyk_color_for_stroking,
      'k'   => :set_cmyk_color_for_nonstroking,
      'l'   => :append_line,
      'm'   => :begin_new_subpath,
      'M'   => :set_miter_limit,
      'MP'  => :define_marked_content_point,
      'n'   => :end_path,
      'q'   => :save_graphics_state,
      'Q'   => :restore_graphics_state,
      're'  => :append_rectangle,
      'RG'  => :set_rgb_color_for_stroking,
      'rg'  => :set_rgb_color_for_nonstroking,
      'ri'  => :set_color_rendering_intent,
      's'   => :close_and_stroke_path,
      'S'   => :stroke_path,
      'SC'  => :set_color_for_stroking,
      'sc'  => :set_color_for_nonstroking,
      'SCN' => :set_color_for_stroking_and_special,
      'scn' => :set_color_for_nonstroking_and_special,
      'sh'  => :paint_area_with_shading_pattern,
      'T*'  => :move_to_start_of_next_line,
      'Tc'  => :set_character_spacing,
      'Td'  => :move_text_position,
      'TD'  => :move_text_position_and_set_leading,
      'Tf'  => :set_text_font_and_size,
      'Tj'  => :show_text,
      'TJ'  => :show_text_with_positioning,
      'TL'  => :set_text_leading,
      'Tm'  => :set_text_matrix_and_text_line_matrix,
      'Tr'  => :set_text_rendering_mode,
      'Ts'  => :set_text_rise,
      'Tw'  => :set_word_spacing,
      'Tz'  => :set_horizontal_text_scaling,
      'v'   => :append_curved_segment_initial_point_replicated,
      'w'   => :set_line_width,
      'W'   => :set_clipping_path_with_nonzero,
      'W*'  => :set_clipping_path_with_even_odd,
      'y'   => :append_curved_segment_final_point_replicated,
      '\''  => :move_to_next_line_and_show_text,
      '"'   => :set_spacing_next_line_show_text,
    }
    ################################################################################
    # Create a new PDF::Reader::Content object to process the contents of PDF file
    # - receiver - an object containing the required callback methods
    # - xref     - a PDF::Reader::Xref object that contains references to all the objects in a PDF file
    def initialize (receiver, xref)
      @receiver = receiver
      @xref     = xref
      @fonts ||= {}
    end
    ################################################################################
    # Begin processing the document metadata
    def metadata (info)
      info = utf16_to_utf8(info)
      callback(:metadata, [info]) if info
    end
    ################################################################################
    # Begin processing the document
    def document (root)
      if root[:Metadata]
        obj, stream = @xref.object(root[:Metadata])
        callback(:xml_metadata,stream) 
      end
      callback(:begin_document, [root])
      walk_pages(@xref.object(root[:Pages]).first)
      callback(:end_document)
    end
    ################################################################################
    # Walk over all pages in the PDF file, calling the appropriate callbacks for each page and all 
    # its content
    def walk_pages (page)
      
      if page[:Resources]
        res = page[:Resources]
        page.delete(:Resources)
      end

      # extract page content
      if page[:Type] == :Pages
        callback(:begin_page_container, [page])
        walk_resources(@xref.object(res).first) if res
        page[:Kids].each {|child| walk_pages(@xref.object(child).first)}
        callback(:end_page_container)
      elsif page[:Type] == :Page
        callback(:begin_page, [page])
        walk_resources(@xref.object(res).first) if res
        @page = page
        @params = []

        if page[:Contents].kind_of?(Array)
          contents = page[:Contents]
        elsif @xref.obj_type(page[:Contents]) == :Array
          contents, stream = @xref.object(page[:Contents])
        else
          contents = [page[:Contents]]
        end
        
        contents.each do |content|
          obj, stream = @xref.object(content)
          content_stream(stream)
        end if page.has_key?(:Contents) and page[:Contents]

        callback(:end_page)
      end
    end
    ################################################################################
    # Reads a PDF content stream and calls all the appropriate callback methods for the operators
    # it contains
    def content_stream (instructions)
      @buffer = Buffer.new(StringIO.new(instructions))
      @parser = Parser.new(@buffer, @xref)
      @params = [] if @params.nil?

      until @buffer.eof?
        loop do
          token = @parser.parse_token(OPERATORS)

          if token.kind_of?(Token) and OPERATORS.has_key?(token) 
            @current_font = @params.first if OPERATORS[token] == :set_text_font_and_size

            # handle special cases in response to certain operators
            if OPERATORS[token].to_s.include?("show_text") && @fonts[@current_font]
              # convert any text to utf-8
              @params = @fonts[@current_font].to_utf8(@params)
            elsif token == "ID"
              # inline image data, first convert the current params into a more familiar hash
              map = {}
              @params.each_slice(2) do |a|
                map[a.first] = a.last
              end
              @params = [map]
              # read the raw image data from the buffer without tokenising
              @params << @buffer.read_until("EI")
            end
            callback(OPERATORS[token], @params)
            @params.clear
            break
          end

          @params << token
        end
      end
    rescue EOFError => e
      raise MalformedPDFError, "End Of File while processing a content stream"
    end
    ################################################################################
    def walk_resources(resources)
      resources = resolve_references(resources)
      
      # extract any procset information
      if resources[:ProcSet]
        callback(:resource_procset, resources[:ProcSet])
      end

      # extract any xobject information
      if resources[:XObject]
        @xref.object(resources[:XObject]).first.each do |name, val|
          obj, stream = @xref.object(val)
          callback(:resource_xobject, [name, obj, stream])
        end
      end

      # extract any extgstate information
      if resources[:ExtGState]
        @xref.object(resources[:ExtGState]).first.each do |name, val|
          callback(:resource_extgstate, [name, @xref.object(val).first])
        end
      end

      # extract any colorspace information
      if resources[:ColorSpace]
        @xref.object(resources[:ColorSpace]).first.each do |name, val|
          callback(:resource_colorspace, [name, @xref.object(val).first])
        end
      end

      # extract any pattern information
      if resources[:Pattern]
        @xref.object(resources[:Pattern]).first.each do |name, val|
          callback(:resource_pattern, [name, @xref.object(val).first])
        end
      end

      # extract any font information
      if resources[:Font]
        @xref.object(resources[:Font]).first.each do |label, desc|
          desc = @xref.object(desc).first
          @fonts[label] = PDF::Reader::Font.new
          @fonts[label].label = label
          @fonts[label].subtype = desc[:Subtype] if desc[:Subtype]
          @fonts[label].basefont = desc[:BaseFont] if desc[:BaseFont]
          @fonts[label].encoding = PDF::Reader::Encoding.factory(@xref.object(desc[:Encoding]).first)
          @fonts[label].descendantfonts = desc[:DescendantFonts] if desc[:DescendantFonts]
          if desc[:ToUnicode]
            # this stream is a cmap
            begin
              @fonts[label].tounicode = PDF::Reader::CMap.new(desc[:ToUnicode])
            rescue
              # if the CMap fails to parse, don't worry too much. Means we can't translate the text properly
            end
          end
          callback(:resource_font, [label, @fonts[label]])
        end
      end
    end
    ################################################################################
    # Convert any PDF::Reader::Resource objects into a real object 
    def resolve_references(obj)
      case obj
      when PDF::Reader::Reference then 
        obj, stream = @xref.object(obj)
        if stream
          stream
        else
          resolve_references(obj)
        end
      when Hash                   then obj.each { |key,val| obj[key] = resolve_references(val) }
      when Array                  then obj.collect { |item| resolve_references(item) }
      else
        obj
      end
    end
    ################################################################################
    # calls the name callback method on the receiver class with params as the arguments
    def callback (name, params=[])
      @receiver.send(name, *params) if @receiver.respond_to?(name)
    end
    ################################################################################
    private
    def utf16_to_utf8(obj)
      case obj
      when String then 
        if obj[0,2] == "\376\377"
          obj[2, obj.size-2].unpack("n*").pack("U*")
        else
          obj
        end
      when Hash   then obj.each { |key,val| obj[key] = utf16_to_utf8(val) }
      when Array  then obj.collect { |item| utf16_to_utf8(item) }
      else
        obj
      end
    end
  end
  ################################################################################
end
################################################################################