lib/pdf/reader/pages_strategy.rb



################################################################################
#
# Copyright (C) 2006 Peter J Jones (pjones@pmade.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################

class PDF::Reader
  ################################################################################
  # Walks the pages of the PDF file and calls the appropriate callback methods when
  # something of interest is found.
  #
  # The callback methods should exist on the receiver object passed into the constructor. Whenever
  # some content is found that will trigger a callback, the receiver is checked to see if the callback
  # is defined.
  #
  # If it is defined it will be called. If not, processing will continue.
  #
  # = Available Callbacks
  # The following callbacks are available and should be methods defined on your receiver class. Only
  # implement the ones you need - the rest will be ignored.
  #
  # Some callbacks will include parameters which will be passed in as an array. For callbacks that supply no
  # parameters, or where you don't need them, the *params argument can be left off. Some example callback
  # method definitions are:
  #
  #   def begin_document
  #   def end_page
  #   def show_text(string, *params)
  #   def fill_stroke(*params)
  #
  # You should be able to infer the basic command the callback is reporting based on the name. For
  # further experimentation, define the callback with just a *params parameter, then print out the
  # contents of the array using something like:
  #
  #   puts params.inspect
  #
  # == Text Callbacks
  #
  # All text passed into these callbacks will be encoded as UTF-8. Depending on where (and when) the
  # PDF was generated, there's a good chance the text is NOT stored as UTF-8 internally so be careful
  # when doing a comparison on strings returned from PDF::Reader (when doing unit tests for example). The
  # string may not be byte-by-byte identical with the string that was originally written to the PDF.
  #
  # - end_text_object
  # - move_to_start_of_next_line
  # - set_character_spacing
  # - move_text_position
  # - move_text_position_and_set_leading
  # - set_text_font_and_size
  # - show_text
  # - show_text_with_positioning
  # - set_text_leading
  # - set_text_matrix_and_text_line_matrix
  # - set_text_rendering_mode
  # - set_text_rise
  # - set_word_spacing
  # - set_horizontal_text_scaling
  # - move_to_next_line_and_show_text
  # - set_spacing_next_line_show_text
  #
  # If the :raw_text option was passed to the PDF::Reader class the following callbacks
  # may also appear:
  #
  # - show_text_raw
  # - show_text_with_positioning_raw
  # - move_to_next_line_and_show_text_raw
  # - set_spacing_next_line_show_text_raw
  #
  # == Graphics Callbacks
  # - close_fill_stroke
  # - fill_stroke
  # - close_fill_stroke_with_even_odd
  # - fill_stroke_with_even_odd
  # - begin_marked_content_with_pl
  # - begin_inline_image
  # - begin_marked_content
  # - begin_text_object
  # - append_curved_segment
  # - concatenate_matrix
  # - set_stroke_color_space
  # - set_nonstroke_color_space
  # - set_line_dash
  # - set_glyph_width
  # - set_glyph_width_and_bounding_box
  # - invoke_xobject
  # - define_marked_content_with_pl
  # - end_inline_image
  # - end_marked_content
  # - fill_path_with_nonzero
  # - fill_path_with_nonzero
  # - fill_path_with_even_odd
  # - set_gray_for_stroking
  # - set_gray_for_nonstroking
  # - set_graphics_state_parameters
  # - close_subpath
  # - set_flatness_tolerance
  # - begin_inline_image_data
  # - set_line_join_style
  # - set_line_cap_style
  # - set_cmyk_color_for_stroking
  # - set_cmyk_color_for_nonstroking
  # - append_line
  # - begin_new_subpath
  # - set_miter_limit
  # - define_marked_content_point
  # - end_path
  # - save_graphics_state
  # - restore_graphics_state
  # - append_rectangle
  # - set_rgb_color_for_stroking
  # - set_rgb_color_for_nonstroking
  # - set_color_rendering_intent
  # - close_and_stroke_path
  # - stroke_path
  # - set_color_for_stroking
  # - set_color_for_nonstroking
  # - set_color_for_stroking_and_special
  # - set_color_for_nonstroking_and_special
  # - paint_area_with_shading_pattern
  # - append_curved_segment_initial_point_replicated
  # - set_line_width
  # - set_clipping_path_with_nonzero
  # - set_clipping_path_with_even_odd
  # - append_curved_segment_final_point_replicated
  #
  # == Misc Callbacks
  # - begin_compatibility_section
  # - end_compatibility_section
  # - begin_document
  # - end_document
  # - begin_page_container
  # - end_page_container
  # - begin_page
  # - end_page
  # - metadata
  # - xml_metadata
  # - page_count
  # - begin_form_xobject
  # - end_form_xobject
  #
  # == Resource Callbacks
  #
  # Each page can contain (or inherit) a range of resources required for the page,
  # including things like fonts and images. The following callbacks may appear
  # after begin_page if the relevant resources exist on a page:
  #
  # - resource_procset
  # - resource_xobject
  # - resource_extgstate
  # - resource_colorspace
  # - resource_pattern
  # - resource_font
  #
  # In most cases, these callbacks associate a name with each resource, allowing it
  # to be referred to by name in the page content. For example, an XObject can hold an image.
  # If it gets mapped to the name "IM1", then it can be placed on the page using
  # invoke_xobject "IM1".
  #
  # DEPRECATED: this class was deprecated in version 0.11.0 and will
  #             eventually be removed
  class PagesStrategy< AbstractStrategy # :nodoc:
    # Maps each PDF content stream operator to the name of the callback that is
    # reported when the operator is encountered in a content stream.
    #
    # The table is frozen: it is shared by every instance of the strategy and is
    # only ever read, so freezing protects it from accidental modification.
    OPERATORS = {
      'b'   => :close_fill_stroke,
      'B'   => :fill_stroke,
      'b*'  => :close_fill_stroke_with_even_odd,
      'B*'  => :fill_stroke_with_even_odd,
      'BDC' => :begin_marked_content_with_pl,
      'BI'  => :begin_inline_image,
      'BMC' => :begin_marked_content,
      'BT'  => :begin_text_object,
      'BX'  => :begin_compatibility_section,
      'c'   => :append_curved_segment,
      'cm'  => :concatenate_matrix,
      'CS'  => :set_stroke_color_space,
      'cs'  => :set_nonstroke_color_space,
      'd'   => :set_line_dash,
      'd0'  => :set_glyph_width,
      'd1'  => :set_glyph_width_and_bounding_box,
      'Do'  => :invoke_xobject,
      'DP'  => :define_marked_content_with_pl,
      'EI'  => :end_inline_image,
      'EMC' => :end_marked_content,
      'ET'  => :end_text_object,
      'EX'  => :end_compatibility_section,
      'f'   => :fill_path_with_nonzero,
      'F'   => :fill_path_with_nonzero,
      'f*'  => :fill_path_with_even_odd,
      'G'   => :set_gray_for_stroking,
      'g'   => :set_gray_for_nonstroking,
      'gs'  => :set_graphics_state_parameters,
      'h'   => :close_subpath,
      'i'   => :set_flatness_tolerance,
      'ID'  => :begin_inline_image_data,
      'j'   => :set_line_join_style,
      'J'   => :set_line_cap_style,
      'K'   => :set_cmyk_color_for_stroking,
      'k'   => :set_cmyk_color_for_nonstroking,
      'l'   => :append_line,
      'm'   => :begin_new_subpath,
      'M'   => :set_miter_limit,
      'MP'  => :define_marked_content_point,
      'n'   => :end_path,
      'q'   => :save_graphics_state,
      'Q'   => :restore_graphics_state,
      're'  => :append_rectangle,
      'RG'  => :set_rgb_color_for_stroking,
      'rg'  => :set_rgb_color_for_nonstroking,
      'ri'  => :set_color_rendering_intent,
      's'   => :close_and_stroke_path,
      'S'   => :stroke_path,
      'SC'  => :set_color_for_stroking,
      'sc'  => :set_color_for_nonstroking,
      'SCN' => :set_color_for_stroking_and_special,
      'scn' => :set_color_for_nonstroking_and_special,
      'sh'  => :paint_area_with_shading_pattern,
      'T*'  => :move_to_start_of_next_line,
      'Tc'  => :set_character_spacing,
      'Td'  => :move_text_position,
      'TD'  => :move_text_position_and_set_leading,
      'Tf'  => :set_text_font_and_size,
      'Tj'  => :show_text,
      'TJ'  => :show_text_with_positioning,
      'TL'  => :set_text_leading,
      'Tm'  => :set_text_matrix_and_text_line_matrix,
      'Tr'  => :set_text_rendering_mode,
      'Ts'  => :set_text_rise,
      'Tw'  => :set_word_spacing,
      'Tz'  => :set_horizontal_text_scaling,
      'v'   => :append_curved_segment_initial_point_replicated,
      'w'   => :set_line_width,
      'W'   => :set_clipping_path_with_nonzero,
      'W*'  => :set_clipping_path_with_even_odd,
      'y'   => :append_curved_segment_final_point_replicated,
      '\''  => :move_to_next_line_and_show_text,
      '"'   => :set_spacing_next_line_show_text,
    }.freeze
    # Returns the symbol naming this strategy (:pages). The same symbol is the
    # options key that #process checks before doing any work.
    def self.to_sym
      :pages
    end
    ################################################################################
    # Entry point of the strategy: reports the start of the document, walks the
    # page tree found under the root's :Pages entry and then reports the end of
    # the document.
    #
    # Returns false, without triggering any callback, when the :pages option is
    # not set. Otherwise returns the result of the final :end_document callback.
    def process
      if options[:pages]
        callback(:begin_document, [root])
        page_tree = @ohash.object(root[:Pages])
        walk_pages(page_tree)
        callback(:end_document)
      else
        false
      end
    end
    private
    ################################################################################
    # Recursively traverse the page tree starting at the supplied node, firing
    # the relevant callbacks for every container, every page and each page's
    # content.
    #
    # Nodes whose :Type is neither :Pages nor :Page are silently ignored.
    def walk_pages(page)
      if page[:Type] == :Pages
        walk_page_container(page)
      elsif page[:Type] == :Page
        walk_single_page(page)
      end
    end

    # Handle a :Pages node: report it, make its resources (if any) current
    # while each child in :Kids is walked, then report the end of the container.
    def walk_page_container(page)
      callback(:begin_page_container, [page])

      container_resources = @ohash.object(page[:Resources])
      resources.push(container_resources) if container_resources

      kids = @ohash.object(page[:Kids])
      kids.each do |kid|
        walk_pages(@ohash.object(kid))
      end

      resources.pop if container_resources
      callback(:end_page_container)
    end

    # Handle a :Page node: report it, announce the resources in effect, process
    # its content stream(s) when it has any, then report the end of the page.
    def walk_single_page(page)
      callback(:begin_page, [page])

      page_resources = @ohash.object(page[:Resources])
      resources.push(page_resources) if page_resources
      walk_resources(current_resources)

      # :Contents is either an array of streams or a single stream; normalise
      # to an array either way
      contents = if @ohash.object(page[:Contents]).kind_of?(Array)
                   @ohash.object(page[:Contents])
                 else
                   [page[:Contents]]
                 end

      fonts = font_hash_from_resources(current_resources)

      if page.has_key?(:Contents) && page[:Contents]
        streams = contents.map { |item| @ohash.object(item) }
        content_stream(streams, fonts)
      end

      resources.pop if page_resources
      callback(:end_page)
    end
    ################################################################################
    # Retrieve the XObject for the supplied label and, if it's a Form, walk it
    # like a regular page content stream.
    #
    # label - the name under which the XObject is registered in the :XObject
    #         entry of the current (merged) resources.
    #
    # Nothing is reported when the label is unknown or the XObject's :Subtype is
    # not :Form.
    #
    # When the form carries its own :Resources they are pushed onto the resource
    # stack (and announced via walk_resources) for the duration of the form.
    # Fonts are then looked up in the merged current resources rather than only
    # in the form's own dictionary, so a form that relies on the fonts of the
    # page that invokes it no longer fails with "Unknown font".
    #
    def walk_xobject_form(label)
      xobjects = @ohash.object(current_resources[:XObject]) || {}
      xobject  = @ohash.object(xobjects[label])

      return unless xobject && xobject.hash[:Subtype] == :Form

      callback(:begin_form_xobject)
      xobj_resources = @ohash.object(xobject.hash[:Resources])
      if xobj_resources
        resources.push xobj_resources
        walk_resources(xobj_resources)
      end

      # current_resources includes the form's own resources (pushed above, so
      # they take precedence) as well as everything inherited from the page
      fonts = font_hash_from_resources(current_resources)
      content_stream(xobject, fonts)
      callback(:end_form_xobject)
      resources.pop if xobj_resources
    end

    ################################################################################
    # Build a single hash out of every entry on the resource stack (page tree
    # containers, the page itself and any form xobject being walked). Entries
    # are merged in stack order, so later entries replace earlier ones that use
    # the same key. A new hash is returned on every call.
    #
    def current_resources
      resources.inject({}) { |merged, res| merged.merge!(res) }
    end
    ################################################################################
    # Reads a PDF content stream and calls all the appropriate callback methods for the operators
    # it contains
    #
    # instructions - a single stream/string or an array of them. PDF::Reader::Stream
    #                objects contribute their unfiltered data, anything else is
    #                converted with to_s. The parts are joined with a newline so
    #                that the last token of one part cannot fuse with the first
    #                token of the next.
    # fonts        - hash of font label => font object, used to validate the
    #                font selected by the Tf operator and to convert shown text
    #                to UTF-8.
    #
    # Raises MalformedPDFError when an unknown font is selected, when text is
    # shown before any font has been selected, or when the stream ends
    # unexpectedly (EOFError from the parser).
    #
    def content_stream(instructions, fonts = {})
      instructions = [instructions] unless instructions.kind_of?(Array)
      instructions = instructions.map { |ins|
        ins.is_a?(PDF::Reader::Stream) ? ins.unfiltered_data : ins.to_s
      }.join("\n")
      buffer       = Buffer.new(StringIO.new(instructions), :content_stream => true)
      parser       = Parser.new(buffer, @ohash)
      current_font = nil
      params       = []

      while (token = parser.parse_token(OPERATORS))
        # anything that isn't an operator is an operand for the next operator
        unless token.kind_of?(Token) && OPERATORS.has_key?(token)
          params << token
          next
        end

        operator = OPERATORS[token]

        if operator == :set_text_font_and_size
          current_font = params.first
          if fonts[current_font].nil?
            raise MalformedPDFError, "Unknown font #{current_font}"
          end
        end

        # handle special cases in response to certain operators
        if operator.to_s.include?("show_text")
          # convert any text to utf-8, but output the raw string if the user wants it
          if options[:raw_text]
            callback("#{operator}_raw".to_sym, params)
          end
          font = fonts[current_font]
          if font.nil?
            # no Tf operator has been seen yet, so there is nothing to decode with
            raise MalformedPDFError, "Text shown before a font was selected"
          end
          params = font.to_utf8(params)
        elsif token == "ID"
          # inline image data, first convert the current params into a more familiar hash
          map = {}
          params.each_slice(2) do |key, value|
            map[key] = value
          end
          params = [map, buffer.token]
        end

        callback(operator, params)

        if operator == :invoke_xobject
          # remember the label before the operands are discarded
          xobject_label = params.first
          params.clear
          walk_xobject_form(xobject_label)
        else
          params.clear
        end
      end
    rescue EOFError
      raise MalformedPDFError, "End Of File while processing a content stream"
    end
    ################################################################################
    # Report the contents of a resources dictionary through the resource_*
    # callbacks. Anything that does not respond to [] is ignored.
    #
    # References inside the dictionary are resolved first. The :ProcSet entry is
    # handed to its callback as-is; every entry of the :XObject, :ExtGState,
    # :ColorSpace and :Pattern dictionaries is reported as a [name, object]
    # pair; fonts are reported as [label, font] pairs.
    def walk_resources(resources)
      return unless resources.respond_to?(:[])

      resources = resolve_references(resources)

      # extract any procset information
      callback(:resource_procset, resources[:ProcSet]) if resources[:ProcSet]

      # the name => object dictionaries are all reported in the same way
      named_dictionaries = [
        [:XObject,    :resource_xobject],
        [:ExtGState,  :resource_extgstate],
        [:ColorSpace, :resource_colorspace],
        [:Pattern,    :resource_pattern]
      ]
      named_dictionaries.each do |key, event|
        next unless resources[key]

        @ohash.object(resources[key]).each do |name, val|
          callback(event, [name, @ohash.object(val)])
        end
      end

      # extract any font information
      if resources[:Font]
        font_hash_from_resources(resources).each do |label, font|
          callback(:resource_font, [label, font])
        end
      end
    end
    ################################################################################
    # Recursively replace any PDF::Reader::Reference objects with the object
    # they point to.
    #
    # Hashes and arrays are rebuilt with their values/items resolved. A
    # PDF::Reader::Stream has its hash replaced in place and is itself returned.
    # Any other object is returned untouched.
    def resolve_references(obj)
      case obj
      when PDF::Reader::Stream
        obj.hash = resolve_references(obj.hash)
        obj
      when PDF::Reader::Reference
        resolve_references(@ohash.object(obj))
      when Hash
        resolved = {}
        obj.each do |key, val|
          resolved[key] = resolve_references(val)
        end
        resolved
      when Array
        obj.map { |item| resolve_references(item) }
      else
        obj
      end
    end
    ################################################################################
    # Build a hash of font label => PDF::Reader::Font for every entry in the
    # :Font dictionary of the supplied resources.
    #
    # Returns an empty hash when the resources do not respond to [] or have no
    # :Font entry.
    def font_hash_from_resources(resources)
      return {} unless resources.respond_to?(:[])

      font_dict = @ohash.object(resources[:Font]) || {}
      font_dict.inject({}) do |fonts, (label, desc)|
        fonts[label] = PDF::Reader::Font.new(@ohash, @ohash.object(desc))
        fonts
      end
    end
    # Stack (Array) of the resource dictionaries currently in effect. It is
    # created empty the first time it is asked for and the same array is
    # returned on every later call.
    def resources
      @resources = [] unless @resources
      @resources
    end
  end
  ################################################################################
end
################################################################################