# lib/pdf/reader/page_text_receiver.rb



# coding: utf-8

require 'matrix'
require 'yaml'

begin
  require 'psych'
rescue LoadError
end

module PDF
  class Reader
    class PageTextReceiver

      # Baseline values for every entry of the graphics state hash. `page=`
      # uses this hash to seed the state stack whenever a new page starts.
      #
      # NOTE(review): the hash is not frozen, so any code that writes to it
      # directly changes the defaults seen by every later reader.
      DEFAULT_GRAPHICS_STATE = {
        :ctm          => Matrix.identity(3),
        :char_spacing => 0,
        :word_spacing => 0,
        :h_scaling    => 100, # presumably a percentage (100 = unscaled) - confirm
        :text_leading => 0,
        :text_font    => nil, # no font is selected until set_text_font_and_size runs
        :text_font_size => nil,
        :text_mode    => 0,
        :text_rise    => 0,
        :text_knockout => 0
      }

      # Starting a new page: remember the page and its object store, wrap its
      # fonts, and reset the collected text, form fonts and graphics state.
      #
      # page - an object responding to #objects and #fonts.
      #
      def page=(page)
        @page    = page
        @objects = page.objects
        @fonts   = build_fonts(page.fonts)
        @form_fonts = {}
        @content = {}
        # Seed the stack with a copy. The state setters write into the hash on
        # top of the stack, so pushing the constant itself would let one page
        # permanently alter the defaults used by every later page.
        @stack   = [DEFAULT_GRAPHICS_STATE.dup]
      end

      # Returns all recorded text as one string: one line per distinct key in
      # @content, ordered from the largest key to the smallest and joined
      # with newlines.
      #
      def content
        top_to_bottom = @content.keys.sort.reverse
        @content.values_at(*top_to_bottom).join("\n")
      end

      #####################################################
      # Graphics State Operators
      #####################################################

      # Pushes a copy of the current state (as produced by clone_state) onto
      # the stack, so later changes can be undone by restore_graphics_state.
      #
      def save_graphics_state
        @stack << clone_state
      end

      # Removes the state on top of the stack, so the state that was current
      # before the matching save_graphics_state becomes current again.
      #
      # Returns the removed state. There is no guard against popping the last
      # entry; once the stack is empty, `state` returns nil.
      #
      def restore_graphics_state
        @stack.pop
      end

      #####################################################
      # Matrix Operators
      #####################################################

      # Update the current transformation matrix (CTM) with the six numbers
      # of a new transform.
      #
      # The numbers are expanded into a 3x3 matrix. When the state already
      # holds a CTM the new matrix is multiplied onto it from the left;
      # when it does not, the new matrix is stored as is.
      #
      def concatenate_matrix(a, b, c, d, e, f)
        incoming = Matrix[[a, b, 0], [c, d, 0], [e, f, 1]]
        existing = state[:ctm]

        state[:ctm] = existing ? incoming * existing : incoming
      end

      #####################################################
      # Text Object Operators
      #####################################################

      # A text object is starting: reset the text matrix and the text line
      # matrix to separate 3x3 identity matrices.
      #
      def begin_text_object
        @text_line_matrix = Matrix.identity(3)
        @text_matrix      = Matrix.identity(3)
      end

      # A text object has ended: put the text matrix and the text line
      # matrix back to separate 3x3 identity matrices.
      #
      def end_text_object
        @text_line_matrix = Matrix.identity(3)
        @text_matrix      = Matrix.identity(3)
      end

      #####################################################
      # Text State Operators
      #####################################################

      # Records the character spacing in the current graphics state and
      # returns the value that was stored.
      #
      def set_character_spacing(char_spacing)
        state.store(:char_spacing, char_spacing)
      end

      # Records the horizontal text scaling in the current graphics state.
      # The value is stored exactly as given; no conversion is applied here.
      #
      def set_horizontal_text_scaling(h_scaling)
        state.store(:h_scaling, h_scaling)
      end

      # Records the label of the selected font and its size in the current
      # graphics state. The label is what current_font later uses as a key.
      #
      def set_text_font_and_size(label, size)
        state.store(:text_font, label)
        state.store(:text_font_size, size)
      end

      # Records the text leading in the current graphics state. The stored
      # value is read back by move_to_start_of_next_line.
      #
      def set_text_leading(leading)
        state.store(:text_leading, leading)
      end

      # Records the text rendering mode in the current graphics state.
      #
      def set_text_rendering_mode(mode)
        state.store(:text_mode, mode)
      end

      # Records the text rise in the current graphics state. The stored value
      # feeds into the matrix built by text_rendering_matrix.
      #
      def set_text_rise(rise)
        state.store(:text_rise, rise)
      end

      # Records the word spacing in the current graphics state and returns
      # the value that was stored.
      #
      def set_word_spacing(word_spacing)
        state.store(:word_spacing, word_spacing)
      end

      #####################################################
      # Text Positioning Operators
      #####################################################

      # Offsets the text line matrix by (x, y) and makes the text matrix
      # point at the very same, freshly computed matrix.
      #
      def move_text_position(x, y) # Td
        translation = Matrix[[1, 0, 0], [0, 1, 0], [x, y, 1]]

        @text_line_matrix = translation * @text_line_matrix
        @text_matrix      = @text_line_matrix
      end

      # Stores the negated y offset as the text leading, then moves the text
      # position by (x, y).
      #
      def move_text_position_and_set_leading(x, y) # TD
        set_text_leading(-y)
        move_text_position(x, y)
      end

      # Replaces both the text matrix and the text line matrix with a single
      # new 3x3 matrix built from the six numbers given.
      #
      def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
        rows = [[a, b, 0], [c, d, 0], [e, f, 1]]

        @text_line_matrix = Matrix.rows(rows)
        @text_matrix      = @text_line_matrix
      end

      # Moves the text position down by the text leading currently held in
      # the graphics state, leaving x untouched.
      #
      def move_to_start_of_next_line # T*
        leading = state[:text_leading]
        move_text_position(0, -leading)
      end

      #####################################################
      # Text Showing Operators
      #####################################################

      # Record text that is drawn on the page.
      #
      # The string is converted with the current font's #to_utf8 and appended
      # to the line stored under the y co-ordinate that the text origin (0,0)
      # maps to.
      #
      # Raises PDF::Reader::MalformedPDFError when no current font is found.
      #
      def show_text(string) # Tj
        font = current_font
        if font.nil?
          raise PDF::Reader::MalformedPDFError, "current font is invalid"
        end

        baseline = transform(Point.new(0,0)).y
        line = (@content[baseline] ||= "")
        line << font.to_utf8(string)
      end

      # Handles an array that mixes strings with numeric position adjustments.
      #
      # Each String is passed to show_text. Each Integer or Float greater than
      # 1000 is recorded as a single space; other entries are ignored.
      #
      # params - an Array of Strings and numbers.
      #
      # Returns params.
      #
      # NOTE(review): in the PDF spec a TJ number is subtracted from the pen
      # position, so a large *negative* value is what opens a gap - confirm
      # that the `> 1000` test is the intended one.
      #
      def show_text_with_positioning(params) # TJ
        params.each do |arg|
          case arg
          when String
            show_text(arg)
          when Integer, Float
            # Integer instead of Fixnum: Fixnum no longer exists on current
            # rubies, and referencing it raised NameError for every number.
            show_text(" ") if arg > 1000
          end
        end
      end

      # Moves to the start of the next line, then records str as shown text.
      # The order matters: show_text files the text under the position that
      # is current at the time it is called.
      #
      def move_to_next_line_and_show_text(str) # '
        move_to_start_of_next_line
        show_text(str)
      end

      # Stores aw as the word spacing and ac as the character spacing, then
      # moves to the next line and records string as shown text.
      #
      def set_spacing_next_line_show_text(aw, ac, string) # "
        set_character_spacing(ac)
        set_word_spacing(aw)
        move_to_next_line_and_show_text(string)
      end

      #####################################################
      # XObjects
      #####################################################
      # Processes the XObject registered on the page under label.
      #
      # The graphics state is saved first and restored at the end. If the
      # XObject's hash has a :Matrix entry it is concatenated onto the CTM.
      # If its :Subtype is :Form, the form's fonts become the active form
      # fonts while the form is walked with this receiver.
      #
      # Returns the result of restore_graphics_state.
      #
      def invoke_xobject(label)
        save_graphics_state
        # Remember the fonts of whatever form we are already inside. Resetting
        # to an empty hash afterwards would strip the fonts of the enclosing
        # form as soon as a nested form returned.
        enclosing_form_fonts = @form_fonts

        xobject = @objects.deref(@page.xobjects[label])

        matrix = xobject.hash[:Matrix]
        concatenate_matrix(*matrix) if matrix

        if xobject.hash[:Subtype] == :Form
          form = PDF::Reader::FormXObject.new(@page, xobject)
          @form_fonts = form.fonts
          form.walk(self)
        end
        @form_fonts = enclosing_form_fonts

        restore_graphics_state
      end

      private

      # Wrap the raw PDF Font objects in handy ruby Font objects.
      #
      # raw_fonts - label/font pairs; each font is dereferenced through
      #             @objects before being wrapped.
      #
      # Returns a new Hash that maps each label to a PDF::Reader::Font.
      #
      def build_fonts(raw_fonts)
        raw_fonts.each_with_object({}) do |(label, font), wrapped|
          wrapped[label] = PDF::Reader::Font.new(@objects, @objects.deref(font))
        end
      end

      # Transform x and y co-ordinates from the current text space to the
      # underlying device space.
      #
      # point - an object responding to #transform(matrix, z), e.g. a Point.
      # z     - third co-ordinate handed to point.transform (default: 1).
      #
      # Returns whatever point.transform returns.
      #
      def transform(point, z = 1)
        # The rendering matrix is built once per call; previously it was
        # computed twice and the first result was thrown away.
        point.transform(text_rendering_matrix, z)
      end

      # Builds the matrix that maps text space to device space: a matrix made
      # from the font size, horizontal scaling and text rise, multiplied by
      # the text matrix and then by the CTM.
      #
      # Requires a font size to be present in the state.
      #
      def text_rendering_matrix
        font_size = state[:text_font_size]
        # :h_scaling is kept as a percentage (100 means unscaled), so it has
        # to be turned into a factor before it scales the font size.
        h_factor = state[:h_scaling] / 100.0

        state_matrix = Matrix[
                         [font_size * h_factor, 0, 0],
                         [0, font_size, 0],
                         [0, state[:text_rise], 1]
                       ]

        state_matrix * @text_matrix * ctm
      end

      # The graphics state currently in effect: the entry on top of the
      # stack, or nil when the stack is empty.
      #
      def state
        @stack[-1]
      end

      # When save_graphics_state is called, we need to push a new copy of the
      # current state onto the stack. That way any modifications to the state
      # will be undone once restore_graphics_state is called.
      #
      # This returns a deep clone of the current state, ensuring changes are
      # kept separate from earlier states. An empty stack yields an empty Hash.
      #
      # The clone is made by round-tripping the state through Marshal. A YAML
      # round trip is not usable for this: recent Psych versions refuse to
      # load Symbols and arbitrary objects (such as the Matrix held under
      # :ctm) unless they are explicitly permitted.
      #
      def clone_state
        return {} if @stack.empty?

        Marshal.load(Marshal.dump(@stack.last))
      end

      # Picks the YAML implementation to use: Psych when that constant is
      # defined, otherwise YAML.
      #
      def yaml_lib
        return Psych if Kernel.const_defined?("Psych")

        YAML
      end

      # Return the current transformation matrix, i.e. the :ctm entry of the
      # graphics state in effect (nil when that entry is missing).
      #
      def ctm
        graphics_state = state
        graphics_state[:ctm]
      end

      # Looks up the font selected in the graphics state. Fonts of the form
      # being processed take precedence over the page's own fonts; nil is
      # returned when neither collection has the label.
      #
      def current_font
        label = state[:text_font]
        @form_fonts[label] || @fonts[label]
      end

      # Private class for representing points on a cartesian plane. Used to
      # simplify the maths of mapping text co-ordinates through a matrix.
      #
      class Point < Struct.new(:x, :y)
        # Returns a new Point: this point, extended with z as third
        # co-ordinate, multiplied as a row vector with trm.
        #
        # trm - any object indexable as trm[row, column], e.g. a 3x3 Matrix.
        #
        def transform(trm, z)
          Point.new(
            (trm[0,0] * x) + (trm[1,0] * y) + (trm[2,0] * z),
            (trm[0,1] * x) + (trm[1,1] * y) + (trm[2,1] * z)
          )
        end

        # Returns the straight-line distance between this point and point.
        #
        def distance(point)
          # Struct members are reached through their accessors; @x and @y are
          # never set on a Struct, so reading them yields nil.
          Math.hypot(point.x - x, point.y - y)
        end
      end
    end
  end
end