# coding: utf-8
require 'matrix'
require 'yaml'
begin
require 'psych'
rescue LoadError
end
module PDF
  class Reader
    # Receiver that records the text shown on a single page.
    #
    # Assign a page with #page=, feed the receiver the page's content stream
    # operators (each public method below corresponds to one operator), then
    # read the accumulated text back with #content.
    class PageTextReceiver

      # The graphics state every page starts with. This hash is shared by all
      # receivers, so it must never be modified in place; #page= pushes a copy.
      DEFAULT_GRAPHICS_STATE = {
        :ctm            => Matrix.identity(3),
        :char_spacing   => 0,
        :word_spacing   => 0,
        :h_scaling      => 100,
        :text_leading   => 0,
        :text_font      => nil,
        :text_font_size => nil,
        :text_mode      => 0,
        :text_rise      => 0,
        :text_knockout  => 0
      }

      # starting a new page
      #
      # Resets all per-page data: wrapped fonts, form fonts, recorded content
      # and the graphics state stack.
      def page=(page)
        @page       = page
        @objects    = page.objects
        @fonts      = build_fonts(page.fonts)
        @form_fonts = {}
        @content    = {}
        # Start from a copy of the defaults. The state setters below write to
        # the hash on top of the stack, and writing to the constant itself
        # would leak one page's text state into every later page.
        @stack      = [DEFAULT_GRAPHICS_STATE.dup]
      end

      # Returns the recorded text as a single string.
      #
      # Text is grouped by the y value computed in #show_text. Groups are
      # emitted from the largest y to the smallest and joined with newlines.
      def content
        @content.keys.sort.reverse.map { |y| @content[y] }.join("\n")
      end

      #####################################################
      # Graphics State Operators
      #####################################################

      # Push a copy of the current state, so later changes can be undone by
      # #restore_graphics_state.
      def save_graphics_state
        @stack.push clone_state
      end

      # Discard the current state and return to the one saved before it.
      def restore_graphics_state
        @stack.pop
      end

      #####################################################
      # Matrix Operators
      #####################################################

      # update the current transformation matrix.
      #
      # If the CTM is currently undefined, just store the new values.
      #
      # If there's an existing CTM, then multiply the existing matrix
      # with the new matrix to form the updated matrix.
      #
      def concatenate_matrix(a, b, c, d, e, f)
        transform = Matrix[
          [a, b, 0],
          [c, d, 0],
          [e, f, 1]
        ]
        current = state[:ctm]
        state[:ctm] = current ? transform * current : transform
      end

      #####################################################
      # Text Object Operators
      #####################################################

      # Reset the text matrix and text line matrix to identity.
      def begin_text_object
        @text_matrix      = Matrix.identity(3)
        @text_line_matrix = Matrix.identity(3)
      end

      # Reset the text matrix and text line matrix to identity.
      def end_text_object
        @text_matrix      = Matrix.identity(3)
        @text_line_matrix = Matrix.identity(3)
      end

      #####################################################
      # Text State Operators
      #####################################################

      # Each setter stores its argument(s) in the current graphics state.

      def set_character_spacing(char_spacing)
        state[:char_spacing] = char_spacing
      end

      def set_horizontal_text_scaling(h_scaling)
        state[:h_scaling] = h_scaling
      end

      def set_text_font_and_size(label, size)
        state[:text_font]      = label
        state[:text_font_size] = size
      end

      def set_text_leading(leading)
        state[:text_leading] = leading
      end

      def set_text_rendering_mode(mode)
        state[:text_mode] = mode
      end

      def set_text_rise(rise)
        state[:text_rise] = rise
      end

      def set_word_spacing(word_spacing)
        state[:word_spacing] = word_spacing
      end

      #####################################################
      # Text Positioning Operators
      #####################################################

      # Translate the text line matrix by (x, y) and make the result both the
      # text matrix and the text line matrix.
      def move_text_position(x, y) # Td
        translation = Matrix[
          [1, 0, 0],
          [0, 1, 0],
          [x, y, 1]
        ]
        @text_matrix = @text_line_matrix = translation * @text_line_matrix
      end

      # Same as #move_text_position, but first sets the text leading to -y.
      def move_text_position_and_set_leading(x, y) # TD
        set_text_leading(-1 * y)
        move_text_position(x, y)
      end

      # Replace both the text matrix and the text line matrix.
      def set_text_matrix_and_text_line_matrix(a, b, c, d, e, f) # Tm
        @text_matrix = @text_line_matrix = Matrix[
          [a, b, 0],
          [c, d, 0],
          [e, f, 1]
        ]
      end

      # Move down by the current text leading.
      def move_to_start_of_next_line # T*
        move_text_position(0, -state[:text_leading])
      end

      #####################################################
      # Text Showing Operators
      #####################################################

      # record text that is drawn on the page
      #
      # The string is converted to UTF-8 by the current font and appended to
      # the text already recorded at the same y position.
      #
      # Raises PDF::Reader::MalformedPDFError if no font can be found for the
      # label stored in the current state.
      def show_text(string) # Tj
        raise PDF::Reader::MalformedPDFError, "current font is invalid" if current_font.nil?

        at = transform(Point.new(0, 0))
        @content[at.y] ||= ""
        @content[at.y] << current_font.to_utf8(string)
      end

      # params is an array mixing strings and numbers. Strings are shown via
      # #show_text. A number greater than 1000 is recorded as a single space;
      # any other element is ignored.
      def show_text_with_positioning(params) # TJ
        params.each { |arg|
          case arg
          when String
            show_text(arg)
          when Numeric
            # Numeric rather than Fixnum/Float: Fixnum no longer exists on
            # current rubies, and Numeric also covers Integer on old ones.
            show_text(" ") if arg > 1000
          end
        }
      end

      def move_to_next_line_and_show_text(str) # '
        move_to_start_of_next_line
        show_text(str)
      end

      def set_spacing_next_line_show_text(aw, ac, string) # "
        set_word_spacing(aw)
        set_character_spacing(ac)
        move_to_next_line_and_show_text(string)
      end

      #####################################################
      # XObjects
      #####################################################

      # Look up the xobject stored under label on the current page. If it is
      # a form, walk its content with this receiver so its text is recorded.
      #
      # The graphics state is saved before and restored after, and the form's
      # optional :Matrix entry is concatenated onto the CTM in between.
      def invoke_xobject(label)
        save_graphics_state
        xobject = @objects.deref(@page.xobjects[label])

        matrix = xobject.hash[:Matrix]
        concatenate_matrix(*matrix) if matrix

        # Remember the fonts of the form we are currently inside (if any) so
        # they are available again once a nested form has been walked.
        enclosing_form_fonts = @form_fonts
        if xobject.hash[:Subtype] == :Form
          form = PDF::Reader::FormXObject.new(@page, xobject)
          @form_fonts = form.fonts
          form.walk(self)
        end
        @form_fonts = enclosing_form_fonts

        restore_graphics_state
      end

      private

      # wrap the raw PDF Font objects in handy ruby Font objects.
      #
      def build_fonts(raw_fonts)
        wrapped_fonts = raw_fonts.map { |label, font|
          [label, PDF::Reader::Font.new(@objects, @objects.deref(font))]
        }
        ::Hash[wrapped_fonts]
      end

      # transform x and y co-ordinates from the current text space to the
      # underlying device space.
      #
      # z is the third (homogeneous) co-ordinate handed to Point#transform.
      #
      def transform(point, z = 1)
        trm = text_rendering_matrix
        point.transform(trm, z)
      end

      # Build the text rendering matrix from the font size, horizontal
      # scaling and text rise in the current state, the text matrix and the
      # CTM.
      #
      # NOTE(review): :h_scaling is stored as given (default 100), which looks
      # like a percentage that should be divided by 100 here - confirm against
      # the PDF spec. It currently makes no difference because only the origin
      # (0, 0) is ever transformed, and the scale factor is multiplied by 0.
      def text_rendering_matrix
        state_matrix = Matrix[
          [state[:text_font_size] * state[:h_scaling], 0, 0],
          [0, state[:text_font_size], 0],
          [0, state[:text_rise], 1]
        ]
        state_matrix * @text_matrix * ctm
      end

      # the graphics state currently in effect (top of the stack)
      #
      def state
        @stack.last
      end

      # when save_graphics_state is called, we need to push a new copy of the
      # current state onto the stack. That way any modifications to the state
      # will be undone once restore_graphics_state is called.
      #
      # This returns a deep clone of the current state, ensuring changes are
      # kept separate from earlier states. An empty hash is returned if the
      # stack has been emptied by unbalanced restores.
      #
      # Marshal is used to round-trip the state through a string to perform
      # the deep clone. A YAML round trip was used previously, but recent
      # Psych versions make `load` a safe load that refuses to rebuild
      # arbitrary objects such as the Matrix stored under :ctm.
      #
      def clone_state
        if @stack.empty?
          {}
        else
          Marshal.load Marshal.dump(@stack.last)
        end
      end

      # return the current transformation matrix
      #
      def ctm
        state[:ctm]
      end

      # The font for the label in the current state. Fonts of the form being
      # walked take precedence over the page's fonts. Returns nil if neither
      # has the label.
      def current_font
        @form_fonts[state[:text_font]] || @fonts[state[:text_font]]
      end

      # private class for representing points on a cartesian plane. Used by
      # #transform to map a text space position through a matrix.
      #
      Point = Struct.new(:x, :y) do
        # Returns a new point: the row vector [x, y, z] multiplied by the
        # first two columns of the 3x3 matrix trm.
        def transform(trm, z)
          self.class.new(
            (trm[0,0] * x) + (trm[1,0] * y) + (trm[2,0] * z),
            (trm[0,1] * x) + (trm[1,1] * y) + (trm[2,1] * z)
          )
        end

        # Straight line distance between this point and another.
        def distance(point)
          # x and y are struct members, not instance variables
          Math.hypot(point.x - x, point.y - y)
        end
      end
    end
  end
end