################################################################################
#
# Copyright (C) 2008 James Healy (jimmy@deefa.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################
class PDF::Reader
# Converts the raw bytes of a string into UTF-8 text for one encoding. It
# combines a per-encoding mapping file, an optional differences table and an
# optional decoder object (the tounicode argument of #to_utf8).
class Encoding # :nodoc:
# Code points 0-31 other than 9, 10 and 13 (tab, line feed, carriage return).
# A code point from this list that neither the decoder nor the mapping
# translates is replaced with UNKNOWN_CHAR instead of being emitted as-is.
CONTROL_CHARS = [0,1,2,3,4,5,6,7,8,11,12,14,15,16,17,18,19,20,21,22,23,
24,25,26,27,28,29,30,31]
# Unicode code point substituted for anything that cannot be converted.
UNKNOWN_CHAR = 0x25AF # ▯
# The String#unpack directive used to split raw bytes into code points
# ("n*" or "C*", chosen from the encoding name in #initialize).
attr_reader :unpack
# Build an encoding from whatever describes it.
#
# enc may be:
#
# * nil - no encoding name; the standard.txt mapping is loaded
# * a Hash - an optional :Differences array is installed as the differences
#   table, and the encoding name is read from :Encoding, falling back to
#   :BaseEncoding (the value is used as-is, it is not converted to a symbol)
# * anything else - converted to a symbol with to_sym
#
# Raises ArgumentError if :Differences is present but is not an Array, and
# UnsupportedFeatureError if the encoding name is not one this class knows.
def initialize(enc)
  if enc.kind_of?(Hash)
    diff = enc[:Differences]
    self.differences = diff if diff
    enc = enc[:Encoding] || enc[:BaseEncoding]
  elsif !enc.nil?
    enc = enc.to_sym
  end

  # Everything below is keyed off the (possibly nil) encoding name.
  @to_unicode_required = unicode_required?(enc)
  @unpack = get_unpack(enc)
  @map_file = get_mapping_file(enc)

  # Encodings without a mapping file have nothing to load.
  load_mapping(@map_file) if @map_file
end
# Returns the flag computed in #initialize by unicode_required?, which is
# true for the Identity-H and Identity-V encodings. When it is set,
# original_codepoint_to_unicode yields UNKNOWN_CHAR for every code point the
# tounicode decoder is missing or cannot translate.
def to_unicode_required?
@to_unicode_required
end
# Replace the differences table for this encoding.
#
# diff must be an Array that mixes byte numbers and glyph names:
#
#   [25, :A, 26, :B]
#
# A number sets the byte that the next glyph name is assigned to. Each glyph
# name is stored at the current byte, which then advances by one, so runs of
# consecutive bytes can be written more compactly. This is equivalent to the
# array above:
#
#   [25, :A, :B]
#
# Glyph names that appear before any number are assigned starting at byte 0.
#
# Returns the new table (a Hash of byte => glyph name).
# Raises ArgumentError if diff is not an Array.
def differences=(diff)
  unless diff.kind_of?(Array)
    raise ArgumentError, "diff must be an array"
  end

  table = @differences = {}
  next_byte = 0

  diff.each do |entry|
    case entry
    when Numeric
      # a number repositions the cursor; it is not stored itself
      next_byte = entry.to_i
    else
      table[next_byte] = entry
      next_byte += 1
    end
  end

  table
end
# The differences table (byte => glyph name). An empty Hash is created and
# remembered the first time this is called if no table has been set.
def differences
  @differences = {} unless @differences
  @differences
end
# Convert the specified string to UTF-8.
#
# str       - the raw string, in this encoding
# tounicode - optional decoder passed through to
#             original_codepoint_to_unicode (it must respond to decode)
#
# Each code point goes through these steps:
#
# * unpack the raw bytes into code points
# * replace any that have an entry in the differences table with a glyph name
# * convert the code point from the source encoding to a Unicode code point
# * convert any glyph name to a Unicode code point
# * replace anything that did not end up as an Integer with UNKNOWN_CHAR
#
# The resulting code points are packed into a string, which is marked as
# UTF-8 when the VM supports string encodings.
#
# Returns the converted String.
def to_utf8(str, tounicode = nil)
  codepoints = str.unpack(unpack).map do |raw|
    value = differences[raw] || raw
    value = original_codepoint_to_unicode(value, tounicode)
    value = names_to_unicode[value] || value

    # Integer rather than Fixnum: Fixnum was removed in Ruby 3.2, and on
    # Ruby 2.4+ it was only an alias of Integer. A leftover glyph name
    # (or nil) is not an Integer and becomes the placeholder character.
    value.is_a?(Integer) ? value : PDF::Reader::Encoding::UNKNOWN_CHAR
  end

  ret = codepoints.pack("U*")
  ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)
  ret
end
private
# Translate one code point (or glyph name) from the source encoding to
# Unicode.
#
# cp        - an Integer code point, or a glyph name substituted from the
#             differences table
# tounicode - optional decoder; when given, its decode(cp) result wins
#
# Resolution order:
#
# 1. the value returned by tounicode.decode(cp), if any
# 2. UNKNOWN_CHAR when this encoding requires a decoder and none was given or
#    it returned nil
# 3. the entry for cp in the mapping loaded from the encoding file
# 4. UNKNOWN_CHAR when cp is one of CONTROL_CHARS
# 5. cp itself, unchanged
def original_codepoint_to_unicode(cp, tounicode = nil)
  # Decode once and reuse the answer; the decoder is consulted for every
  # character of every string, so a second lookup on a miss is pure waste.
  decoded = tounicode.decode(cp) if tounicode

  if decoded
    decoded
  elsif to_unicode_required? && decoded.nil?
    PDF::Reader::Encoding::UNKNOWN_CHAR
  elsif (mapped = mapping[cp])
    mapped
  elsif PDF::Reader::Encoding::CONTROL_CHARS.include?(cp)
    PDF::Reader::Encoding::UNKNOWN_CHAR
  else
    cp
  end
end
# Pick the String#unpack directive for an encoding name.
#
# Identity-H, Identity-V and UTF16Encoding are read as 16-bit big-endian
# values ("n*"); every other name, including nil, is read one byte at a time
# ("C*").
def get_unpack(enc)
  two_byte = [:"Identity-H", :"Identity-V", :UTF16Encoding]
  two_byte.include?(enc) ? "n*" : "C*"
end
# Locate the mapping file for an encoding name.
#
# Returns the path of a file in the encodings directory next to this source
# file, or nil for the encodings that have no mapping file (Identity-H,
# Identity-V and UTF16Encoding). A nil name selects standard.txt.
#
# Raises UnsupportedFeatureError for any other name.
def get_mapping_file(enc)
  here = File.dirname(__FILE__)
  return here + "/encodings/standard.txt" if enc.nil?

  case enc
  when :"Identity-H", :"Identity-V", :UTF16Encoding
    nil
  when :MacRomanEncoding
    here + "/encodings/mac_roman.txt"
  when :MacExpertEncoding
    here + "/encodings/mac_expert.txt"
  when :PDFDocEncoding
    here + "/encodings/pdf_doc.txt"
  when :StandardEncoding
    here + "/encodings/standard.txt"
  when :SymbolEncoding
    here + "/encodings/symbol.txt"
  when :WinAnsiEncoding
    here + "/encodings/win_ansi.txt"
  when :ZapfDingbatsEncoding
    here + "/encodings/zapf_dingbats.txt"
  else
    raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
  end
end
# True when enc is Identity-H or Identity-V. For those two encodings a code
# point can only be converted through the tounicode decoder handed to
# to_utf8.
def unicode_required?(enc)
  horizontal = (enc == :"Identity-H")
  horizontal || enc == :"Identity-V"
end
# The table of source code point => Unicode code point filled in by
# load_mapping. An empty Hash is created and remembered on first use.
def mapping
  @mapping = {} unless @mapping
  @mapping
end
# True once the mapping table holds at least one entry.
def has_mapping?
  !mapping.empty?
end
# The lookup that to_utf8 uses to turn whatever is left after code point
# conversion (typically a glyph name) into a Unicode code point. A
# PDF::Reader::GlyphHash is built on first use and reused afterwards.
def names_to_unicode
  @names_to_unicode = PDF::Reader::GlyphHash.new unless @names_to_unicode
  @names_to_unicode
end
# Fill the mapping table from an encoding file.
#
# Every line containing "<code>;<unicode>" contributes one entry, where
# <code> is a run of letters and digits and <unicode> is exactly four
# uppercase hex digits. Both are read as hexadecimal. Lines without that
# pattern are ignored.
#
# Does nothing if the mapping table already has entries.
def load_mapping(file)
  return if has_mapping?

  # Ask for binary mode where the VM understands encodings in the mode string.
  mode = RUBY_VERSION >= "1.9" ? "r:BINARY" : "r"

  File.open(file, mode) do |io|
    io.each do |line|
      found = line.match(/([0-9A-Za-z]+);([0-9A-F]{4})/)
      next unless found

      mapping["0x#{found[1]}".hex] = "0x#{found[2]}".hex
    end
  end
end
end
end