# coding: utf-8
# typed: strict
# frozen_string_literal: true
################################################################################
#
# Copyright (C) 2008 James Healy (jimmy@deefa.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################
class PDF::Reader
  # Util class for working with string encodings in PDF files. Mostly used to
  # convert strings of various PDF-dialect encodings into UTF-8.
  class Encoding # :nodoc:
    # Character codes below 32 that default_mapping replaces with UNKNOWN_CHAR.
    # 9, 10 and 13 are not in the list, so they map to themselves.
    CONTROL_CHARS = [0,1,2,3,4,5,6,7,8,11,12,14,15,16,17,18,19,20,21,22,23,
                     24,25,26,27,28,29,30,31].freeze
    UNKNOWN_CHAR = 0x25AF # ▯

    # the String#unpack directive used to split raw strings into character codes
    attr_reader :unpack

    # Build an encoding from a PDF encoding description.
    #
    # enc may be:
    #
    # * a Hash - the encoding name is read from :Encoding, falling back to
    #   :BaseEncoding, and an optional :Differences array is applied on top of
    #   the base mapping
    # * an object that responds to to_sym - used as the encoding name
    # * anything else (including nil) - treated as :StandardEncoding
    #
    # Names that get_mapping_file doesn't recognise fall back to the standard
    # encoding mapping file.
    def initialize(enc)
      @mapping = default_mapping # maps from character codes to Unicode codepoints
      @string_cache = {} # maps from character codes to UTF-8 strings.

      @enc_name = if enc.kind_of?(Hash)
                    enc[:Encoding] || enc[:BaseEncoding]
                  elsif enc && enc.respond_to?(:to_sym)
                    enc.to_sym
                  else
                    :StandardEncoding
                  end

      @unpack = get_unpack(@enc_name)
      @map_file = get_mapping_file(@enc_name)
      load_mapping(@map_file) if @map_file

      if enc.is_a?(Hash) && enc[:Differences]
        self.differences = enc[:Differences]
      end
    end

    # set the differences table for this encoding. should be an array in the following format:
    #
    #   [25, :A, 26, :B]
    #
    # The array alternates between a decimal byte number and a glyph name to map to that byte
    #
    # To save space the following array is also valid and equivalent to the previous one
    #
    #   [25, :A, :B]
    #
    # Glyph names that can't be resolved to a Unicode codepoint are not recorded,
    # but they still occupy a code so the names that follow them stay aligned.
    #
    # Returns the new differences table, a Hash of character code => glyph name.
    def differences=(diff)
      PDF::Reader::Error.validate_type(diff, "diff", Array)

      @differences = {}
      # the mapping is about to change, so previously cached strings may be stale
      @string_cache.clear

      byte = 0
      diff.each do |val|
        if val.kind_of?(Numeric)
          byte = val.to_i
        else
          codepoint = glyphlist.name_to_unicode(val)
          if codepoint
            @differences[byte] = val
            @mapping[byte] = codepoint
          end
          # each name in a run replaces the next consecutive code, whether or
          # not we were able to resolve it
          byte += 1
        end
      end
      @differences
    end

    # The current differences table, a Hash of character code => glyph name.
    # Empty when no differences have been set.
    def differences
      # this method is only used by the spec tests
      @differences ||= {}
    end

    # convert the specified string to utf8
    #
    # * unpack the raw string into character codes using the unpack directive
    #   for this encoding
    # * translate each code to a Unicode codepoint via the mapping table (which
    #   already includes any differences); codes with no entry are kept as-is
    # * pack the final array of Unicode codepoints into a utf-8 string
    #
    # Identity-H and Identity-V strings can't be converted, so each character
    # code is replaced with UNKNOWN_CHAR instead.
    #
    def to_utf8(str)
      if utf8_conversion_impossible?
        little_boxes(str.unpack(unpack).size)
      else
        convert_to_utf8(str)
      end
    end

    # convert a single integer glyph code into a UTF-8 string. Results are
    # memoised per glyph code.
    def int_to_utf8_string(glyph_code)
      @string_cache[glyph_code] ||= internal_int_to_utf8_string(glyph_code)
    end

    # convert an integer glyph code into an Adobe glyph name.
    #
    #   int_to_name(65)
    #   => [:A]
    #
    # Returns an empty array for the Identity-H and Identity-V encodings and
    # for codes that have no entry in the mapping table.
    #
    def int_to_name(glyph_code)
      # @enc_name is usually a Symbol, and a Symbol never equals a String
      # literal, so compare on the string form of the name
      enc_name = @enc_name.to_s
      return [] if enc_name == "Identity-H" || enc_name == "Identity-V"

      # a name from the differences table wins over the base encoding
      named = differences[glyph_code]
      return [named] if named

      codepoint = @mapping[glyph_code]
      codepoint ? glyphlist.unicode_to_name(codepoint) : []
    end

    private

    # returns a hash that:
    # - maps control chars and nil to the unicode "unknown character"
    # - leaves all other bytes <= 255 unchanged
    #
    # Each specific encoding will change this default as required for their glyphs
    def default_mapping
      mapping = (0..255).each_with_object({}) { |byte, memo|
        memo[byte] = CONTROL_CHARS.include?(byte) ? UNKNOWN_CHAR : byte
      }
      mapping[nil] = UNKNOWN_CHAR
      mapping
    end

    # uncached implementation of int_to_utf8_string. Codes with no entry in
    # the mapping table are used directly as the codepoint.
    def internal_int_to_utf8_string(glyph_code)
      code = glyph_code.to_i
      ret = [@mapping[code] || code].pack("U*")
      ret.force_encoding("UTF-8")
      ret
    end

    # true for the encodings that to_utf8 renders as UNKNOWN_CHAR boxes
    def utf8_conversion_impossible?
      @enc_name == :"Identity-H" || @enc_name == :"Identity-V"
    end

    # returns a UTF-8 string made up of UNKNOWN_CHAR repeated `times` times
    def little_boxes(times)
      codepoints = [ PDF::Reader::Encoding::UNKNOWN_CHAR ] * times
      ret = codepoints.pack("U*")
      ret.force_encoding("UTF-8")
      ret
    end

    # unpack str into character codes, translate each one via the mapping
    # table (keeping codes that have no entry) and pack the result as UTF-8
    def convert_to_utf8(str)
      codepoints = str.unpack(unpack).map! { |c| @mapping[c.to_i] || c }
      ret = codepoints.pack("U*")
      ret.force_encoding("UTF-8")
      ret
    end

    # returns the String#unpack directive for the named encoding: 16-bit
    # big-endian values ("n*") for Identity-H, Identity-V and UTF16Encoding,
    # 8-bit values ("C*") for everything else
    def get_unpack(enc)
      case enc
      when :"Identity-H", :"Identity-V", :UTF16Encoding
        "n*"
      else
        "C*"
      end
    end

    # returns the path of the mapping file for the named encoding, or nil when
    # the encoding has no mapping file. Unrecognised names use the standard
    # encoding file.
    def get_mapping_file(enc)
      filename = case enc
                 when :"Identity-H", :"Identity-V", :UTF16Encoding then nil
                 when :MacRomanEncoding     then "mac_roman.txt"
                 when :MacExpertEncoding    then "mac_expert.txt"
                 when :PDFDocEncoding       then "pdf_doc.txt"
                 when :SymbolEncoding       then "symbol.txt"
                 when :WinAnsiEncoding      then "win_ansi.txt"
                 when :ZapfDingbatsEncoding then "zapf_dingbats.txt"
                 else "standard.txt"
                 end

      filename && File.join(File.dirname(__FILE__), "encodings", filename)
    end

    # lazily built lookup between glyph names and Unicode codepoints
    def glyphlist
      @glyphlist ||= PDF::Reader::GlyphHash.new
    end

    # merge the entries of a mapping file into the mapping table.
    #
    # Only lines that start with a hex character code, a semicolon and a four
    # digit uppercase hex codepoint are used; all other lines are skipped.
    def load_mapping(file)
      File.open(file, "r:BINARY") do |f|
        f.each do |l|
          entry = l.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
          next unless entry

          @mapping["0x#{entry[1]}".hex] = "0x#{entry[2]}".hex
        end
      end
    end
  end
end