class PDF::Reader::Encoding
# :nodoc:
# Util class for working with string encodings in PDF files. Mostly used to
# convert strings of various PDF-dialect encodings into UTF-8.
# Decode +str+ into a UTF-8 string.
#
# The raw string is split into integer codes using this encoding's unpack
# template. Each code is swapped for its entry in @mapping when one exists
# (codes without an entry pass through unchanged), and the resulting
# codepoints are packed back into a string tagged as UTF-8.
def convert_to_utf8(str)
  codepoints = str.unpack(unpack)
  codepoints.map! { |code| @mapping[code.to_i] || code }

  utf8 = codepoints.pack("U*")
  utf8.force_encoding("UTF-8")
end
# Build the starting byte => codepoint table for a new encoding.
#
# Returns a hash with one entry for every byte 0..255 that:
#
# - maps each byte listed in CONTROL_CHARS to UNKNOWN_CHAR
# - leaves every other byte mapped to itself
def default_mapping
  (0..255).each_with_object({}) do |byte, table|
    table[byte] = CONTROL_CHARS.include?(byte) ? UNKNOWN_CHAR : byte
  end
end
# The byte => glyph name overrides currently applied to this encoding.
#
# Starts out as an empty hash when no differences have been set. Besides
# the spec tests, int_to_name consults this table.
def differences
  @differences = {} unless @differences
  @differences
end
# Set the differences table for this encoding. +diff+ must be an Array in
# the following format:
#
#   [25, :A, 26, :B]
#
# A number sets the current byte; each glyph name that follows is assigned
# to the current byte, which then advances by one. The array above can
# therefore be shortened to the equivalent:
#
#   [25, :A, :B]
#
# A glyph name the glyph list cannot resolve is not recorded, but it still
# consumes its byte so that the names after it stay aligned with their
# codes.
#
# Returns the new byte => glyph name hash. Type validation of +diff+ is
# delegated to PDF::Reader::Error.validate_type.
def differences=(diff)
  PDF::Reader::Error.validate_type(diff, "diff", Array)

  @differences = {}
  # Strings memoized by int_to_utf8_string were built from the old mapping.
  @string_cache = {}
  byte = 0
  diff.each do |val|
    if val.kind_of?(Numeric)
      byte = val.to_i
    else
      codepoint = glyphlist.name_to_unicode(val)
      if codepoint
        @differences[byte] = val
        @mapping[byte] = codepoint
      end
      # Every name occupies one code, whether or not it could be resolved.
      byte += 1
    end
  end
  @differences
end
# Locate the mapping table file for the encoding named +enc+.
#
# Returns nil for :"Identity-H", :"Identity-V" and :UTF16Encoding, which
# have no table file. Every other name resolves to a path under the
# "encodings" directory next to this source file; names not listed below
# fall back to the standard encoding table.
def get_mapping_file(enc)
  return nil if [:"Identity-H", :"Identity-V", :UTF16Encoding].include?(enc)

  relative_path = {
    :MacRomanEncoding     => "/encodings/mac_roman.txt",
    :MacExpertEncoding    => "/encodings/mac_expert.txt",
    :PDFDocEncoding       => "/encodings/pdf_doc.txt",
    :SymbolEncoding       => "/encodings/symbol.txt",
    :WinAnsiEncoding      => "/encodings/win_ansi.txt",
    :ZapfDingbatsEncoding => "/encodings/zapf_dingbats.txt"
  }.fetch(enc, "/encodings/standard.txt")

  File.dirname(__FILE__) + relative_path
end
# Choose the String#unpack template used to split raw strings into codes.
#
# :"Identity-H", :"Identity-V" and :UTF16Encoding use "n*"; every other
# encoding name uses "C*".
def get_unpack(enc)
  two_byte = [:"Identity-H", :"Identity-V", :UTF16Encoding].include?(enc)
  two_byte ? "n*" : "C*"
end
# The PDF::Reader::GlyphHash used for glyph name <-> codepoint lookups.
# It is created on first use and the same instance is returned afterwards.
def glyphlist
  return @glyphlist if @glyphlist

  @glyphlist = PDF::Reader::GlyphHash.new
end
# Set up the lookup tables for the encoding described by +enc+.
#
# +enc+ may be:
#
# - a Hash, in which case the name is read from :Encoding (falling back to
#   :BaseEncoding) and an optional :Differences array is applied on top
# - any other truthy object that responds to to_sym, used as the name
# - anything else, which selects :StandardEncoding
def initialize(enc)
  described_by_hash = enc.is_a?(Hash)

  @mapping      = default_mapping # maps from character codes to Unicode codepoints
  @string_cache = {}              # maps from character codes to UTF-8 strings.

  @enc_name =
    if described_by_hash
      enc[:Encoding] || enc[:BaseEncoding]
    elsif enc && enc.respond_to?(:to_sym)
      enc.to_sym
    else
      :StandardEncoding
    end

  @unpack   = get_unpack(@enc_name)
  @map_file = get_mapping_file(@enc_name)

  # The base table is loaded first so that differences override it.
  load_mapping(@map_file) if @map_file
  self.differences = enc[:Differences] if described_by_hash && enc[:Differences]
end
# Convert an integer glyph code into Adobe glyph names.
#
#   int_to_name(65)
#   => [:A]
#
# An entry in the differences table wins over the base mapping. An empty
# array is returned for the Identity-H / Identity-V encodings and for
# codes that have no mapping at all.
def int_to_name(glyph_code)
  identity = @enc_name == :"Identity-H" || @enc_name == :"Identity-V"
  return [] if identity

  override = differences[glyph_code]
  return [override] if override

  codepoint = @mapping[glyph_code]
  codepoint ? glyphlist.unicode_to_name(codepoint) : []
end
# Return the UTF-8 string for +glyph_code+, building it on first request
# and serving it from @string_cache afterwards.
def int_to_utf8_string(glyph_code)
  cached = @string_cache[glyph_code]
  return cached if cached

  @string_cache[glyph_code] = internal_int_to_utf8_string(glyph_code)
end
# Build the UTF-8 string for a single glyph code without consulting the
# cache. The code is coerced with to_i and translated through @mapping;
# a code with no mapping entry is used as the codepoint itself.
def internal_int_to_utf8_string(glyph_code)
  code = glyph_code.to_i
  codepoint = @mapping[code] || code
  [codepoint].pack("U*").force_encoding("UTF-8")
end
# Build a UTF-8 string consisting of the unknown-character codepoint
# (PDF::Reader::Encoding::UNKNOWN_CHAR) repeated +times+ times.
def little_boxes(times)
  boxes = [ PDF::Reader::Encoding::UNKNOWN_CHAR ] * times
  boxes.pack("U*").force_encoding("UTF-8")
end
# Merge the byte => codepoint pairs found in the table at path +file+
# into @mapping.
#
# A line counts as an entry when it starts with an alphanumeric token,
# a semicolon and four uppercase hex digits; both fields are read as
# hexadecimal. Lines that do not start that way are ignored.
def load_mapping(file)
  File.open(file, "r:BINARY") do |io|
    io.each do |line|
      entry = line.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
      next unless entry

      byte_hex = entry[1]
      unicode_hex = entry[2]
      @mapping["0x#{byte_hex}".hex] = "0x#{unicode_hex}".hex
    end
  end
end
# Convert the specified string to UTF-8.
#
# * when this encoding cannot be converted (see
#   utf8_conversion_impossible?), return one unknown-character box for
#   each code that +str+ unpacks into
# * otherwise unpack the raw string into codes, translate them to Unicode
#   codepoints and pack the result into a string marked as UTF-8
def to_utf8(str)
  return convert_to_utf8(str) unless utf8_conversion_impossible?

  little_boxes(str.unpack(unpack).size)
end
# True when this encoding is Identity-H or Identity-V, the two encodings
# that to_utf8 renders as unknown-character boxes instead of converting.
def utf8_conversion_impossible?
  [:"Identity-H", :"Identity-V"].any? { |identity| @enc_name == identity }
end