class PDF::Reader::Encoding

:nodoc:
Util class for working with string encodings in PDF files. Mostly used to
convert strings of various PDF-dialect encodings into UTF-8.

def convert_to_utf8(str)

# Decode +str+ into character codes using this encoding's unpack directive,
# swap every code that has an entry in @mapping for its Unicode codepoint
# (codes without an entry pass through untouched) and return the result as a
# UTF-8 tagged string.
def convert_to_utf8(str)
  codepoints = str.unpack(unpack).map! { |code| @mapping[code.to_i] || code }
  codepoints.pack("U*").force_encoding("UTF-8")
end

def default_mapping

Each specific encoding will change this default as required for their glyphs

- leaves all other bytes <= 255 unchanged
- maps control chars and nil to the unicode "unknown character"
returns a hash that:

# Build the baseline table for the byte values 0..255: every byte listed in
# CONTROL_CHARS points at UNKNOWN_CHAR, every other byte points at itself.
# Returns a new Hash on each call.
def default_mapping
  (0..255).each_with_object({}) do |byte, table|
    table[byte] = CONTROL_CHARS.include?(byte) ? UNKNOWN_CHAR : byte
  end
end

def differences

# Lazily-initialised reader for the differences table (an empty Hash until one
# has been assigned).
def differences
  # this method is only used by the spec tests
  @differences = {} unless @differences
  @differences
end

def differences=(diff)

[25, :A, :B]

To save space the following array is also valid and equivalent to the previous one

The array alternates between a decimal byte number and a glyph name to map to that byte

[25, :A, 26, :B]

set the differences table for this encoding. should be an array in the following format:

# Set the differences table for this encoding.
#
# +diff+ must be an Array that mixes byte numbers and glyph names. A number
# sets the current byte; each glyph name that follows is assigned to the
# current byte, which then advances by one. So
#
#   [25, :A, 26, :B]
#
# and
#
#   [25, :A, :B]
#
# are equivalent.
#
# For every glyph name the glyph list can resolve, the name is recorded in
# @differences and its codepoint is written into @mapping. A name that cannot
# be resolved leaves both tables untouched for that byte but still consumes
# the byte, so the names after it keep their intended positions.
#
# The argument is checked with PDF::Reader::Error.validate_type before use.
# Returns the rebuilt differences Hash.
def differences=(diff)
  PDF::Reader::Error.validate_type(diff, "diff", Array)

  @differences = {}
  byte = 0
  diff.each do |val|
    if val.kind_of?(Numeric)
      byte = val.to_i
    else
      codepoint = glyphlist.name_to_unicode(val)
      if codepoint
        @differences[byte] = val
        @mapping[byte] = codepoint
      end
      # every name occupies one code, whether or not it could be resolved
      byte += 1
    end
  end

  # strings memoised by int_to_utf8_string were built from the old mapping
  @string_cache.clear if @string_cache
  @differences
end

def get_mapping_file(enc)

# Resolve the mapping file for the encoding named +enc+.
#
# Returns nil for :"Identity-H", :"Identity-V" and :UTF16Encoding. For every
# other value it returns a path below the directory containing this source
# file; names without a dedicated file fall back to standard.txt.
def get_mapping_file(enc)
  return nil if [:"Identity-H", :"Identity-V", :UTF16Encoding].include?(enc)

  relative_paths = {
    :MacRomanEncoding     => "/encodings/mac_roman.txt",
    :MacExpertEncoding    => "/encodings/mac_expert.txt",
    :PDFDocEncoding       => "/encodings/pdf_doc.txt",
    :SymbolEncoding       => "/encodings/symbol.txt",
    :WinAnsiEncoding      => "/encodings/win_ansi.txt",
    :ZapfDingbatsEncoding => "/encodings/zapf_dingbats.txt"
  }
  File.dirname(__FILE__) + relative_paths.fetch(enc, "/encodings/standard.txt")
end

def get_unpack(enc)

# Pick the String#unpack directive for the encoding named +enc+: "n*" for
# :"Identity-H", :"Identity-V" and :UTF16Encoding, "C*" for everything else.
def get_unpack(enc)
  two_byte = [:"Identity-H", :"Identity-V", :UTF16Encoding].include?(enc)
  two_byte ? "n*" : "C*"
end

def glyphlist

# Memoised PDF::Reader::GlyphHash instance; created on first use and reused
# afterwards.
def glyphlist
  return @glyphlist if @glyphlist

  @glyphlist = PDF::Reader::GlyphHash.new
end

def initialize(enc)

# Set up an encoding from +enc+.
#
# +enc+ may be a Hash, in which case the name is taken from its :Encoding
# entry (falling back to :BaseEncoding), or anything responding to to_sym.
# Any other value, including nil, selects :StandardEncoding.
#
# The default mapping is installed first, then the mapping file for the
# resolved name (if there is one) is loaded on top, and finally a
# :Differences entry of a Hash argument is applied.
def initialize(enc)
  @mapping = default_mapping # maps from character codes to Unicode codepoints
  @string_cache = {} # maps from character codes to UTF-8 strings.

  from_dict = enc.kind_of?(Hash)
  @enc_name =
    if from_dict
      enc[:Encoding] || enc[:BaseEncoding]
    elsif enc && enc.respond_to?(:to_sym)
      enc.to_sym
    else
      :StandardEncoding
    end

  @unpack = get_unpack(@enc_name)
  @map_file = get_mapping_file(@enc_name)
  load_mapping(@map_file) if @map_file

  self.differences = enc[:Differences] if from_dict && enc[:Differences]
end

def int_to_name(glyph_code)

=> [:A]
int_to_name(65)

convert an integer glyph code into an Adobe glyph name.

# Convert an integer glyph code into an Array of Adobe glyph names.
#
#   int_to_name(65)
#   => [:A]
#
# Identity-H and Identity-V encodings always yield an empty Array. Otherwise
# an entry in the differences table wins, then the codepoint from @mapping is
# looked up in the glyph list; a code with neither yields an empty Array.
def int_to_name(glyph_code)
  name = @enc_name
  return [] if name == :"Identity-H" || name == :"Identity-V"

  override = differences[glyph_code]
  return [override] if override

  codepoint = @mapping[glyph_code]
  codepoint ? glyphlist.unicode_to_name(codepoint) : []
end

def int_to_utf8_string(glyph_code)

# Return the UTF-8 string for +glyph_code+, computing it through
# internal_int_to_utf8_string on the first request and serving it from
# @string_cache afterwards.
def int_to_utf8_string(glyph_code)
  cached = @string_cache[glyph_code]
  return cached if cached

  @string_cache[glyph_code] = internal_int_to_utf8_string(glyph_code)
end

def internal_int_to_utf8_string(glyph_code)

# Uncached conversion of a single glyph code: coerce it with to_i, translate
# it through @mapping (or keep the number itself when there is no entry) and
# pack that codepoint into a UTF-8 tagged string.
def internal_int_to_utf8_string(glyph_code)
  code = glyph_code.to_i
  codepoint = @mapping[code] || code
  [codepoint].pack("U*").force_encoding("UTF-8")
end

def little_boxes(times)

# Build a UTF-8 string consisting of +times+ copies of the
# PDF::Reader::Encoding::UNKNOWN_CHAR codepoint.
def little_boxes(times)
  boxes = [ PDF::Reader::Encoding::UNKNOWN_CHAR ] * times
  boxes.pack("U*").force_encoding("UTF-8")
end

def load_mapping(file)

# Read the mapping file at +file+ and merge it into @mapping.
#
# Only lines that start with "<code>;<four uppercase hex digits>" are used;
# both fields are read as hexadecimal numbers. Every other line is skipped.
def load_mapping(file)
  File.open(file, "r:BINARY") do |io|
    io.each do |line|
      fields = line.match(/\A([0-9A-Za-z]+);([0-9A-F]{4})/)
      next unless fields

      @mapping["0x#{fields[1]}".hex] = "0x#{fields[2]}".hex
    end
  end
end

def to_utf8(str)

* mark the string as utf-8 if we're running on a M17N aware VM
* pack the final array of Unicode codepoints into a utf-8 string
* replace characters that didn't convert to Unicode nicely with something valid
* convert any glyph names to Unicode codepoints
* convert codepoints from source encoding to Unicode codepoints
* replace any that have entries in the differences table with a glyph name
* unpack raw bytes into codepoints

convert the specified string to utf8

# Convert +str+ to UTF-8.
#
# When this encoding cannot be converted (see utf8_conversion_impossible?)
# the result is one placeholder box per unpacked character code; otherwise
# the string goes through convert_to_utf8.
def to_utf8(str)
  return convert_to_utf8(str) unless utf8_conversion_impossible?

  little_boxes(str.unpack(unpack).size)
end

def utf8_conversion_impossible?

# True when the encoding name is :"Identity-H" or :"Identity-V".
def utf8_conversion_impossible?
  [:"Identity-H", :"Identity-V"].include?(@enc_name)
end

Modules

Classes