lib/pdf/reader/encoding.rb



################################################################################
#
# Copyright (C) 2008 James Healy (jimmy@deefa.com)
#
# Permission is hereby granted, free of charge, to any person obtaining
# a copy of this software and associated documentation files (the
# "Software"), to deal in the Software without restriction, including
# without limitation the rights to use, copy, modify, merge, publish,
# distribute, sublicense, and/or sell copies of the Software, and to
# permit persons to whom the Software is furnished to do so, subject to
# the following conditions:
#
# The above copyright notice and this permission notice shall be
# included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#
################################################################################

require 'enumerator'

class PDF::Reader
  class Encoding

    UNKNOWN_CHAR = 0x25AF # ▯

    attr_reader :differences

    # set the differences table for this encoding. should be an array in the following format:
    #
    #   [25, "A", 26, "B"]
    #
    # The array alternates bewteen a decimal byte number and a glyph name to map to that byte
    #
    # To save space the following array is also valid and equivilant to the previous one
    #
    #   [25, "A", "B"]
    def differences=(diff)
      raise ArgumentError, "diff must be an array" unless diff.kind_of?(Array)

      @differences = {}
      byte = 0
      diff.each do |val|
        if val.kind_of?(Numeric)
          byte = val.to_i
        else
          @differences[byte] = val
          byte += 1
        end
      end
      @differences
    end

    # Takes the "Encoding" value of a Font dictionary and builds a PDF::Reader::Encoding object
    def self.factory(enc)
      if enc.kind_of?(Hash)
        diff = enc[:Differences]
        enc = enc[:Encoding] || enc[:BaseEncoding]
      elsif enc != nil
        enc = enc.to_sym
      end

      case enc
        when nil                   then enc = PDF::Reader::Encoding::StandardEncoding.new
        when "Identity-H".to_sym   then enc = PDF::Reader::Encoding::IdentityH.new
        when :MacRomanEncoding     then enc = PDF::Reader::Encoding::MacRomanEncoding.new
        when :MacExpertEncoding    then enc = PDF::Reader::Encoding::MacExpertEncoding.new
        when :StandardEncoding     then enc = PDF::Reader::Encoding::StandardEncoding.new
        when :SymbolEncoding       then enc = PDF::Reader::Encoding::SymbolEncoding.new
        when :WinAnsiEncoding      then enc = PDF::Reader::Encoding::WinAnsiEncoding.new
        when :ZapfDingbatsEncoding then enc = PDF::Reader::Encoding::ZapfDingbatsEncoding.new
        else raise UnsupportedFeatureError, "#{enc} is not currently a supported encoding"
      end

      enc.differences = diff if enc && diff

      return enc
    end

    def to_utf8(str, tounicode = nil)
      # abstract method, of sorts
      raise RuntimeError, "Called abstract method"
    end

    # accepts an array of byte numbers, and replaces any that have entries in the differences table
    # with a glyph name
    def process_differences(arr)
      @differences ||= {}
      arr.collect! { |n| @differences[n].nil? ? n : @differences[n]}
    end
    protected :process_differences

    # accepts an array of unicode code points and glyphnames, and converts any glyph names to codepoints
    def process_glyphnames(arr)
      @differences ||= {}
      arr.collect! { |n| n.kind_of?(Numeric) ? n : PDF::Reader::Font.glyphnames[n]}
    end
    protected :process_glyphnames

    class IdentityH < Encoding
      def to_utf8(str, tounicode = nil)

        array_enc = []

        # iterate over string, reading it in 2 byte chunks and interpreting those
        # chunks as ints
        str.unpack("n*").each do |num|

          # convert the int to a unicode codepoint if possible.
          # without a ToUnicode CMap, it's impossible to reliably convert this text
          # to unicode, so just replace each character with a little box. Big smacks
          # the the PDF producing app.
          if tounicode && (code = tounicode.decode(num))
            array_enc << code
          else
            array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
          end
        end

        # replace charcters that didn't convert to unicode nicely with something valid
        array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }

        # pack all our Unicode codepoints into a UTF-8 string
        ret = array_enc.pack("U*")

        # set the strings encoding correctly under ruby 1.9+
        ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

        return ret
      end
    end

    class MacExpertEncoding < Encoding
      # convert a MacExpertEncoding string into UTF-8
      def to_utf8(str, tounicode = nil)
        array_expert = str.unpack('C*')
        array_expert = self.process_differences(array_expert)
        array_enc = []
        array_expert.each do |num|
          if tounicode && (code = tounicode.decode(num))
            array_enc << code
          elsif tounicode
            array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
          else
            case num
              # change necesary characters to equivilant Unicode codepoints
            when 0x21; array_enc << 0xF721
            when 0x22; array_enc << 0xF6F8 # Hungarumlautsmall
            when 0x23; array_enc << 0xF7A2
            when 0x24; array_enc << 0xF724
            when 0x25; array_enc << 0xF6E4
            when 0x26; array_enc << 0xF726
            when 0x27; array_enc << 0xF7B4
            when 0x28; array_enc << 0x207D
            when 0x29; array_enc << 0xF07E
            when 0x2A; array_enc << 0x2025
            when 0x2B; array_enc << 0x2024
            when 0x2F; array_enc << 0x2044
            when 0x30; array_enc << 0xF730
            when 0x31; array_enc << 0xF731
            when 0x32; array_enc << 0xF732
            when 0x33; array_enc << 0xF733
            when 0x34; array_enc << 0xF734
            when 0x35; array_enc << 0xF735
            when 0x36; array_enc << 0xF736
            when 0x37; array_enc << 0xF737
            when 0x38; array_enc << 0xF738
            when 0x39; array_enc << 0xF739
            when 0x3D; array_enc << 0xF6DE
            when 0x3F; array_enc << 0xF73F
            when 0x44; array_enc << 0xF7F0
            when 0x47; array_enc << 0x00BC
            when 0x48; array_enc << 0x00BD
            when 0x49; array_enc << 0x00BE
            when 0x4A; array_enc << 0x215B
            when 0x4B; array_enc << 0x215C
            when 0x4C; array_enc << 0x215D
            when 0x4D; array_enc << 0x215E
            when 0x4E; array_enc << 0x2153
            when 0x4F; array_enc << 0x2154
            when 0x56; array_enc << 0xFB00
            when 0x57; array_enc << 0xFB01
            when 0x58; array_enc << 0xFB02
            when 0x59; array_enc << 0xFB03
            when 0x5A; array_enc << 0xFB04
            when 0x5B; array_enc << 0x208D
            when 0x5D; array_enc << 0x208E
            when 0x5E; array_enc << 0xF6F6
            when 0x5F; array_enc << 0xF6E5
            when 0x60; array_enc << 0xF760
            when 0x61; array_enc << 0xF761
            when 0x62; array_enc << 0xF762
            when 0x63; array_enc << 0xF763
            when 0x64; array_enc << 0xF764
            when 0x65; array_enc << 0xF765
            when 0x66; array_enc << 0xF766
            when 0x67; array_enc << 0xF767
            when 0x68; array_enc << 0xF768
            when 0x69; array_enc << 0xF769
            when 0x6A; array_enc << 0xF76A
            when 0x6B; array_enc << 0xF76B
            when 0x6C; array_enc << 0xF76C
            when 0x6D; array_enc << 0xF76D
            when 0x6E; array_enc << 0xF76E
            when 0x6F; array_enc << 0xF76F
            when 0x70; array_enc << 0xF770
            when 0x71; array_enc << 0xF771
            when 0x72; array_enc << 0xF772
            when 0x73; array_enc << 0xF773
            when 0x74; array_enc << 0xF774
            when 0x75; array_enc << 0xF775
            when 0x76; array_enc << 0xF776
            when 0x77; array_enc << 0xF777
            when 0x78; array_enc << 0xF778
            when 0x79; array_enc << 0xF779
            when 0x7A; array_enc << 0xF77A
            when 0x7B; array_enc << 0x20A1
            when 0x7C; array_enc << 0xF6DC
            when 0x7D; array_enc << 0xF6DD
            when 0x7E; array_enc << 0xF6FE
            when 0x81; array_enc << 0xF6E9
            when 0x82; array_enc << 0xF6E0
            when 0x87; array_enc << 0xF7E1 # Acircumflexsmall
            when 0x88; array_enc << 0xF7E0
            when 0x89; array_enc << 0xF7E2 # Acutesmall
            when 0x8A; array_enc << 0xF7E4
            when 0x8B; array_enc << 0xF7E3
            when 0x8C; array_enc << 0xF7E5
            when 0x8D; array_enc << 0xF7E7
            when 0x8E; array_enc << 0xF7E9
            when 0x8F; array_enc << 0xF7E8
            when 0x90; array_enc << 0xF7E4
            when 0x91; array_enc << 0xF7EB
            when 0x92; array_enc << 0xF7ED
            when 0x93; array_enc << 0xF7EC
            when 0x94; array_enc << 0xF7EE
            when 0x95; array_enc << 0xF7EF
            when 0x96; array_enc << 0xF7F1
            when 0x97; array_enc << 0xF7F3
            when 0x98; array_enc << 0xF7F2
            when 0x99; array_enc << 0xF7F4
            when 0x9A; array_enc << 0xF7F6
            when 0x9B; array_enc << 0xF7F5
            when 0x9C; array_enc << 0xF7FA
            when 0x9D; array_enc << 0xF7F9
            when 0x9E; array_enc << 0xF7FB
            when 0x9F; array_enc << 0xF7FC
            when 0xA1; array_enc << 0x2078
            when 0xA2; array_enc << 0x2084
            when 0xA3; array_enc << 0x2083
            when 0xA4; array_enc << 0x2086
            when 0xA5; array_enc << 0x2088
            when 0xA6; array_enc << 0x2087
            when 0xA7; array_enc << 0xF6FD
            when 0xA9; array_enc << 0xF6DF
            when 0xAA; array_enc << 0x2082
            when 0xAC; array_enc << 0xF7A8
            when 0xAE; array_enc << 0xF6F5
            when 0xAF; array_enc << 0xF6F0
            when 0xB0; array_enc << 0x2085
            when 0xB2; array_enc << 0xF6E1
            when 0xB3; array_enc << 0xF6E7
            when 0xB4; array_enc << 0xF7FD
            when 0xB6; array_enc << 0xF6E3
            when 0xB9; array_enc << 0xF7FE
            when 0xBB; array_enc << 0x2089
            when 0xBC; array_enc << 0x2080
            when 0xBD; array_enc << 0xF6FF
            when 0xBE; array_enc << 0xF7E6 # AEsmall
            when 0xBF; array_enc << 0xF7F8
            when 0xC0; array_enc << 0xF7BF
            when 0xC1; array_enc << 0x2081
            when 0xC2; array_enc << 0xF6F9
            when 0xC9; array_enc << 0xF7B8
            when 0xCF; array_enc << 0xF6FA
            when 0xD0; array_enc << 0x2012
            when 0xD1; array_enc << 0xF6E6
            when 0xD6; array_enc << 0xF7A1
            when 0xD8; array_enc << 0xF7FF
            when 0xDA; array_enc << 0x00B9
            when 0xDB; array_enc << 0x00B2
            when 0xDC; array_enc << 0x00B3
            when 0xDD; array_enc << 0x2074
            when 0xDE; array_enc << 0x2075
            when 0xDF; array_enc << 0x2076
            when 0xE0; array_enc << 0x2077
            when 0xE1; array_enc << 0x2079
            when 0xE2; array_enc << 0x2070
            when 0xE4; array_enc << 0xF6EC
            when 0xE5; array_enc << 0xF6F1
            when 0xE6; array_enc << 0xF6F3
            when 0xE9; array_enc << 0xF6ED
            when 0xEA; array_enc << 0xF6F2
            when 0xEB; array_enc << 0xF6EB
            when 0xF1; array_enc << 0xF6EE
            when 0xF2; array_enc << 0xF6FB
            when 0xF3; array_enc << 0xF6F4
            when 0xF4; array_enc << 0xF7AF
            when 0xF5; array_enc << 0xF6EF
            when 0xF6; array_enc << 0x207F
            when 0xF7; array_enc << 0xF6EF
            when 0xF8; array_enc << 0xF6E2
            when 0xF9; array_enc << 0xF6E8
            when 0xFA; array_enc << 0xF6F7
            when 0xFB; array_enc << 0xF6FC
            else
              array_enc << num
            end
          end
        end

        # convert any glyph names to unicode codepoints
        array_enc = self.process_glyphnames(array_enc)

        # replace charcters that didn't convert to unicode nicely with something valid
        array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }

        # pack all our Unicode codepoints into a UTF-8 string
        ret = array_enc.pack("U*")

        # set the strings encoding correctly under ruby 1.9+
        ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

        return ret
      end
    end

    # The default encoding for OSX <= v9
    # see: http://en.wikipedia.org/wiki/Mac_OS_Roman
    class MacRomanEncoding < Encoding
      # convert a MacRomanEncoding string into UTF-8
      def to_utf8(str, tounicode = nil)
        # content of this method borrowed from REXML::Encoding.decode_cp1252
        array_mac = str.unpack('C*')
        array_mac = self.process_differences(array_mac)
        array_enc = []
        array_mac.each do |num|
          if tounicode && (code = tounicode.decode(num))
            array_enc << code
          elsif tounicode
            array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
          else
            case num
              # change necesary characters to equivilant Unicode codepoints
            when 0x80; array_enc << 0x00C4
            when 0x81; array_enc << 0x00C5
            when 0x82; array_enc << 0x00C7
            when 0x83; array_enc << 0x00C9
            when 0x84; array_enc << 0x00D1
            when 0x85; array_enc << 0x00D6
            when 0x86; array_enc << 0x00DC
            when 0x87; array_enc << 0x00E1
            when 0x88; array_enc << 0x00E0
            when 0x89; array_enc << 0x00E2
            when 0x8A; array_enc << 0x00E4
            when 0x8B; array_enc << 0x00E3
            when 0x8C; array_enc << 0x00E5
            when 0x8D; array_enc << 0x00E7
            when 0x8E; array_enc << 0x00E9
            when 0x8F; array_enc << 0x00E8
            when 0x90; array_enc << 0x00EA
            when 0x91; array_enc << 0x00EB
            when 0x92; array_enc << 0x00ED
            when 0x93; array_enc << 0x00EC
            when 0x94; array_enc << 0x00EE
            when 0x95; array_enc << 0x00EF
            when 0x96; array_enc << 0x00F1
            when 0x97; array_enc << 0x00F3
            when 0x98; array_enc << 0x00F2
            when 0x99; array_enc << 0x00F4
            when 0x9A; array_enc << 0x00F6
            when 0x9B; array_enc << 0x00F5
            when 0x9C; array_enc << 0x00FA
            when 0x9D; array_enc << 0x00F9
            when 0x9E; array_enc << 0x00FB
            when 0x9F; array_enc << 0x00FC
            when 0xA0; array_enc << 0x2020
            when 0xA1; array_enc << 0x00B0
            when 0xA2; array_enc << 0x00A2
            when 0xA3; array_enc << 0x00A3
            when 0xA4; array_enc << 0x00A7
            when 0xA5; array_enc << 0x2022
            when 0xA6; array_enc << 0x00B6
            when 0xA7; array_enc << 0x00DF
            when 0xA8; array_enc << 0x00AE
            when 0xA9; array_enc << 0x00A9
            when 0xAA; array_enc << 0x2122
            when 0xAB; array_enc << 0x00B4
            when 0xAC; array_enc << 0x00A8
            when 0xAD; array_enc << 0x2260
            when 0xAE; array_enc << 0x00C6
            when 0xAF; array_enc << 0x00D8
            when 0xB0; array_enc << 0x221E
            when 0xB1; array_enc << 0x00B1
            when 0xB2; array_enc << 0x2264
            when 0xB3; array_enc << 0x2265
            when 0xB4; array_enc << 0x00A5
            when 0xB5; array_enc << 0x00B5
            when 0xB6; array_enc << 0x2202
            when 0xB7; array_enc << 0x2211
            when 0xB8; array_enc << 0x220F
            when 0xB9; array_enc << 0x03C0
            when 0xBA; array_enc << 0x222B
            when 0xBB; array_enc << 0x00AA
            when 0xBC; array_enc << 0x00BA
            when 0xBD; array_enc << 0x03A9
            when 0xBE; array_enc << 0x00E6
            when 0xBF; array_enc << 0x00F8
            when 0xC0; array_enc << 0x00BF
            when 0xC1; array_enc << 0x00A1
            when 0xC2; array_enc << 0x00AC
            when 0xC3; array_enc << 0x221A
            when 0xC4; array_enc << 0x0192
            when 0xC5; array_enc << 0x2248
            when 0xC6; array_enc << 0x2206
            when 0xC7; array_enc << 0x00AB
            when 0xC8; array_enc << 0x00BB
            when 0xC9; array_enc << 0x2026
            when 0xCA; array_enc << 0x00A0
            when 0xCB; array_enc << 0x00C0
            when 0xCC; array_enc << 0x00C3
            when 0xCD; array_enc << 0x00D5
            when 0xCE; array_enc << 0x0152
            when 0xCF; array_enc << 0x0153
            when 0xD0; array_enc << 0x2013
            when 0xD1; array_enc << 0x2014
            when 0xD2; array_enc << 0x201C
            when 0xD3; array_enc << 0x201D
            when 0xD4; array_enc << 0x2018
            when 0xD5; array_enc << 0x2019
            when 0xD6; array_enc << 0x00F7
            when 0xD7; array_enc << 0x25CA
            when 0xD8; array_enc << 0x00FF
            when 0xD9; array_enc << 0x0178
            when 0xDA; array_enc << 0x2044
            when 0xDB; array_enc << 0x20AC
            when 0xDC; array_enc << 0x2039
            when 0xDD; array_enc << 0x203A
            when 0xDE; array_enc << 0xFB01
            when 0xDF; array_enc << 0xFB02
            when 0xE0; array_enc << 0x2021
            when 0xE1; array_enc << 0x00B7
            when 0xE2; array_enc << 0x201A
            when 0xE3; array_enc << 0x201E
            when 0xE4; array_enc << 0x2030
            when 0xE5; array_enc << 0x00C2
            when 0xE6; array_enc << 0x00CA
            when 0xE7; array_enc << 0x00C1
            when 0xE8; array_enc << 0x00CB
            when 0xE9; array_enc << 0x00C8
            when 0xEA; array_enc << 0x00CD
            when 0xEB; array_enc << 0x00CE
            when 0xEC; array_enc << 0x00CF
            when 0xED; array_enc << 0x00CC
            when 0xEE; array_enc << 0x00D3
            when 0xEF; array_enc << 0x00D4
            when 0xF0; array_enc << 0xF8FF
            when 0xF1; array_enc << 0x00D2
            when 0xF2; array_enc << 0x00DA
            when 0xF3; array_enc << 0x00D8
            when 0xF4; array_enc << 0x00D9
            when 0xF5; array_enc << 0x0131
            when 0xF6; array_enc << 0x02C6
            when 0xF7; array_enc << 0x02DC
            when 0xF8; array_enc << 0x00AF
            when 0xF9; array_enc << 0x02D8
            when 0xFA; array_enc << 0x02D9
            when 0xFB; array_enc << 0x02DA
            when 0xFC; array_enc << 0x00B8
            when 0xFD; array_enc << 0x02DD
            when 0xFE; array_enc << 0x02DB
            when 0xFF; array_enc << 0x02C7
            else
              array_enc << num
            end
          end
        end

        # convert any glyph names to unicode codepoints
        array_enc = self.process_glyphnames(array_enc)

        # replace charcters that didn't convert to unicode nicely with something valid
        array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }

        # pack all our Unicode codepoints into a UTF-8 string
        ret = array_enc.pack("U*")

        # set the strings encoding correctly under ruby 1.9+
        ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

        return ret
      end
    end

    class StandardEncoding < Encoding
      # convert an Adobe Standard Encoding string into UTF-8
      def to_utf8(str, tounicode = nil)
        # based on mapping described at:
        #   http://unicode.org/Public/MAPPINGS/VENDORS/ADOBE/stdenc.txt
        array_std = str.unpack('C*')
        array_std = self.process_differences(array_std)
        array_enc = []
        array_std.each do |num|
          if tounicode && (code = tounicode.decode(num))
            array_enc << code
          elsif tounicode
            array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
          else
            case num
            when 0x27; array_enc << 0x2019
            when 0x60; array_enc << 0x2018
            when 0xA4; array_enc << 0x2044
            when 0xA6; array_enc << 0x0192
            when 0xA8; array_enc << 0x00A4
            when 0xA9; array_enc << 0x0027
            when 0xAA; array_enc << 0x201C
            when 0xAC; array_enc << 0x2039
            when 0xAD; array_enc << 0x203A
            when 0xAE; array_enc << 0xFB01
            when 0xAF; array_enc << 0xFB02
            when 0xB1; array_enc << 0x2013
            when 0xB2; array_enc << 0x2020
            when 0xB3; array_enc << 0x2021
            when 0xB4; array_enc << 0x00B7
            when 0xB7; array_enc << 0x2022
            when 0xB8; array_enc << 0x201A
            when 0xB9; array_enc << 0x201E
            when 0xBA; array_enc << 0x201D
            when 0xBC; array_enc << 0x2026
            when 0xBD; array_enc << 0x2030
            when 0xC1; array_enc << 0x0060
            when 0xC2; array_enc << 0x00B4
            when 0xC3; array_enc << 0x02C6
            when 0xC4; array_enc << 0x02DC
            when 0xC5; array_enc << 0x00AF
            when 0xC6; array_enc << 0x02D8
            when 0xC7; array_enc << 0x02D9
            when 0xC8; array_enc << 0x00A8
            when 0xCA; array_enc << 0x02DA
            when 0xCB; array_enc << 0x00B8
            when 0xCD; array_enc << 0x02DD
            when 0xCE; array_enc << 0x02DB
            when 0xCF; array_enc << 0x02C7
            when 0xD0; array_enc << 0x2014
            when 0xE1; array_enc << 0x00C6
            when 0xE3; array_enc << 0x00AA
            when 0xE8; array_enc << 0x0141
            when 0xE9; array_enc << 0x00D8
            when 0xEA; array_enc << 0x0152
            when 0xEB; array_enc << 0x00BA
            when 0xF1; array_enc << 0x00E6
            when 0xF5; array_enc << 0x0131
            when 0xF8; array_enc << 0x0142
            when 0xF9; array_enc << 0x00F8
            when 0xFA; array_enc << 0x0153
            when 0xFB; array_enc << 0x00DF
            else
              array_enc << num
            end
          end
        end

        # convert any glyph names to unicode codepoints
        array_enc = self.process_glyphnames(array_enc)

        # replace charcters that didn't convert to unicode nicely with something valid
        array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }

        # pack all our Unicode codepoints into a UTF-8 string
        ret = array_enc.pack("U*")

        # set the strings encoding correctly under ruby 1.9+
        ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

        return ret
      end
    end

    class SymbolEncoding < Encoding
      # convert a SymbolEncoding string into UTF-8
      def to_utf8(str, tounicode = nil)
        array_symbol = str.unpack('C*')
        array_symbol = self.process_differences(array_symbol)
        array_enc = []
        array_symbol.each do |num|
          if tounicode && (code = tounicode.decode(num))
            array_enc << code
          elsif tounicode
            array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
          else
            case num
            when 0x22; array_enc << 0x2200
            when 0x24; array_enc << 0x2203
            when 0x27; array_enc << 0x220B
            when 0x2A; array_enc << 0x2217
            when 0x2D; array_enc << 0x2212
            when 0x40; array_enc << 0x2245
            when 0x41; array_enc << 0x0391
            when 0x42; array_enc << 0x0392
            when 0x43; array_enc << 0x03A7
            when 0x44; array_enc << 0x0394
            when 0x45; array_enc << 0x0395
            when 0x46; array_enc << 0x03A6
            when 0x47; array_enc << 0x0393
            when 0x48; array_enc << 0x0397
            when 0x49; array_enc << 0x0399
            when 0x4A; array_enc << 0x03D1
            when 0x4B; array_enc << 0x039A
            when 0x4C; array_enc << 0x039B
            when 0x4D; array_enc << 0x039C
            when 0x4E; array_enc << 0x039D
            when 0x4F; array_enc << 0x039F
            when 0x50; array_enc << 0x03A0
            when 0x51; array_enc << 0x0398
            when 0x52; array_enc << 0x03A1
            when 0x53; array_enc << 0x03A3
            when 0x54; array_enc << 0x03A4
            when 0x55; array_enc << 0x03A5
            when 0x56; array_enc << 0x03C2
            when 0x57; array_enc << 0x03A9
            when 0x58; array_enc << 0x039E
            when 0x59; array_enc << 0x03A8
            when 0x5A; array_enc << 0x0396
            when 0x5C; array_enc << 0x2234
            when 0x5E; array_enc << 0x22A5
            when 0x60; array_enc << 0xF8E5
            when 0x61; array_enc << 0x03B1
            when 0x62; array_enc << 0x03B2
            when 0x63; array_enc << 0x03C7
            when 0x64; array_enc << 0x03B4
            when 0x65; array_enc << 0x03B5
            when 0x66; array_enc << 0x03C6
            when 0x67; array_enc << 0x03B3
            when 0x68; array_enc << 0x03B7
            when 0x69; array_enc << 0x03B9
            when 0x6A; array_enc << 0x03D5
            when 0x6B; array_enc << 0x03BA
            when 0x6C; array_enc << 0x03BB
            when 0x6D; array_enc << 0x03BC
            when 0x6E; array_enc << 0x03BD
            when 0x6F; array_enc << 0x03BF
            when 0x70; array_enc << 0x03C0
            when 0x71; array_enc << 0x03B8
            when 0x72; array_enc << 0x03C1
            when 0x73; array_enc << 0x03C3
            when 0x74; array_enc << 0x03C4
            when 0x75; array_enc << 0x03C5
            when 0x76; array_enc << 0x03D6
            when 0x77; array_enc << 0x03C9
            when 0x78; array_enc << 0x03BE
            when 0x79; array_enc << 0x03C8
            when 0x7A; array_enc << 0x03B6
            when 0x7E; array_enc << 0x223C
            when 0xA0; array_enc << 0x20AC
            when 0xA1; array_enc << 0x03D2
            when 0xA2; array_enc << 0x2032
            when 0xA3; array_enc << 0x2264
            when 0xA4; array_enc << 0x2215
            when 0xA5; array_enc << 0x221E
            when 0xA6; array_enc << 0x0192
            when 0xA7; array_enc << 0x2663
            when 0xA8; array_enc << 0x2666
            when 0xA9; array_enc << 0x2665
            when 0xAA; array_enc << 0x2660
            when 0xAB; array_enc << 0x2194
            when 0xAC; array_enc << 0x2190
            when 0xAD; array_enc << 0x2191
            when 0xAE; array_enc << 0x2192
            when 0xAF; array_enc << 0x2193
            when 0xB2; array_enc << 0x2033
            when 0xB3; array_enc << 0x2265
            when 0xB4; array_enc << 0x00D7
            when 0xB5; array_enc << 0x221D
            when 0xB6; array_enc << 0x2202
            when 0xB7; array_enc << 0x2022
            when 0xB8; array_enc << 0x00F7
            when 0xB9; array_enc << 0x2260
            when 0xBA; array_enc << 0x2261
            when 0xBB; array_enc << 0x2248
            when 0xBC; array_enc << 0x2026
            when 0xBD; array_enc << 0xF8E6
            when 0xBE; array_enc << 0xF8E7
            when 0xBF; array_enc << 0x21B5
            when 0xC0; array_enc << 0x2135
            when 0xC1; array_enc << 0x2111
            when 0xC2; array_enc << 0x211C
            when 0xC3; array_enc << 0x2118
            when 0xC4; array_enc << 0x2297
            when 0xC5; array_enc << 0x2295
            when 0xC6; array_enc << 0x2205
            when 0xC7; array_enc << 0x2229
            when 0xC8; array_enc << 0x222A
            when 0xC9; array_enc << 0x2283
            when 0xCA; array_enc << 0x2287
            when 0xCB; array_enc << 0x2284
            when 0xCC; array_enc << 0x2282
            when 0xCD; array_enc << 0x2286
            when 0xCE; array_enc << 0x2208
            when 0xCF; array_enc << 0x2209
            when 0xD0; array_enc << 0x2220
            when 0xD1; array_enc << 0x2207
            when 0xD2; array_enc << 0xF6DA
            when 0xD3; array_enc << 0xF6D9
            when 0xD4; array_enc << 0xF6DB
            when 0xD5; array_enc << 0x220F
            when 0xD6; array_enc << 0x221A
            when 0xD7; array_enc << 0x22C5
            when 0xD8; array_enc << 0x00AC
            when 0xD9; array_enc << 0x2227
            when 0xDA; array_enc << 0x2228
            when 0xDB; array_enc << 0x21D4
            when 0xDC; array_enc << 0x21D0
            when 0xDD; array_enc << 0x21D1
            when 0xDE; array_enc << 0x21D2
            when 0xDF; array_enc << 0x21D3
            when 0xE0; array_enc << 0x25CA
            when 0xE1; array_enc << 0x2329
            when 0xE2; array_enc << 0xF8E8
            when 0xE3; array_enc << 0xF8E9
            when 0xE4; array_enc << 0xF8EA
            when 0xE5; array_enc << 0x2211
            when 0xE6; array_enc << 0xF8EB
            when 0xE7; array_enc << 0xF8EC
            when 0xE8; array_enc << 0xF8ED
            when 0xE9; array_enc << 0xF8EE
            when 0xEA; array_enc << 0xF8EF
            when 0xEB; array_enc << 0xF8F0
            when 0xEC; array_enc << 0xF8F1
            when 0xED; array_enc << 0xF8F2
            when 0xEE; array_enc << 0xF8F3
            when 0xEF; array_enc << 0xF8F4
            when 0xF1; array_enc << 0x232A
            when 0xF2; array_enc << 0x222B
            when 0xF3; array_enc << 0x2320
            when 0xF4; array_enc << 0xF8F5
            when 0xF5; array_enc << 0x2321
            when 0xF6; array_enc << 0xF8F6
            when 0xF7; array_enc << 0xF8F7
            when 0xF8; array_enc << 0xF8F8
            when 0xF9; array_enc << 0xF8F9
            when 0xFA; array_enc << 0xF8FA
            when 0xFB; array_enc << 0xF8FB
            when 0xFC; array_enc << 0xF8FC
            when 0xFD; array_enc << 0xF8FD
            when 0xFE; array_enc << 0xF8FE
            else
              array_enc << num
            end
          end
        end

        # replace charcters that didn't convert to unicode nicely with something valid
        array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }

        # convert any glyph names to unicode codepoints
        array_enc = self.process_glyphnames(array_enc)

        # pack all our Unicode codepoints into a UTF-8 string
        ret = array_enc.pack("U*")

        # set the strings encoding correctly under ruby 1.9+
        ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

        return ret
      end
    end

    class WinAnsiEncoding < Encoding
      # convert a WinAnsiEncoding string into UTF-8
      def to_utf8(str, tounicode = nil)
        # content of this method borrowed from REXML::Encoding.decode_cp1252
        # for further reading:
        # http://www.intertwingly.net/stories/2004/04/14/i18n.html
        array_latin9 = str.unpack('C*')
        array_latin9 = self.process_differences(array_latin9)
        array_enc = []
        array_latin9.each do |num|
          if tounicode && (code = tounicode.decode(num))
            array_enc << code
          elsif tounicode
            array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
          else
            case num
              # characters that added compared to iso-8859-1
            when 0x80; array_enc << 0x20AC # 0xe2 0x82 0xac
            when 0x82; array_enc << 0x201A # 0xe2 0x82 0x9a
            when 0x83; array_enc << 0x0192 # 0xc6 0x92
            when 0x84; array_enc << 0x201E # 0xe2 0x82 0x9e
            when 0x85; array_enc << 0x2026 # 0xe2 0x80 0xa6
            when 0x86; array_enc << 0x2020 # 0xe2 0x80 0xa0
            when 0x87; array_enc << 0x2021 # 0xe2 0x80 0xa1
            when 0x88; array_enc << 0x02C6 # 0xcb 0x86
            when 0x89; array_enc << 0x2030 # 0xe2 0x80 0xb0
            when 0x8A; array_enc << 0x0160 # 0xc5 0xa0
            when 0x8B; array_enc << 0x2039 # 0xe2 0x80 0xb9
            when 0x8C; array_enc << 0x0152 # 0xc5 0x92
            when 0x8E; array_enc << 0x017D # 0xc5 0xbd
            when 0x91; array_enc << 0x2018 # 0xe2 0x80 0x98
            when 0x92; array_enc << 0x2019 # 0xe2 0x80 0x99
            when 0x93; array_enc << 0x201C
            when 0x94; array_enc << 0x201D
            when 0x95; array_enc << 0x2022
            when 0x96; array_enc << 0x2013
            when 0x97; array_enc << 0x2014
            when 0x98; array_enc << 0x02DC
            when 0x99; array_enc << 0x2122
            when 0x9A; array_enc << 0x0161
            when 0x9B; array_enc << 0x203A
            when 0x9C; array_enc << 0x0152 # 0xc5 0x93
            when 0x9E; array_enc << 0x017E # 0xc5 0xbe
            when 0x9F; array_enc << 0x0178
            else
              array_enc << num
            end
          end
        end

        # convert any glyph names to unicode codepoints
        array_enc = self.process_glyphnames(array_enc)

        # replace charcters that didn't convert to unicode nicely with something valid
        array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }

        # pack all our Unicode codepoints into a UTF-8 string
        ret = array_enc.pack("U*")

        # set the strings encoding correctly under ruby 1.9+
        ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

        return ret
      end
    end

    class ZapfDingbatsEncoding < Encoding
      # convert a ZapfDingbatsEncoding string into UTF-8
      def to_utf8(str, tounicode = nil)
        # mapping to unicode taken from:
        #   http://unicode.org/Public/MAPPINGS/VENDORS/ADOBE/zdingbat.txt
        array_symbol = str.unpack('C*')
        array_symbol = self.process_differences(array_symbol)
        array_enc = []
        array_symbol.each do |num|
          if tounicode && (code = tounicode.decode(num))
            array_enc << code
          elsif tounicode
            array_enc << PDF::Reader::Encoding::UNKNOWN_CHAR
          else
            case num
            when 0x21; array_enc << 0x2701
            when 0x22; array_enc << 0x2702
            when 0x23; array_enc << 0x2703
            when 0x24; array_enc << 0x2704
            when 0x25; array_enc << 0x260E
            when 0x26; array_enc << 0x2706
            when 0x27; array_enc << 0x2707
            when 0x28; array_enc << 0x2708
            when 0x29; array_enc << 0x2709
            when 0x2A; array_enc << 0x261B
            when 0x2B; array_enc << 0x261E
            when 0x2C; array_enc << 0x270C
            when 0x2D; array_enc << 0x270D
            when 0x2E; array_enc << 0x270E
            when 0x2F; array_enc << 0x270F
            when 0x30; array_enc << 0x2710
            when 0x31; array_enc << 0x2711
            when 0x32; array_enc << 0x2712
            when 0x33; array_enc << 0x2713
            when 0x34; array_enc << 0x2714
            when 0x35; array_enc << 0x2715
            when 0x36; array_enc << 0x2716
            when 0x37; array_enc << 0x2717
            when 0x38; array_enc << 0x2718
            when 0x39; array_enc << 0x2719
            when 0x3A; array_enc << 0x271A
            when 0x3B; array_enc << 0x271B
            when 0x3C; array_enc << 0x271C
            when 0x3D; array_enc << 0x271D
            when 0x3E; array_enc << 0x271E
            when 0x3F; array_enc << 0x271E
            when 0x40; array_enc << 0x2720
            when 0x41; array_enc << 0x2721
            when 0x42; array_enc << 0x2722
            when 0x43; array_enc << 0x2723
            when 0x44; array_enc << 0x2724
            when 0x45; array_enc << 0x2725
            when 0x46; array_enc << 0x2726
            when 0x47; array_enc << 0x2727
            when 0x48; array_enc << 0x2605
            when 0x49; array_enc << 0x2729
            when 0x4A; array_enc << 0x272A
            when 0x4B; array_enc << 0x272B
            when 0x4C; array_enc << 0x272C
            when 0x4D; array_enc << 0x272D
            when 0x4E; array_enc << 0x272E
            when 0x4F; array_enc << 0x272F
            when 0x50; array_enc << 0x2730
            when 0x51; array_enc << 0x2731
            when 0x52; array_enc << 0x2732
            when 0x53; array_enc << 0x2733
            when 0x54; array_enc << 0x2734
            when 0x55; array_enc << 0x2735
            when 0x56; array_enc << 0x2736
            when 0x57; array_enc << 0x2737
            when 0x58; array_enc << 0x2738
            when 0x59; array_enc << 0x2739
            when 0x5A; array_enc << 0x273A
            when 0x5B; array_enc << 0x273B
            when 0x5C; array_enc << 0x273C
            when 0x5D; array_enc << 0x273D
            when 0x5E; array_enc << 0x273E
            when 0x5F; array_enc << 0x273F
            when 0x60; array_enc << 0x2740
            when 0x61; array_enc << 0x2741
            when 0x62; array_enc << 0x2742
            when 0x63; array_enc << 0x2743
            when 0x64; array_enc << 0x2744
            when 0x65; array_enc << 0x2745
            when 0x66; array_enc << 0x2746
            when 0x67; array_enc << 0x2747
            when 0x68; array_enc << 0x2748
            when 0x69; array_enc << 0x2749
            when 0x6A; array_enc << 0x274A
            when 0x6B; array_enc << 0x274B
            when 0x6C; array_enc << 0x25CF
            when 0x6D; array_enc << 0x274D
            when 0x6E; array_enc << 0x25A0
            when 0x6F; array_enc << 0x274F
            when 0x70; array_enc << 0x2750
            when 0x71; array_enc << 0x2751
            when 0x72; array_enc << 0x2752
            when 0x73; array_enc << 0x2753
            when 0x74; array_enc << 0x2754
            when 0x75; array_enc << 0x2755
            when 0x76; array_enc << 0x2756
            when 0x77; array_enc << 0x2757
            when 0x78; array_enc << 0x2758
            when 0x79; array_enc << 0x2759
            when 0x7A; array_enc << 0x275A
            when 0x7B; array_enc << 0x275B
            when 0x7C; array_enc << 0x275C
            when 0x7D; array_enc << 0x275D
            when 0x7E; array_enc << 0x275E
            when 0x80; array_enc << 0xF8D7
            when 0x81; array_enc << 0xF8D8
            when 0x82; array_enc << 0xF8D9
            when 0x83; array_enc << 0xF8DA
            when 0x84; array_enc << 0xF8DB
            when 0x85; array_enc << 0xF8DC
            when 0x86; array_enc << 0xF8DD
            when 0x87; array_enc << 0xF8DE
            when 0x88; array_enc << 0xF8DF
            when 0x89; array_enc << 0xF8E0
            when 0x8A; array_enc << 0xF8E1
            when 0x8B; array_enc << 0xF8E2
            when 0x8C; array_enc << 0xF8E3
            when 0x8D; array_enc << 0xF8E4
            when 0xA1; array_enc << 0x2761
            when 0xA2; array_enc << 0x2762
            when 0xA3; array_enc << 0x2763
            when 0xA4; array_enc << 0x2764
            when 0xA5; array_enc << 0x2765
            when 0xA6; array_enc << 0x2766
            when 0xA7; array_enc << 0x2767
            when 0xA8; array_enc << 0x2663
            when 0xA9; array_enc << 0x2666
            when 0xAA; array_enc << 0x2665
            when 0xAB; array_enc << 0x2660
            when 0xAC; array_enc << 0x2460
            when 0xAD; array_enc << 0x2461
            when 0xAE; array_enc << 0x2462
            when 0xAF; array_enc << 0x2463
            when 0xB0; array_enc << 0x2464
            when 0xB1; array_enc << 0x2465
            when 0xB2; array_enc << 0x2466
            when 0xB3; array_enc << 0x2467
            when 0xB4; array_enc << 0x2468
            when 0xB5; array_enc << 0x2469
            when 0xB6; array_enc << 0x2776
            when 0xB7; array_enc << 0x2777
            when 0xB8; array_enc << 0x2778
            when 0xB9; array_enc << 0x2779
            when 0xBA; array_enc << 0x277A
            when 0xBB; array_enc << 0x277B
            when 0xBC; array_enc << 0x277C
            when 0xBD; array_enc << 0x277D
            when 0xBE; array_enc << 0x277E
            when 0xBF; array_enc << 0x277F
            when 0xC0; array_enc << 0x2780
            when 0xC1; array_enc << 0x2781
            when 0xC2; array_enc << 0x2782
            when 0xC3; array_enc << 0x2783
            when 0xC4; array_enc << 0x2784
            when 0xC5; array_enc << 0x2785
            when 0xC6; array_enc << 0x2786
            when 0xC7; array_enc << 0x2787
            when 0xC8; array_enc << 0x2788
            when 0xC9; array_enc << 0x2789
            when 0xCA; array_enc << 0x278A
            when 0xCB; array_enc << 0x278B
            when 0xCC; array_enc << 0x278C
            when 0xCD; array_enc << 0x278D
            when 0xCE; array_enc << 0x278E
            when 0xCF; array_enc << 0x278F
            when 0xD0; array_enc << 0x2790
            when 0xD1; array_enc << 0x2791
            when 0xD2; array_enc << 0x2792
            when 0xD3; array_enc << 0x2793
            when 0xD4; array_enc << 0x2794
            when 0xD5; array_enc << 0x2795
            when 0xD6; array_enc << 0x2796
            when 0xD7; array_enc << 0x2797
            when 0xD8; array_enc << 0x2798
            when 0xD9; array_enc << 0x2799
            when 0xDA; array_enc << 0x279A
            when 0xDB; array_enc << 0x279B
            when 0xDC; array_enc << 0x279C
            when 0xDD; array_enc << 0x279D
            when 0xDE; array_enc << 0x279E
            when 0xDF; array_enc << 0x279F
            when 0xE0; array_enc << 0x27A0
            when 0xE1; array_enc << 0x27A1
            when 0xE2; array_enc << 0x27A2
            when 0xE3; array_enc << 0x27A3
            when 0xE4; array_enc << 0x27A4
            when 0xE5; array_enc << 0x27A5
            when 0xE6; array_enc << 0x27A6
            when 0xE7; array_enc << 0x27A7
            when 0xE8; array_enc << 0x27A8
            when 0xE9; array_enc << 0x27A9
            when 0xEA; array_enc << 0x27AA
            when 0xEB; array_enc << 0x27AB
            when 0xEC; array_enc << 0x27AC
            when 0xED; array_enc << 0x27AD
            when 0xEE; array_enc << 0x27AE
            when 0xEF; array_enc << 0x27AF
            when 0xF1; array_enc << 0x27B1
            when 0xF2; array_enc << 0x27B2
            when 0xF3; array_enc << 0x27B3
            when 0xF4; array_enc << 0x27B4
            when 0xF5; array_enc << 0x27B5
            when 0xF6; array_enc << 0x27B6
            when 0xF7; array_enc << 0x27B7
            when 0xF8; array_enc << 0x27B8
            when 0xF9; array_enc << 0x27B9
            when 0xFA; array_enc << 0x27BA
            when 0xFB; array_enc << 0x27BB
            when 0xFC; array_enc << 0x27BC
            when 0xFD; array_enc << 0x27BD
            when 0xFE; array_enc << 0x27BE
            else
              array_enc << num
            end
          end
        end

        # convert any glyph names to unicode codepoints
        array_enc = self.process_glyphnames(array_enc)

        # replace charcters that didn't convert to unicode nicely with something valid
        array_enc.collect! { |c| c ? c : PDF::Reader::Encoding::UNKNOWN_CHAR }

        # pack all our Unicode codepoints into a UTF-8 string
        ret = array_enc.pack("U*")

        # set the strings encoding correctly under ruby 1.9+
        ret.force_encoding("UTF-8") if ret.respond_to?(:force_encoding)

        return ret
      end
    end
  end
end