class HexaPDF::Font::CMap::Parser
See: Adobe Technical Notes #5014 and #5411
Parses CMap files.
def bytes_to_int(string)
Treats the string as an array of bytes and converts it to an integer.
# Treats the string as an array of bytes and converts it to an integer.
#
# The bytes are interpreted in big-endian order, i.e. the first byte ends up in the most
# significant position. An empty string yields 0.
#
# The conversion works on the raw bytes regardless of the string's encoding, so a string
# containing multi-byte characters contributes all of its bytes.
def bytes_to_int(string)
  result = 0
  # each_byte walks the bytes, not the characters, so the number of iterations always matches
  # the byte size of the string.
  string.each_byte {|byte| result = (result << 8) | byte }
  result
end
def parse(string)
# Parses the given +string+ as a CMap and returns the resulting CMap object.
#
# The string is tokenized and processed token by token:
#
# * Operator tokens ("beginbfchar", "beginbfrange", "begincidchar", "begincidrange" and
#   "begincodespacerange") hand over to the respective parse_* method; all other operators
#   are ignored.
# * A name (symbol) is read together with the token that follows it. If that token is the
#   "usecmap" operator, the named CMap is applied; if it is any other operator, the pair is
#   ignored; otherwise the pair is processed as a dictionary mapping.
#
# Parsing ends at the "endcmap" operator or when the tokenizer has no more tokens.
#
# Every StandardError raised during parsing is re-raised as HexaPDF::Error, keeping the
# original backtrace.
def parse(string)
  tokenizer = HexaPDF::Content::Tokenizer.new(string)
  cmap = CMap.new

  token = tokenizer.next_token
  until token == HexaPDF::Tokenizer::NO_MORE_TOKENS
    case token
    when HexaPDF::Tokenizer::Token
      case token
      when 'beginbfchar' then parse_bf_char(tokenizer, cmap)
      when 'beginbfrange' then parse_bf_range(tokenizer, cmap)
      when 'begincidchar' then parse_cid_char(tokenizer, cmap)
      when 'begincidrange' then parse_cid_range(tokenizer, cmap)
      when 'begincodespacerange' then parse_codespace_range(tokenizer, cmap)
      when 'endcmap' then break
      end
    when Symbol
      # A name is always consumed together with the token following it.
      operand = tokenizer.next_token
      if operand.kind_of?(HexaPDF::Tokenizer::Token)
        parse_cmap(cmap, token) if operand == 'usecmap'
      else
        parse_dict_mapping(cmap, token, operand)
      end
    end
    token = tokenizer.next_token
  end

  cmap
rescue StandardError => error
  raise HexaPDF::Error, "Error parsing CMap: #{error.message}", error.backtrace
end
def parse_bf_char(tokenizer, cmap)
# Parses the "bfchar" operator at the current position.
#
# Reads pairs of "code destination" from the tokenizer until a token of class
# HexaPDF::Tokenizer::Token is read. Each destination string is transcoded in place from
# UTF-16BE to UTF-8 and added to +cmap+ as the Unicode mapping of the integer value of the
# code's bytes.
def parse_bf_char(tokenizer, cmap)
  code = tokenizer.next_token
  until code.kind_of?(HexaPDF::Tokenizer::Token)
    unicode = tokenizer.next_token
    unicode.encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
    cmap.add_unicode_mapping(bytes_to_int(code), unicode)
    code = tokenizer.next_token
  end
end
def parse_bf_range(tokenizer, cmap)
Parses the "bfrange" operator at the current position.
--
PDF2.0 s9.10.3 and Adobe Technical Note #5411 have different views as to how "bfrange"
operators of the form "startCode endCode codePoint" should be handled.
PDF2.0 mentions that the last byte of "codePoint" should be incremented, up to a maximum
of 255. However #5411 has the range "<1379> <137B> <90FE>" as example which contradicts
this.
Additionally, #5411 mentions in section 1.4.1 that the first byte of "startCode" and
"endCode" have to be the same. So it seems that this is a mistake in the PDF reference.
# Parses the "bfrange" operator at the current position.
#
# Reads triples of "startCode endCode destination" from the tokenizer until a token of class
# HexaPDF::Tokenizer::Token is read. The destination is either
#
# * a UTF-16BE encoded string: startCode is mapped to this string and each following code to
#   the same string with its last character incremented by one more, or
# * an array of UTF-16BE encoded strings, one for each code of the range.
#
# A string destination may consist of more than one character (e.g. for a ligature); the
# leading characters are kept for every code of the range and only the last one is
# incremented.
#
# The whole last code point is incremented, not only its last byte, so that ranges like
# "<1379> <137B> <90FE>" work as shown in Adobe Technical Note #5411.
#
# Raises HexaPDF::Error if the destination is an empty string or neither a string nor an
# array.
def parse_bf_range(tokenizer, cmap)
  until (code1 = tokenizer.next_token).kind_of?(HexaPDF::Tokenizer::Token)
    code1 = bytes_to_int(code1)
    code2 = bytes_to_int(tokenizer.next_token)
    dest = tokenizer.next_object

    if dest.kind_of?(String)
      *leading, codepoint = dest.force_encoding(::Encoding::UTF_16BE).codepoints
      raise HexaPDF::Error, "Invalid bfrange operator in CMap" unless codepoint

      # Build the unchanging part once; it is empty for the common single character case.
      prefix = leading.each_with_object(+'') {|leading_codepoint, str| str << leading_codepoint }
      code1.upto(code2) do |code|
        cmap.add_unicode_mapping(code, prefix.dup << codepoint)
        codepoint += 1
      end
    elsif dest.kind_of?(Array)
      code1.upto(code2) do |code|
        str = dest[code - code1].encode!(::Encoding::UTF_8, ::Encoding::UTF_16BE)
        cmap.add_unicode_mapping(code, str)
      end
    else
      raise HexaPDF::Error, "Invalid bfrange operator in CMap"
    end
  end
end
def parse_cid_char(tokenizer, cmap)
# Parses the "cidchar" operator at the current position.
#
# Reads pairs of "code CID" from the tokenizer until a token of class
# HexaPDF::Tokenizer::Token is read. For each pair the integer value of the code's bytes is
# mapped to the CID on +cmap+.
def parse_cid_char(tokenizer, cmap)
  code = tokenizer.next_token
  until code.kind_of?(HexaPDF::Tokenizer::Token)
    source = bytes_to_int(code)
    cid = tokenizer.next_token
    cmap.add_cid_mapping(source, cid)
    code = tokenizer.next_token
  end
end
def parse_cid_range(tokenizer, cmap)
# Parses the "cidrange" operator at the current position.
#
# Reads triples of "startCode endCode startCID" from the tokenizer until a token of class
# HexaPDF::Tokenizer::Token is read. A triple whose start and end code are equal is added
# to +cmap+ as a single CID mapping, every other triple as a CID range.
def parse_cid_range(tokenizer, cmap)
  token = tokenizer.next_token
  until token.kind_of?(HexaPDF::Tokenizer::Token)
    range_start = bytes_to_int(token)
    range_end = bytes_to_int(tokenizer.next_token)
    cid_start = tokenizer.next_object

    if range_start != range_end
      cmap.add_cid_range(range_start, range_end, cid_start)
    else
      cmap.add_cid_mapping(range_start, cid_start)
    end

    token = tokenizer.next_token
  end
end
def parse_cmap(cmap, name)
# Handles the "usecmap" operator: retrieves the CMap registered under +name+ (converted to
# a string) via CMap.for_name and hands it to +cmap+ through #use_cmap.
def parse_cmap(cmap, name)
  referenced_cmap = CMap.for_name(name.to_s)
  cmap.use_cmap(referenced_cmap)
end
def parse_codespace_range(tokenizer, cmap)
# Parses the "codespacerange" operator at the current position.
#
# Reads pairs of "startCode endCode" from the tokenizer until a token of class
# HexaPDF::Tokenizer::Token is read. For every byte position of the start code a range from
# the start code's byte to the end code's byte at the same position is built, and all ranges
# of a pair are passed to +cmap+ in one #add_codespace_range call.
def parse_codespace_range(tokenizer, cmap)
  lower = tokenizer.next_token
  until lower.kind_of?(HexaPDF::Tokenizer::Token)
    upper = tokenizer.next_token
    ranges = lower.each_byte.each_with_index.map do |first_byte, position|
      (first_byte..upper.getbyte(position))
    end
    cmap.add_codespace_range(*ranges)
    lower = tokenizer.next_token
  end
end
def parse_dict_mapping(cmap, name, value)
Parses a single mapping of a dictionary pair. The +name+ and +value+ of the mapping have already been read by the caller.
# Parses a single mapping of a dictionary pair. The +name+ and +value+ of the mapping have
# already been read by the caller.
#
# Only the following names are processed, and only if the value has the expected type;
# everything else is silently ignored:
#
# :Registry, :Ordering:: String values, stored on +cmap+ re-tagged (in place) as UTF-8.
# :Supplement:: Integer value.
# :CMapName:: Symbol value, stored as UTF-8 string.
# :WMode:: Integer value. Like the other entries, a value of any other type (for example
#          what the caller read when the input ended right after the name) is not stored.
def parse_dict_mapping(cmap, name, value)
  case name
  when :Registry
    cmap.registry = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
  when :Ordering
    cmap.ordering = value.force_encoding(::Encoding::UTF_8) if value.kind_of?(String)
  when :Supplement
    cmap.supplement = value if value.kind_of?(Integer)
  when :CMapName
    cmap.name = value.to_s.dup.force_encoding(::Encoding::UTF_8) if value.kind_of?(Symbol)
  when :WMode
    cmap.wmode = value if value.kind_of?(Integer)
  end
end