module ActiveSupport::Multibyte::Unicode
  # Returns a regular expression pattern that matches the passed Unicode
  # codepoints.
  def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
    array_of_codepoints.collect { |e| [e].pack 'U*'.freeze }.join('|'.freeze)
  end

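  # Usage sketch: the returned pattern is meant to be interpolated into a
  # regular expression (codepoints shown are U+00E9 and U+00E8, picked for
  # illustration):
  #
  #   pattern = Unicode.codepoints_to_pattern([0x00E9, 0x00E8]) # => "é|è"
  #   "café" =~ /#{pattern}/                                    # => 3
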
  # Maps each codepoint of the string through the given per-codepoint case
  # mapping from the Unicode database (e.g. :uppercase_mapping).
  def apply_mapping(string, mapping) #:nodoc:
    database.codepoints # touch the database so the codepoint table is loaded
    string.each_codepoint.map do |codepoint|
      cp = database.codepoints[codepoint]
      if cp and (ncp = cp.send(mapping)) and ncp > 0
        ncp
      else
        codepoint
      end
    end.pack('U*')
  end

  # Compose decomposed characters to the composed form.
  def compose(codepoints)
    pos = 0
    eoa = codepoints.length - 1
    starter_pos = 0
    starter_char = codepoints[0]
    previous_combining_class = -1
    while pos < eoa
      pos += 1
      lindex = starter_char - HANGUL_LBASE
      # -- Hangul
      if 0 <= lindex and lindex < HANGUL_LCOUNT
        vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
        if 0 <= vindex and vindex < HANGUL_VCOUNT
          tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
          if 0 <= tindex and tindex < HANGUL_TCOUNT
            j = starter_pos + 2
            eoa -= 2
          else
            tindex = 0
            j = starter_pos + 1
            eoa -= 1
          end
          codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
        end
        starter_pos += 1
        starter_char = codepoints[starter_pos]
      # -- Other characters
      else
        current_char = codepoints[pos]
        current = database.codepoints[current_char]
        if current.combining_class > previous_combining_class
          if ref = database.composition_map[starter_char]
            composition = ref[current_char]
          else
            composition = nil
          end
          unless composition.nil?
            codepoints[starter_pos] = composition
            starter_char = composition
            codepoints.delete_at pos
            eoa -= 1
            pos -= 1
            previous_combining_class = -1
          else
            previous_combining_class = current.combining_class
          end
        else
          previous_combining_class = current.combining_class
        end
        if current.combining_class == 0
          starter_pos = pos
          starter_char = codepoints[pos]
        end
      end
    end
    codepoints
  end

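  # Illustrative round trip on raw codepoint arrays (233 is U+00E9, "é";
  # shown as bare calls because these are internal helpers):
  #
  #   decompose(:canonical, [233]) # => [101, 769]
  #   compose([101, 769])          # => [233]
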
  # Lazily instantiates and memoizes the UnicodeDatabase used for codepoint
  # lookups.
  def database
    @database ||= UnicodeDatabase.new
  end

  # Decompose composed characters to the decomposed form.
  def decompose(type, codepoints)
    codepoints.inject([]) do |decomposed, cp|
      # if it's a hangul syllable starter character
      if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
        sindex = cp - HANGUL_SBASE
        ncp = [] # new codepoints
        ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT
        ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
        tindex = sindex % HANGUL_TCOUNT
        ncp << (HANGUL_TBASE + tindex) unless tindex == 0
        decomposed.concat ncp
      # if the codepoint is decomposable with the current decomposition type
      elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatibility)
        decomposed.concat decompose(type, ncp.dup)
      else
        decomposed << cp
      end
    end
  end

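  # For example, a precomposed Hangul syllable is split arithmetically into
  # its jamo (illustrative; 0xAC00 is 가):
  #
  #   decompose(:canonical, [0xAC00]) # => [4352, 4449]  (U+1100, U+1161)
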
  def downcase(string)
    apply_mapping string, :lowercase_mapping
  end

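  # The case methods go through apply_mapping, so non-ASCII letters with a
  # simple one-to-one mapping are converted too. Illustrative sketch:
  #
  #   Unicode.downcase("ÉTUDE") # => "étude"
  #   Unicode.upcase("étude")   # => "ÉTUDE"
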
  # Detect whether the codepoint is in a certain character class. Returns
  # +true+ when it's in the specified character class and +false+ otherwise.
  # Valid character classes are: :cr, :lf, :l, :v, :lv, :lvt and :t.
  def in_char_class?(codepoint, classes)
    classes.detect { |c| database.boundary[c] === codepoint } ? true : false
  end

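  # Illustrative checks (0x000D is CR; 0x1100 is a Hangul leading jamo, which
  # belongs to :l rather than :v or :t):
  #
  #   in_char_class?(0x000D, [:cr, :lf]) # => true
  #   in_char_class?(0x1100, [:v, :t])   # => false
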
  # Returns the KC normalization of the string by default. NFKC is
  # considered the best normalization form for passing strings to databases
  # and validations.
  #
  # * string - The string to perform normalization on.
  # * form - The form you want to normalize in. Should be one of
  #   the following: :c, :kc, :d, or :kd.
  def normalize(string, form=nil)
    form ||= @default_normalization_form
    # See http://www.unicode.org/reports/tr15, Table 1
    codepoints = string.codepoints.to_a
    case form
    when :d
      reorder_characters(decompose(:canonical, codepoints))
    when :c
      compose(reorder_characters(decompose(:canonical, codepoints)))
    when :kd
      reorder_characters(decompose(:compatibility, codepoints))
    when :kc
      compose(reorder_characters(decompose(:compatibility, codepoints)))
    else
      raise ArgumentError, "#{form} is not a valid normalization variant", caller
    end.pack('U*'.freeze)
  end

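  # Illustrative sketch of the forms on "e" followed by COMBINING ACUTE ACCENT
  # (U+0301) and on the ligature "ﬁ" (U+FB01):
  #
  #   Unicode.normalize("e\u0301", :c).codepoints # => [233]
  #   Unicode.normalize("é", :d).codepoints       # => [101, 769]
  #   Unicode.normalize("ﬁ", :kc)                 # => "fi"
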
  # Reverse operation of unpack_graphemes.
  def pack_graphemes(unpacked)
    unpacked.flatten.pack('U*')
  end

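  # Round-trip sketch with unpack_graphemes (see its example further down):
  #
  #   Unicode.pack_graphemes(Unicode.unpack_graphemes('क्षि')) # => 'क्षि'
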
  # Reinterprets the string's bytes as Windows-1252 and transcodes them to
  # UTF-8, replacing anything that cannot be converted.
  def recode_windows1252_chars(string)
    string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
  end

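  # For example, byte 0x80 is the euro sign in CP1252:
  #
  #   recode_windows1252_chars("\x80") # => "€"
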
  # Re-order codepoints so the string becomes canonical: combining marks are
  # sorted by combining class.
  def reorder_characters(codepoints)
    length = codepoints.length - 1
    pos = 0
    while pos < length do
      cp1, cp2 = database.codepoints[codepoints[pos]], database.codepoints[codepoints[pos+1]]
      if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0)
        codepoints[pos..pos+1] = cp2.code, cp1.code
        pos += (pos > 0 ? -1 : 1)
      else
        pos += 1
      end
    end
    codepoints
  end

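  # Illustrative: COMBINING DOT BELOW (class 220) sorts before COMBINING ACUTE
  # ACCENT (class 230), regardless of input order:
  #
  #   reorder_characters([0x61, 0x0301, 0x0323]) # => [97, 803, 769]
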
  def swapcase(string)
    apply_mapping string, :swapcase_mapping
  end

  # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
  # resulting in a valid UTF-8 string.
  #
  # Passing +true+ will forcibly tidy all bytes, assuming that the string's
  # encoding is actually CP1252 or ISO-8859-1.
  #
  # Prefer String#scrub where it exists (Ruby 2.1+); otherwise fall back to a
  # byte-by-byte conversion through Encoding::Converter.
  if ''.respond_to?(:scrub)
    def tidy_bytes(string, force = false)
      return string if string.empty?
      return recode_windows1252_chars(string) if force
      string.scrub { |bad| recode_windows1252_chars(bad) }
    end
  else
    def tidy_bytes(string, force = false)
      return string if string.empty?
      return recode_windows1252_chars(string) if force

      # We can't transcode to the same format, so we choose a nearly-identical encoding.
      # We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
      # CP1252 when we get errors. The final string will be 'converted' back to UTF-8
      # before returning.
      reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_16LE)

      source = string.dup
      out = ''.force_encoding(Encoding::UTF_16LE)

      loop do
        reader.primitive_convert(source, out)
        _, _, _, error_bytes, _ = reader.primitive_errinfo
        break if error_bytes.nil?
        out << error_bytes.encode(Encoding::UTF_16LE, Encoding::Windows_1252, invalid: :replace, undef: :replace)
      end

      reader.finish

      out.encode!(Encoding::UTF_8)
    end
  end

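  # Illustrative: a stray CP1252 byte inside otherwise valid UTF-8 is recoded
  # (0xE9 is "é" in CP1252):
  #
  #   Unicode.tidy_bytes("caf\xE9") # => "café"
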
  # Unpack the string at grapheme boundaries. Returns a list of character
  # lists.
  #
  #   Unicode.unpack_graphemes('क्षि') # => [[2325, 2381], [2359], [2367]]
  def unpack_graphemes(string)
    codepoints = string.codepoints.to_a
    unpacked = []
    pos = 0
    marker = 0
    eoc = codepoints.length
    while pos < eoc
      pos += 1
      previous = codepoints[pos-1]
      current = codepoints[pos]

      should_break =
        # GB3. CR X LF
        if previous == database.boundary[:cr] and current == database.boundary[:lf]
          false
        # GB4. (Control|CR|LF) ÷
        elsif previous and in_char_class?(previous, [:control,:cr,:lf])
          true
        # GB5. ÷ (Control|CR|LF)
        elsif in_char_class?(current, [:control,:cr,:lf])
          true
        # GB6. L X (L|V|LV|LVT)
        elsif database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt])
          false
        # GB7. (LV|V) X (V|T)
        elsif in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t])
          false
        # GB8. (LVT|T) X (T)
        elsif in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current
          false
        # GB8a. Regional_Indicator X Regional_Indicator
        elsif database.boundary[:regional_indicator] === previous and database.boundary[:regional_indicator] === current
          false
        # GB9. X Extend
        elsif database.boundary[:extend] === current
          false
        # GB9a. X SpacingMark
        elsif database.boundary[:spacingmark] === current
          false
        # GB9b. Prepend X
        elsif database.boundary[:prepend] === previous
          false
        # GB10. Any ÷ Any
        else
          true
        end

      if should_break
        unpacked << codepoints[marker..pos-1]
        marker = pos
      end
    end
    unpacked
  end

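  # A Latin string without combining marks unpacks to one codepoint per
  # grapheme (illustrative):
  #
  #   Unicode.unpack_graphemes('Café') # => [[67], [97], [102], [233]]
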
  def upcase(string)
    apply_mapping string, :uppercase_mapping
  end