module ActiveSupport::Multibyte::Unicode
  # Returns a regular expression pattern that matches the passed Unicode
  # codepoints.
  def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
    array_of_codepoints.collect { |e| [e].pack 'U*'.freeze }.join('|'.freeze)
  end

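  # Usage sketch: the returned pattern is meant to be interpolated into a
  # regular expression (codepoints shown are U+00E9 and U+00E8, picked for
  # illustration):
  #
  #   pattern = Unicode.codepoints_to_pattern([0x00E9, 0x00E8]) # => "é|è"
  #   "café" =~ /#{pattern}/                                    # => 3
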
  # Maps each codepoint of the string through the given per-codepoint case
  # mapping from the Unicode database (e.g. :uppercase_mapping).
  def apply_mapping(string, mapping) #:nodoc:
    database.codepoints # touch the database so the codepoint table is loaded
    string.each_codepoint.map do |codepoint|
      cp = database.codepoints[codepoint]
      if cp and (ncp = cp.send(mapping)) and ncp > 0
        ncp
      else
        codepoint
      end
    end.pack('U*')
  end

  # Compose decomposed characters to the composed form.
  def compose(codepoints)
    pos = 0
    eoa = codepoints.length - 1
    starter_pos = 0
    starter_char = codepoints[0]
    previous_combining_class = -1
    while pos < eoa
      pos += 1
      lindex = starter_char - HANGUL_LBASE
      # -- Hangul
      if 0 <= lindex and lindex < HANGUL_LCOUNT
        vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
        if 0 <= vindex and vindex < HANGUL_VCOUNT
          tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
          if 0 <= tindex and tindex < HANGUL_TCOUNT
            j = starter_pos + 2
            eoa -= 2
          else
            tindex = 0
            j = starter_pos + 1
            eoa -= 1
          end
          codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
        end
        starter_pos += 1
        starter_char = codepoints[starter_pos]
      # -- Other characters
      else
        current_char = codepoints[pos]
        current = database.codepoints[current_char]
        if current.combining_class > previous_combining_class
          if ref = database.composition_map[starter_char]
            composition = ref[current_char]
          else
            composition = nil
          end
          unless composition.nil?
            codepoints[starter_pos] = composition
            starter_char = composition
            codepoints.delete_at pos
            eoa -= 1
            pos -= 1
            previous_combining_class = -1
          else
            previous_combining_class = current.combining_class
          end
        else
          previous_combining_class = current.combining_class
        end
        if current.combining_class == 0
          starter_pos = pos
          starter_char = codepoints[pos]
        end
      end
    end
    codepoints
  end

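  # Illustrative round trip on raw codepoint arrays (233 is U+00E9, "é";
  # shown as bare calls because these are internal helpers):
  #
  #   decompose(:canonical, [233]) # => [101, 769]
  #   compose([101, 769])          # => [233]
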
  # Lazily instantiates and memoizes the UnicodeDatabase used for codepoint
  # lookups.
  def database
    @database ||= UnicodeDatabase.new
  end

  # Decompose composed characters to the decomposed form.
  def decompose(type, codepoints)
    codepoints.inject([]) do |decomposed, cp|
      # if it's a hangul syllable starter character
      if HANGUL_SBASE <= cp and cp < HANGUL_SLAST
        sindex = cp - HANGUL_SBASE
        ncp = [] # new codepoints
        ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT
        ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT
        tindex = sindex % HANGUL_TCOUNT
        ncp << (HANGUL_TBASE + tindex) unless tindex == 0
        decomposed.concat ncp
      # if the codepoint is decomposable with the current decomposition type
      elsif (ncp = database.codepoints[cp].decomp_mapping) and (!database.codepoints[cp].decomp_type || type == :compatibility)
        decomposed.concat decompose(type, ncp.dup)
      else
        decomposed << cp
      end
    end
  end

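  # For example, a precomposed Hangul syllable is split arithmetically into
  # its jamo (illustrative; 0xAC00 is 가):
  #
  #   decompose(:canonical, [0xAC00]) # => [4352, 4449]  (U+1100, U+1161)
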
  def downcase(string)
    apply_mapping string, :lowercase_mapping
  end

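  # The case methods go through apply_mapping, so non-ASCII letters with a
  # simple one-to-one mapping are converted too. Illustrative sketch:
  #
  #   Unicode.downcase("ÉTUDE") # => "étude"
  #   Unicode.upcase("étude")   # => "ÉTUDE"
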
  # Detect whether the codepoint is in a certain character class. Returns
  # +true+ when it's in the specified character class and +false+ otherwise.
  # Valid character classes are: :cr, :lf, :l, :v, :lv, :lvt and :t.
  def in_char_class?(codepoint, classes)
    classes.detect { |c| database.boundary[c] === codepoint } ? true : false
  end

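  # Illustrative checks (0x000D is CR; 0x1100 is a Hangul leading jamo, which
  # belongs to :l rather than :v or :t):
  #
  #   in_char_class?(0x000D, [:cr, :lf]) # => true
  #   in_char_class?(0x1100, [:v, :t])   # => false
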
  # Returns the KC normalization of the string by default. NFKC is
  # considered the best normalization form for passing strings to databases
  # and validations.
  #
  # * string - The string to perform normalization on.
  # * form - The form you want to normalize in. Should be one of
  #   the following: :c, :kc, :d, or :kd.
  def normalize(string, form=nil)
    form ||= @default_normalization_form
    # See http://www.unicode.org/reports/tr15, Table 1
    codepoints = string.codepoints.to_a
    case form
    when :d
      reorder_characters(decompose(:canonical, codepoints))
    when :c
      compose(reorder_characters(decompose(:canonical, codepoints)))
    when :kd
      reorder_characters(decompose(:compatibility, codepoints))
    when :kc
      compose(reorder_characters(decompose(:compatibility, codepoints)))
    else
      raise ArgumentError, "#{form} is not a valid normalization variant", caller
    end.pack('U*'.freeze)
  end

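  # Illustrative sketch of the forms on "e" followed by COMBINING ACUTE ACCENT
  # (U+0301) and on the ligature "ﬁ" (U+FB01):
  #
  #   Unicode.normalize("e\u0301", :c).codepoints # => [233]
  #   Unicode.normalize("é", :d).codepoints       # => [101, 769]
  #   Unicode.normalize("ﬁ", :kc)                 # => "fi"
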
  # Reverse operation of unpack_graphemes.
  def pack_graphemes(unpacked)
    unpacked.flatten.pack('U*')
  end

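  # Round-trip sketch with unpack_graphemes (see its example further down):
  #
  #   Unicode.pack_graphemes(Unicode.unpack_graphemes('क्षि')) # => 'क्षि'
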
  # Reinterprets the string's bytes as Windows-1252 and transcodes them to
  # UTF-8, replacing anything that cannot be converted.
  def recode_windows1252_chars(string)
    string.encode(Encoding::UTF_8, Encoding::Windows_1252, invalid: :replace, undef: :replace)
  end

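  # For example, byte 0x80 is the euro sign in CP1252:
  #
  #   recode_windows1252_chars("\x80") # => "€"
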
  # Re-order codepoints so the string becomes canonical: combining marks are
  # sorted by combining class.
  def reorder_characters(codepoints)
    length = codepoints.length - 1
    pos = 0
    while pos < length do
      cp1, cp2 = database.codepoints[codepoints[pos]], database.codepoints[codepoints[pos+1]]
      if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0)
        codepoints[pos..pos+1] = cp2.code, cp1.code
        pos += (pos > 0 ? -1 : 1)
      else
        pos += 1
      end
    end
    codepoints
  end

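  # Illustrative: COMBINING DOT BELOW (class 220) sorts before COMBINING ACUTE
  # ACCENT (class 230), regardless of input order:
  #
  #   reorder_characters([0x61, 0x0301, 0x0323]) # => [97, 803, 769]
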
  def swapcase(string)
    apply_mapping string, :swapcase_mapping
  end

  # Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent
  # resulting in a valid UTF-8 string.
  #
  # Passing +true+ will forcibly tidy all bytes, assuming that the string's
  # encoding is actually CP1252 or ISO-8859-1.
  #
  # Prefer String#scrub where it exists (Ruby 2.1+); otherwise fall back to a
  # byte-by-byte conversion through Encoding::Converter.
  if ''.respond_to?(:scrub)
    def tidy_bytes(string, force = false)
      return string if string.empty?
      return recode_windows1252_chars(string) if force
      string.scrub { |bad| recode_windows1252_chars(bad) }
    end
  else
    def tidy_bytes(string, force = false)
      return string if string.empty?
      return recode_windows1252_chars(string) if force

      # We can't transcode to the same format, so we choose a nearly-identical encoding.
      # We're going to 'transcode' bytes from UTF-8 when possible, then fall back to
      # CP1252 when we get errors. The final string will be 'converted' back to UTF-8
      # before returning.
      reader = Encoding::Converter.new(Encoding::UTF_8, Encoding::UTF_16LE)

      source = string.dup
      out = ''.force_encoding(Encoding::UTF_16LE)

      loop do
        reader.primitive_convert(source, out)
        _, _, _, error_bytes, _ = reader.primitive_errinfo
        break if error_bytes.nil?
        out << error_bytes.encode(Encoding::UTF_16LE, Encoding::Windows_1252, invalid: :replace, undef: :replace)
      end

      reader.finish

      out.encode!(Encoding::UTF_8)
    end
  end

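  # Illustrative: a stray CP1252 byte inside otherwise valid UTF-8 is recoded
  # (0xE9 is "é" in CP1252):
  #
  #   Unicode.tidy_bytes("caf\xE9") # => "café"
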
  # Unpack the string at grapheme boundaries. Returns a list of character
  # lists.
  #
  #   Unicode.unpack_graphemes('क्षि') # => [[2325, 2381], [2359], [2367]]
  def unpack_graphemes(string)
    codepoints = string.codepoints.to_a
    unpacked = []
    pos = 0
    marker = 0
    eoc = codepoints.length
    while pos < eoc
      pos += 1
      previous = codepoints[pos-1]
      current = codepoints[pos]

      should_break =
        # GB3. CR X LF
        if previous == database.boundary[:cr] and current == database.boundary[:lf]
          false
        # GB4. (Control|CR|LF) ÷
        elsif previous and in_char_class?(previous, [:control,:cr,:lf])
          true
        # GB5. ÷ (Control|CR|LF)
        elsif in_char_class?(current, [:control,:cr,:lf])
          true
        # GB6. L X (L|V|LV|LVT)
        elsif database.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt])
          false
        # GB7. (LV|V) X (V|T)
        elsif in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t])
          false
        # GB8. (LVT|T) X (T)
        elsif in_char_class?(previous, [:lvt,:t]) and database.boundary[:t] === current
          false
        # GB8a. Regional_Indicator X Regional_Indicator
        elsif database.boundary[:regional_indicator] === previous and database.boundary[:regional_indicator] === current
          false
        # GB9. X Extend
        elsif database.boundary[:extend] === current
          false
        # GB9a. X SpacingMark
        elsif database.boundary[:spacingmark] === current
          false
        # GB9b. Prepend X
        elsif database.boundary[:prepend] === previous
          false
        # GB10. Any ÷ Any
        else
          true
        end

      if should_break
        unpacked << codepoints[marker..pos-1]
        marker = pos
      end
    end
    unpacked
  end

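  # A Latin string without combining marks unpacks to one codepoint per
  # grapheme (illustrative):
  #
  #   Unicode.unpack_graphemes('Café') # => [[67], [97], [102], [233]]
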
  def upcase(string)
    apply_mapping string, :uppercase_mapping
  end