module ActiveSupport::Multibyte::Unicode

def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:

:nodoc:
Returns a regular expression pattern that matches the passed Unicode codepoints

# Returns a regular expression pattern (as a String) that matches any one of
# the passed Unicode codepoints.
#
# Every codepoint is packed into its UTF-8 character and passed through
# Regexp.escape before the alternatives are joined with "|". Without the
# escaping, a codepoint that is itself a regexp metacharacter (0x2E ".",
# 0x7C "|", 0x2A "*", ...) would change the meaning of the pattern instead of
# matching that character literally.
#
# * array_of_codepoints - Array of Integer codepoints.
def self.codepoints_to_pattern(array_of_codepoints) #:nodoc:
  array_of_codepoints.map { |codepoint| Regexp.escape([codepoint].pack('U*')) }.join('|')
end

def apply_mapping(string, mapping) #:nodoc:

:nodoc:

# Rewrites +string+ codepoint by codepoint using the database attribute named
# by +mapping+.
#
# For each codepoint the matching database entry is asked for +mapping+ (via
# +send+); when that yields a value greater than zero it replaces the
# codepoint, otherwise the codepoint is kept unchanged. The resulting
# codepoints are packed back into a string.
def apply_mapping(string, mapping) #:nodoc:
  mapped = u_unpack(string).map do |codepoint|
    entry = database.codepoints[codepoint]
    replacement = entry && entry.send(mapping)
    (replacement && replacement > 0) ? replacement : codepoint
  end
  mapped.pack('U*')
end

def compose_codepoints(codepoints)

Compose decomposed characters to the composed form.

# Compose decomposed characters to the composed form.
#
# Scans +codepoints+ from left to right while remembering the current
# "starter" (its position and value) and the combining class of the
# previously examined codepoint. The array is modified in place and is also
# the return value.
#
# Two cases are handled on every step:
# * Hangul: when the starter lies in the range beginning at HANGUL_LBASE, the
#   following vowel (and optional trailing consonant) are folded into a single
#   codepoint computed arithmetically from the HANGUL_* constants.
# * Everything else: the pair (starter, current) is looked up in
#   database.composition_map; on a hit the starter is replaced by the
#   composition and the current codepoint is deleted.
#
# NOTE(review): presumably expects input that was already decomposed and
# reordered, as +normalize+ does before calling this - confirm for other callers.
def compose_codepoints(codepoints)
  pos = 0
  # Index of the last element; reduced whenever elements are merged away.
  eoa = codepoints.length - 1
  starter_pos = 0
  starter_char = codepoints[0]
  # Reset to -1 every time a composition has been made.
  previous_combining_class = -1
  while pos < eoa
    # Advance first: index 0 is the initial starter, so inspection begins at index 1.
    pos += 1
    # Offset of the starter from the first Hangul leading consonant.
    lindex = starter_char - HANGUL_LBASE
    # -- Hangul
    if 0 <= lindex and lindex < HANGUL_LCOUNT
      # Inline rescue: if the subtraction raises (e.g. there is no element at
      # that index), the index becomes -1 and fails the range check below.
      vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1
      if 0 <= vindex and vindex < HANGUL_VCOUNT
        tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1
        if 0 <= tindex and tindex < HANGUL_TCOUNT
          # Leading + vowel + trailing: three elements collapse into one.
          j = starter_pos + 2
          eoa -= 2
        else
          # No usable trailing consonant: only leading + vowel collapse.
          tindex = 0
          j = starter_pos + 1
          eoa -= 1
        end
        # Assigning to the range replaces elements starter_pos..j with the single syllable codepoint.
        codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE
      end
      # The starter moves one element forward whether or not a syllable was composed.
      # NOTE(review): pos is not rewound here after elements are removed, unlike
      # the non-Hangul branch below - verify this is intended.
      starter_pos += 1
      starter_char = codepoints[starter_pos]
    # -- Other characters
    else
      current_char = codepoints[pos]
      current = database.codepoints[current_char]
      # Composition is only attempted when the combining class rises above the previous one.
      if current.combining_class > previous_combining_class
        # composition_map is indexed by starter first, then by the following codepoint.
        if ref = database.composition_map[starter_char]
          composition = ref[current_char]
        else
          composition = nil
        end
        unless composition.nil?
          # Replace the starter with the composed codepoint and drop the
          # consumed one; pos steps back so the element that slid into this
          # slot is examined on the next iteration.
          codepoints[starter_pos] = composition
          starter_char = composition
          codepoints.delete_at pos
          eoa -= 1
          pos -= 1
          previous_combining_class = -1
        else
          previous_combining_class = current.combining_class
        end
      else
        previous_combining_class = current.combining_class
      end
      # A codepoint with combining class 0 becomes the new starter.
      if current.combining_class == 0
        starter_pos = pos
        starter_char = codepoints[pos]
      end
    end
  end
  codepoints
end

def database

# Returns the UnicodeDatabase instance, creating it on first use and caching
# it in @database for subsequent calls.
def database
  return @database if @database

  @database = UnicodeDatabase.new
end

def decompose_codepoints(type, codepoints)

Decompose composed characters to the decomposed form.

# Decompose composed characters to the decomposed form.
#
# * type - :canonical or :compatability (spelling as used by the callers in
#   this file). Entries that carry a decomp_type are only expanded when
#   +type+ is :compatability.
# * codepoints - Array of Integer codepoints.
#
# Returns a new Array of codepoints. Decomposition mappings are expanded
# recursively.
def decompose_codepoints(type, codepoints)
  decomposed = []
  codepoints.each do |codepoint|
    if codepoint >= HANGUL_SBASE && codepoint < HANGUL_SLAST
      # Hangul syllables are split arithmetically into leading consonant,
      # vowel and (when present) trailing consonant.
      syllable_index = codepoint - HANGUL_SBASE
      trailing_index = syllable_index % HANGUL_TCOUNT
      decomposed << (HANGUL_LBASE + syllable_index / HANGUL_NCOUNT)
      decomposed << (HANGUL_VBASE + (syllable_index % HANGUL_NCOUNT) / HANGUL_TCOUNT)
      decomposed << (HANGUL_TBASE + trailing_index) unless trailing_index == 0
      next
    end

    entry = database.codepoints[codepoint]
    mapping = entry.decomp_mapping
    if mapping && (!entry.decomp_type || type == :compatability)
      # The mapping is duplicated before recursing, as in the lookup table it
      # belongs to the database entry.
      decomposed.concat(decompose_codepoints(type, mapping.dup))
    else
      decomposed << codepoint
    end
  end
  decomposed
end

def g_pack(unpacked)

Reverse operation of g_unpack.

Example:
Unicode.g_pack(Unicode.g_unpack('क्षि')) # => 'क्षि'

# Reverse operation of g_unpack: flattens the nested list of grapheme
# clusters into one list of codepoints and packs it into a string.
def g_pack(unpacked)
  codepoints = unpacked.flatten
  codepoints.pack('U*')
end

def g_unpack(string)

Unpack the string at grapheme boundaries. Returns a list of character lists.

Example:
Unicode.g_unpack('क्षि') # => [[2325, 2381], [2359], [2367]]
Unicode.g_unpack('Café') # => [[67], [97], [102], [233]]

# Unpack the string at grapheme boundaries. Returns a list of character lists
# (one Array of codepoints per grapheme cluster).
#
# Each adjacent pair of codepoints is tested against the "do not break" rules
# listed below; whenever none of them applies, the cluster collected so far is
# closed. At the last position +current+ is nil.
def g_unpack(string)
  codepoints = u_unpack(string)
  clusters = []
  cluster_start = 0
  1.upto(codepoints.length) do |pos|
    previous = codepoints[pos - 1]
    current = codepoints[pos]
    no_break =
      (previous == database.boundary[:cr] && current == database.boundary[:lf]) ||            # CR X LF
      (database.boundary[:l] === previous && in_char_class?(current, [:l, :v, :lv, :lvt])) || # L X (L|V|LV|LVT)
      (in_char_class?(previous, [:lv, :v]) && in_char_class?(current, [:v, :t])) ||           # (LV|V) X (V|T)
      (in_char_class?(previous, [:lvt, :t]) && database.boundary[:t] === current) ||          # (LVT|T) X (T)
      (database.boundary[:extend] === current)                                                # X Extend
    next if no_break

    clusters << codepoints[cluster_start..pos - 1]
    cluster_start = pos
  end
  clusters
end

def in_char_class?(codepoint, classes)

Detect whether the codepoint is in a certain character class. Returns +true+ when it's in the specified
character class and +false+ otherwise. Valid character classes are: :cr, :lf, :l,
:v, :lv, :lvt and :t.

Primarily used by the grapheme cluster support.

# Detect whether the codepoint is in one of the given character classes.
#
# * codepoint - the codepoint to test.
# * classes - list of keys into database.boundary; each boundary entry is
#   matched against the codepoint with ===.
#
# Returns +true+ as soon as one class matches and +false+ otherwise.
def in_char_class?(codepoint, classes)
  classes.any? { |char_class| database.boundary[char_class] === codepoint }
end

def normalize(string, form=nil)

Returns the KC normalization of the string by default. NFKC is considered the best normalization form for
passing strings to databases and validations.

* string - The string to perform normalization on.
* form - The form you want to normalize in. Should be one of the following:
  :c, :kc, :d, or :kd. Default is
  ActiveSupport::Multibyte.default_normalization_form

# Returns the normalization of +string+ in the requested form.
#
# * string - The string to perform normalization on.
# * form - One of :c, :kc, :d or :kd. When omitted, the value of
#   @default_normalization_form is used.
#
# Raises ArgumentError for any other +form+. The string is unpacked before the
# form is checked, so an encoding error from u_unpack takes precedence.
def normalize(string, form=nil)
  form ||= @default_normalization_form
  # See http://www.unicode.org/reports/tr15, Table 1
  codepoints = u_unpack(string)
  decomposition, recompose =
    case form
    when :d  then [:canonical, false]
    when :c  then [:canonical, true]
    when :kd then [:compatability, false]
    when :kc then [:compatability, true]
    else
      raise ArgumentError, "#{form} is not a valid normalization variant", caller
    end
  # Every form decomposes and reorders; the composed forms add a final composition pass.
  normalized = reorder_characters(decompose_codepoints(decomposition, codepoints))
  normalized = compose_codepoints(normalized) if recompose
  normalized.pack('U*')
end

def reorder_characters(codepoints)

Re-order codepoints so the string becomes canonical.

# Re-order codepoints so the string becomes canonical.
#
# Adjacent codepoints are swapped whenever the left one has a higher combining
# class than the right one and the right one's combining class is non-zero.
# After a swap the scan steps back one position (or forward when already at
# the start) so the moved codepoint is compared with its new neighbour.
#
# The array is modified in place and returned.
def reorder_characters(codepoints)
  last_index = codepoints.length - 1
  index = 0
  while index < last_index
    left = database.codepoints[codepoints[index]]
    right = database.codepoints[codepoints[index + 1]]
    if right.combining_class > 0 && left.combining_class > right.combining_class
      codepoints[index] = right.code
      codepoints[index + 1] = left.code
      index += index > 0 ? -1 : 1
    else
      index += 1
    end
  end
  codepoints
end

def tidy_byte(byte)

# Converts a single byte into the list of bytes that encode it in UTF-8.
#
# * Bytes from 192 upwards become the two bytes [195, byte - 64].
# * Bytes from 160 to 191 become the two bytes [194, byte].
# * Bytes below 160 are looked up in database.cp1252; the mapped codepoint
#   (or the byte itself when there is no mapping) is packed as UTF-8.
def tidy_byte(byte)
  return [195, byte - 64] if byte >= 192
  return [194, byte] if byte >= 160

  codepoint = database.cp1252[byte] || byte
  [codepoint].pack("U").unpack("C*")
end

def tidy_bytes(string, force = false)

Passing +true+ will forcibly tidy all bytes, assuming that the string's encoding is entirely CP1252 or ISO-8859-1.

Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent resulting in a valid UTF-8 string.

# Replaces all ISO-8859-1 or CP1252 characters by their UTF-8 equivalent.
#
# Passing +true+ will forcibly tidy all bytes, assuming that the string's
# encoding is entirely CP1252 or ISO-8859-1.
#
# Without +force+ the bytes are scanned once, tracking the last leading byte
# and how many continuation bytes are still expected. Only this
# lead/continuation structure is checked. Bytes that do not fit it are
# re-encoded with tidy_byte:
# * bytes above 240 (treated as impossible or highly unlikely),
# * continuation bytes that are not expected,
# * an unfinished sequence that is interrupted by any non-continuation byte,
# * an unfinished sequence at the very end of the input.
#
# The last case and the "interrupted by a byte above 240" case used to leave a
# dangling leading byte in the output, which made the final unpack("U*") raise.
def tidy_bytes(string, force = false)
  if force
    return string.unpack("C*").map { |b| tidy_byte(b) }.flatten.compact.pack("C*").unpack("U*").pack("U*")
  end

  bytes = string.unpack("C*")
  conts_expected = 0
  last_lead = 0

  # Re-encodes the unfinished sequence that started at last_lead and runs up
  # to (not including) +stop+, then forgets about it. While a sequence is
  # open, every element in that range is still a raw Integer byte.
  clean_unfinished = lambda do |stop|
    (last_lead...stop).each { |j| bytes[j] = tidy_byte(bytes[j]) }
    conts_expected = 0
  end

  bytes.each_index do |i|
    byte          = bytes[i]
    is_cont       = byte > 127 && byte < 192
    is_lead       = byte > 191 && byte < 245
    is_unused     = byte > 240
    is_restricted = byte > 244

    # Impossible or highly unlikely byte? Clean it.
    if is_unused || is_restricted
      # It also ends any sequence that was still waiting for continuations.
      clean_unfinished.call(i) if conts_expected > 0
      bytes[i] = tidy_byte(byte)
    elsif is_cont
      # Not expecting continuation byte? Clean up. Otherwise, now expect one less.
      if conts_expected == 0
        bytes[i] = tidy_byte(byte)
      else
        conts_expected -= 1
      end
    else
      # Expected continuation, but got ASCII or leading? Clean backwards up to
      # the leading byte.
      clean_unfinished.call(i) if conts_expected > 0

      if is_lead
        # Final byte is leading? Clean it.
        if i == bytes.length - 1
          bytes[i] = tidy_byte(bytes.last)
        else
          # Valid leading byte? Expect continuations determined by position of
          # first zero bit, with max of 3.
          conts_expected = byte < 224 ? 1 : byte < 240 ? 2 : 3
          last_lead = i
        end
      end
    end
  end

  # Input ended in the middle of a multi-byte sequence? Clean what is there.
  clean_unfinished.call(bytes.length) if conts_expected > 0

  bytes.empty? ? "" : bytes.flatten.compact.pack("C*").unpack("U*").pack("U*")
end

def u_unpack(string)

Unpack the string at codepoints boundaries. Raises an EncodingError when the encoding of the string isn't
valid UTF-8.

Example:
Unicode.u_unpack('Café') # => [67, 97, 102, 233]

# Unpack the string at codepoint boundaries and return the list of codepoints.
#
# An ArgumentError raised by String#unpack is re-raised as EncodingError.
def u_unpack(string)
  string.unpack('U*')
rescue ArgumentError
  raise EncodingError, 'malformed UTF-8 character'
end

Modules

Classes