lib/unicode/display_width.rb



# frozen_string_literal: true

require "unicode/emoji"

require_relative "display_width/constants"
require_relative "display_width/index"
require_relative "display_width/emoji_support"

module Unicode
  class DisplayWidth
    DEFAULT_AMBIGUOUS = 1
    INITIAL_DEPTH = 0x10000
    ASCII_NON_ZERO_REGEX = /[\0\x05\a\b\n-\x0F]/
    ASCII_NON_ZERO_STRING = "\0\x05\a\b\n-\x0F"
    ASCII_BACKSPACE = "\b"
    AMBIGUOUS_MAP = {
      1 => :WIDTH_ONE,
      2 => :WIDTH_TWO,
    }
    FIRST_AMBIGUOUS = {
      WIDTH_ONE: 768,
      WIDTH_TWO: 161,
    }
    NOT_COMMON_NARROW_REGEX = {
     WIDTH_ONE: /[^\u{10}-\u{2FF}]/m,
     WIDTH_TWO: /[^\u{10}-\u{A1}]/m,
    }
    FIRST_4096 = {
      WIDTH_ONE: decompress_index(INDEX[:WIDTH_ONE][0][0], 1),
      WIDTH_TWO: decompress_index(INDEX[:WIDTH_TWO][0][0], 1),
    }
    EMOJI_SEQUENCES_REGEX_MAPPING = {
      rgi: :REGEX_INCLUDE_MQE_UQE,
      rgi_at: :REGEX_INCLUDE_MQE_UQE,
      possible: :REGEX_WELL_FORMED,
    }
    REGEX_EMOJI_VS16 = Regexp.union(
      Regexp.compile(
        Unicode::Emoji::REGEX_TEXT_PRESENTATION.source +
        "(?<![#*0-9])" +
        "\u{FE0F}"
      ),
      Unicode::Emoji::REGEX_EMOJI_KEYCAP
    )

    # ebase = Unicode::Emoji::REGEX_PROP_MODIFIER_BASE.source
    REGEX_EMOJI_ALL_SEQUENCES = Regexp.union(/.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?(\u{200D}.[\u{1F3FB}-\u{1F3FF}\u{FE0F}]?)+|.[\u{1F3FB}-\u{1F3FF}]/, Unicode::Emoji::REGEX_EMOJI_KEYCAP)
    REGEX_EMOJI_ALL_SEQUENCES_AND_VS16 = Regexp.union(REGEX_EMOJI_ALL_SEQUENCES, REGEX_EMOJI_VS16)

    # Returns monospace display width of string
    def self.of(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
      # Binary strings don't make much sense when calculating display width.
      # Assume it's valid UTF-8
      if string.encoding == Encoding::BINARY && !string.force_encoding(Encoding::UTF_8).valid_encoding?
        # Didn't work out, go back to binary
        string.force_encoding(Encoding::BINARY)
      end

      string = string.encode(Encoding::UTF_8, invalid: :replace, undef: :replace) unless string.encoding == Encoding::UTF_8
      options = normalize_options(string, ambiguous, overwrite, old_options, **options)

      width = 0

      unless options[:overwrite].empty?
        width, string = width_custom(string, options[:overwrite])
      end

      if string.ascii_only?
        return width + width_ascii(string)
      end

      ambiguous_index_name = AMBIGUOUS_MAP[options[:ambiguous]]

      unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
        return width + string.size
      end

      # Retrieve Emoji width
      if options[:emoji] != :none
        e_width, string = emoji_width(
          string,
          options[:emoji],
          options[:ambiguous],
        )
        width += e_width

        unless string.match?(NOT_COMMON_NARROW_REGEX[ambiguous_index_name])
          return width + string.size
        end
      end

      index_full = INDEX[ambiguous_index_name]
      index_low = FIRST_4096[ambiguous_index_name]
      first_ambiguous = FIRST_AMBIGUOUS[ambiguous_index_name]

      string.each_codepoint{ |codepoint|
        if codepoint > 15 && codepoint < first_ambiguous
          width += 1
        elsif codepoint < 0x1001
          width += index_low[codepoint] || 1
        else
          d = INITIAL_DEPTH
          w = index_full[codepoint / d]
          while w.instance_of? Array
            w = w[(codepoint %= d) / (d /= 16)]
          end

          width += w || 1
        end
      }

      # Return result + prevent negative lengths
      width < 0 ? 0 : width
    end

    # Returns width of custom overwrites and remaining string
    def self.width_custom(string, overwrite)
      width = 0

      string = string.each_codepoint.select{ |codepoint|
        if overwrite[codepoint]
          width += overwrite[codepoint]
          nil
        else
          codepoint
        end
      }.pack("U*")

      [width, string]
    end

    # Returns width for ASCII-only strings. Will consider zero-width control symbols.
    def self.width_ascii(string)
      if string.match?(ASCII_NON_ZERO_REGEX)
        res = string.delete(ASCII_NON_ZERO_STRING).bytesize - string.count(ASCII_BACKSPACE)
        return res < 0 ? 0 : res
      end

      string.bytesize
    end

    # Returns width of all considered Emoji and remaining string
    def self.emoji_width(string, mode = :all, ambiguous = DEFAULT_AMBIGUOUS)
      res = 0

      if emoji_set_regex = EMOJI_SEQUENCES_REGEX_MAPPING[mode]
        emoji_width_via_possible(
          string,
          Unicode::Emoji.const_get(emoji_set_regex),
          mode == :rgi_at,
          ambiguous,
        )

      elsif mode == :all_no_vs16
        no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES){ res += 2; "" }
        [res, no_emoji_string]

      elsif mode == :vs16
        no_emoji_string = string.gsub(REGEX_EMOJI_VS16){ res += 2; "" }
        [res, no_emoji_string]

      elsif mode == :all
        no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ res += 2; "" }
        [res, no_emoji_string]

      else
        [0, string]

      end
    end

    # Match possible Emoji first, then refine
    def self.emoji_width_via_possible(string, emoji_set_regex, strict_eaw = false, ambiguous = DEFAULT_AMBIGUOUS)
      res = 0

      # For each string possibly an emoji
      no_emoji_string = string.gsub(REGEX_EMOJI_ALL_SEQUENCES_AND_VS16){ |emoji_candidate|
        # Check if we have a combined Emoji with width 2 (or EAW an Apple Terminal)
        if emoji_candidate == emoji_candidate[emoji_set_regex]
          if strict_eaw
            res += self.of(emoji_candidate[0], ambiguous, emoji: false)
          else
            res += 2
          end
          ""

        # We are dealing with a default text presentation emoji or a well-formed sequence not matching the above Emoji set
        else
          if !strict_eaw
            # Ensure all explicit VS16 sequences have width 2
            emoji_candidate.gsub!(REGEX_EMOJI_VS16){ res += 2; "" }
          end

          emoji_candidate
        end
      }

      [res, no_emoji_string]
    end

    def self.normalize_options(string, ambiguous = nil, overwrite = nil, old_options = {}, **options)
      unless old_options.empty?
        warn "Unicode::DisplayWidth: Please migrate to keyword arguments - #{old_options.inspect}"
        options.merge! old_options
      end

      options[:ambiguous] = ambiguous if ambiguous
      options[:ambiguous] ||= DEFAULT_AMBIGUOUS

      if options[:ambiguous] != 1 && options[:ambiguous] != 2
        raise ArgumentError, "Unicode::DisplayWidth: Ambiguous width must be 1 or 2"
      end

      if overwrite && !overwrite.empty?
        warn "Unicode::DisplayWidth: Please migrate to keyword arguments - overwrite: #{overwrite.inspect}"
        options[:overwrite] = overwrite
      end
      options[:overwrite] ||= {}

      if [nil, true, :auto].include?(options[:emoji])
        options[:emoji] = EmojiSupport.recommended
      elsif options[:emoji] == false
        options[:emoji] = :none
      end

      options
    end

    def initialize(ambiguous: DEFAULT_AMBIGUOUS, overwrite: {}, emoji: true)
      @ambiguous = ambiguous
      @overwrite = overwrite
      @emoji     = emoji
    end

    def get_config(**kwargs)
      {
        ambiguous: kwargs[:ambiguous] || @ambiguous,
        overwrite: kwargs[:overwrite] || @overwrite,
        emoji:     kwargs[:emoji]     || @emoji,
      }
    end

    def of(string, **kwargs)
      self.class.of(string, **get_config(**kwargs))
    end
  end
end