lib/utils/cjk.rb



module Metanorma
  module Utils
    class << self
      # Basic CJK scripts
      HAN = "\\p{Han}".freeze
      BOPOMOFO = "\\p{Bopomofo}".freeze
      HANGUL = "\\p{Hangul}".freeze
      HIRAGANA = "\\p{Hiragana}".freeze
      KATAKANA = "\\p{Katakana}".freeze
      
      # Script extensions - characters shared between scripts
      
      # CJK Symbols and Punctuation (U+3000–U+303F)
      # Used across all CJK scripts
      CJK_SYMBOLS = "[\\u3000-\\u303F]".freeze
      
      # CJK Punctuation (subset of CJK Symbols commonly used)
      CJK_PUNCTUATION = "[\\u3001-\\u3003\\u3008-\\u3011\\u3014-\\u301F]".freeze
      
      # Halfwidth and Fullwidth Forms (U+FF00–U+FFEF)
      # Used in all CJK contexts
      CJK_HALFWIDTH_FULLWIDTH = "[\\uFF00-\\uFFEF]".freeze
      
      # CJK Compatibility Forms (U+FE30–U+FE4F)
      # Primarily used with Han but relevant for all CJK
      CJK_COMPAT = "[\\uFE30-\\uFE4F]".freeze
      
      # Vertical Forms (U+FE10–U+FE1F)
      # Used in vertical text layout for all CJK
      CJK_VERTICAL = "[\\uFE10-\\uFE1F]".freeze
      
      # Small Form Variants (U+FE50–U+FE6F)
      # Used in all CJK contexts
      CJK_SMALL_FORMS = "[\\uFE50-\\uFE6F]".freeze
      
      # Ideographic Description Characters (U+2FF0–U+2FFF)
      # Used with Han script
      HAN_IDC = "[\\u2FF0-\\u2FFF]".freeze
      
      # Kanbun (U+3190–U+319F)
      # Used with Han script for Japanese
      KANBUN = "[\\u3190-\\u319F]".freeze
      
      # CJK Compatibility (U+3300–U+33FF)
      # Used with Han script
      CJK_COMPAT_IDEOGRAPHS = "[\\u3300-\\u33FF]".freeze
      
      # CJK Compatibility Ideographs (U+F900–U+FAFF)
      HAN_COMPAT_IDEOGRAPHS = "[\\uF900-\\uFAFF]".freeze
      
      # Script extensions by primary script
      HAN_EXTENSIONS = [
        HAN,
        CJK_SYMBOLS,
        CJK_PUNCTUATION,
        CJK_HALFWIDTH_FULLWIDTH,
        CJK_COMPAT,
        CJK_VERTICAL,
        CJK_SMALL_FORMS,
        HAN_IDC,
        KANBUN,
        CJK_COMPAT_IDEOGRAPHS,
        HAN_COMPAT_IDEOGRAPHS
      ].join("|").freeze
      
      HANGUL_EXTENSIONS = [
        HANGUL,
        CJK_SYMBOLS,
        CJK_PUNCTUATION,
        CJK_HALFWIDTH_FULLWIDTH,
        CJK_VERTICAL,
        CJK_SMALL_FORMS
      ].join("|").freeze
      
      HIRAGANA_EXTENSIONS = [
        HIRAGANA,
        CJK_SYMBOLS,
        CJK_PUNCTUATION,
        CJK_HALFWIDTH_FULLWIDTH,
        CJK_VERTICAL,
        CJK_SMALL_FORMS
      ].join("|").freeze
      
      KATAKANA_EXTENSIONS = [
        KATAKANA,
        CJK_SYMBOLS,
        CJK_PUNCTUATION,
        CJK_HALFWIDTH_FULLWIDTH,
        CJK_VERTICAL,
        CJK_SMALL_FORMS
      ].join("|").freeze
      
      BOPOMOFO_EXTENSIONS = [
        BOPOMOFO,
        CJK_SYMBOLS,
        CJK_PUNCTUATION,
        CJK_HALFWIDTH_FULLWIDTH
      ].join("|").freeze
      
      # Combined CJK pattern including all script extensions
      CJK = [
        HAN_EXTENSIONS,
        HANGUL_EXTENSIONS,
        HIRAGANA_EXTENSIONS,
        KATAKANA_EXTENSIONS,
        BOPOMOFO_EXTENSIONS
      ].join("|").freeze
    end
  end
end