lib/clacky/utils/encoding.rb
# frozen_string_literal: true module Clacky module Utils # Centralised UTF-8 encoding helpers used throughout the codebase. # # Three distinct use-cases exist: # # 1. to_utf8 – binary/unknown bytes → valid UTF-8 String. # Used when reading shell output, HTTP response bodies, # or any raw byte stream that is *expected* to be UTF-8 # but arrives with ASCII-8BIT (binary) encoding. # Strategy: force_encoding("UTF-8") then scrub invalid # sequences with U+FFFD so multibyte characters (CJK, # emoji, …) are preserved as-is. # # 2. sanitize_utf8 – UTF-8 String → clean UTF-8 String. # Used for UI rendering (terminal output, screen # buffers) where the string is already nominally UTF-8 # but may still contain isolated invalid bytes. # Strategy: encode UTF-8→UTF-8 replacing invalid / # undefined codepoints with an empty string so the # rendered output never contains replacement characters. # # 3. safe_check – any String → ASCII-safe UTF-8 String for regex. # Used only for security pattern matching (terminal/Security). # Multibyte bytes are replaced with '?' so that Ruby's # regex engine operates on a plain ASCII-compatible # string without raising Encoding errors. # module Encoding # Convert a binary (or unknown-encoding) byte string to a valid UTF-8 # String. Multibyte sequences that are already valid UTF-8 (e.g. CJK # characters) are preserved unchanged; only genuinely invalid byte # sequences are replaced with U+FFFD (the Unicode replacement character). # # @param data [String, nil] raw bytes, typically from a pipe or HTTP body # @return [String] valid UTF-8 string def self.to_utf8(data) return "" if data.nil? || data.empty? data.dup.force_encoding("UTF-8").scrub("\u{FFFD}") end # Clean an already-UTF-8 string by removing (not replacing) any invalid # or undefined byte sequences. Suitable for terminal / UI rendering where # replacement characters would appear as visual noise. # # @param str [String, nil] nominally UTF-8 string # @return [String] clean UTF-8 string (invalid bytes silently dropped) def self.sanitize_utf8(str) return "" if str.nil? || str.empty? str.encode("UTF-8", "UTF-8", invalid: :replace, undef: :replace, replace: "") end # Convert raw shell command output to valid UTF-8. # Handles two common cases: # - Windows commands (e.g. powershell.exe) that output GBK/CP936 bytes # - Unix commands that output UTF-8 or ASCII bytes with ASCII-8BIT encoding # # Strategy: try GBK decode first (superset of ASCII, covers Chinese Windows); # if that fails fall back to UTF-8 scrub. # # @param data [String, nil] raw bytes from backtick / IO.popen # @param source_encoding [String] hint for source encoding (default: "GBK") # @return [String] valid UTF-8 string def self.cmd_to_utf8(data, source_encoding: "GBK") return "" if data.nil? || data.empty? data.dup .force_encoding(source_encoding) .encode("UTF-8", invalid: :replace, undef: :replace, replace: "") rescue Encoding::InvalidByteSequenceError, Encoding::UndefinedConversionError to_utf8(data) end # Return an ASCII-safe UTF-8 copy of *str* suitable for security regex # pattern matching. Any byte that is not valid in the source encoding, or # that cannot be represented in UTF-8, is replaced with '?'. The # original string is never mutated. # # @param str [String, nil] # @return [String] UTF-8 string safe for regex matching def self.safe_check(str) return "" if str.nil? || str.empty? str.encode("UTF-8", invalid: :replace, undef: :replace, replace: "?") end end end end