lib/rspec/support/encoded_string.rb
module RSpec
  module Support
    # @private
    #
    # Holds a String together with a target encoding. The wrapped string, and
    # every string later appended to it or used to split it, is passed through
    # `matching_encoding` first; on Rubies with String#encoding that means
    # transcoding to the target encoding with unrepresentable bytes replaced.
    class EncodedString
      # Encoding names kept as constants so they are allocated only once.
      UTF_8    = "UTF-8"
      US_ASCII = "US-ASCII"

      # Substitute written in place of invalid or undefined bytes.
      # Ruby's own default replacement is:
      #   U+FFFD ("\xEF\xBF\xBD"), for Unicode encoding forms, else
      #   ?      ("\x3F")
      REPLACE = "?"

      attr_reader :source_encoding

      # @param string [String] the text to wrap
      # @param encoding [String, Encoding, nil] the encoding the wrapped text
      #   is converted to
      def initialize(string, encoding=nil)
        @encoding = encoding
        @source_encoding = detect_source_encoding(string)
        @string = matching_encoding(string)
      end

      # Forward a small, fixed set of String methods straight to the wrapped
      # string, but only those the running Ruby's String actually implements.
      string_method_names = String.instance_methods.map { |method_name| method_name.to_s }
      (string_method_names & %w[eql? lines == encoding empty?]).each do |name|
        define_method(name) do |*args, &block|
          @string.__send__(name, *args, &block)
        end
      end

      # Appends `string` to the wrapped string after converting it with
      # `matching_encoding`.
      def <<(string)
        @string << matching_encoding(string)
      end

      if Ruby.jruby?
        # Splits the wrapped string on `regex_or_string` (converted with
        # `matching_encoding`).
        def split(regex_or_string)
          begin
            @string.split(matching_encoding(regex_or_string))
          rescue ArgumentError
            # JRuby raises an ArgumentError when splitting a source string
            # that contains invalid bytes, so retry on a cleaned copy.
            cleaned = remove_invalid_bytes(@string)
            cleaned.split(regex_or_string)
          end
        end
      else
        # Splits the wrapped string on `regex_or_string` (converted with
        # `matching_encoding`).
        def split(regex_or_string)
          separator = matching_encoding(regex_or_string)
          @string.split(separator)
        end
      end

      # @return [String] the wrapped string
      def to_s
        @string
      end
      alias_method :to_str, :to_s

      if String.method_defined?(:encoding)

        private

        # Converts `string` to @encoding, substituting REPLACE for anything
        # that cannot be converted.
        #
        # Encoding Exceptions:
        #
        # Raised by Encoding and String methods:
        #   Encoding::UndefinedConversionError:
        #     when a transcoding operation fails
        #     if the String contains characters invalid for the target encoding
        #     e.g. "\x80".encode('UTF-8','ASCII-8BIT')
        #     vs "\x80".encode('UTF-8','ASCII-8BIT', undef: :replace, replace: '<undef>')
        #     # => '<undef>'
        #   Encoding::CompatibilityError
        #     when Encoding.compatible?(str1, str2) is nil
        #     e.g. utf_16le_emoji_string.split("\n")
        #     e.g. valid_unicode_string.encode(utf8_encoding) << ascii_string
        #   Encoding::InvalidByteSequenceError:
        #     when the string being transcoded contains a byte invalid for
        #     either the source or target encoding
        #     e.g. "\x80".encode('UTF-8','US-ASCII')
        #     vs "\x80".encode('UTF-8','US-ASCII', invalid: :replace, replace: '<byte>')
        #     # => '<byte>'
        #   ArgumentError
        #     when operating on a string with invalid bytes
        #     e.g. "\x80".split("\n")
        #   TypeError
        #     when a symbol is passed as an encoding
        #     Encoding.find(:"UTF-8")
        #     when calling force_encoding on an object
        #     that doesn't respond to #to_str
        #
        # Raised by transcoding methods:
        #   Encoding::ConverterNotFoundError:
        #     when a named encoding does not correspond with a known converter
        #     e.g. 'abc'.force_encoding('UTF-8').encode('foo')
        #     or a converter path cannot be found
        #     e.g. "\x80".force_encoding('ASCII-8BIT').encode('Emacs-Mule')
        #
        # Raised by byte <-> char conversions
        #   RangeError: out of char range
        #     e.g. the UTF-16LE emoji: 128169.chr
        def matching_encoding(string)
          begin
            string = remove_invalid_bytes(string)
            string.encode(@encoding)
          rescue Encoding::UndefinedConversionError, Encoding::InvalidByteSequenceError
            # The options were once a shared constant (to save allocations) but
            # have to be written inline, without braces, to avoid warnings on
            # Ruby 2.7.
            #
            # In MRI 2.1 'invalid: :replace' changed to also replace an invalid byte sequence
            # see https://github.com/ruby/ruby/blob/v2_1_0/NEWS#L176
            # https://www.ruby-forum.com/topic/6861247
            # https://twitter.com/nalsh/status/553413844685438976
            #
            # For example, given:
            #   "\x80".force_encoding("Emacs-Mule").encode(:invalid => :replace).bytes.to_a
            #
            # On MRI 2.1 or above: 63  # '?'
            # else               : 128 # "\x80"
            #
            string.encode(@encoding, :invalid => :replace, :undef => :replace, :replace => REPLACE)
          rescue Encoding::ConverterNotFoundError
            # As above, the options must stay inline (without braces) to avoid
            # warnings on Ruby 2.7. With no converter available, relabel a copy
            # as @encoding and replace whatever is then invalid.
            relabelled = string.dup.force_encoding(@encoding)
            relabelled.encode(:invalid => :replace, :replace => REPLACE)
          end
        end

        # Prevents raising ArgumentError
        if String.method_defined?(:scrub)
          # https://github.com/ruby/ruby/blob/eeb05e8c11/doc/NEWS-2.1.0#L120-L123
          # https://github.com/ruby/ruby/blob/v2_1_0/string.c#L8242
          # https://github.com/hsbt/string-scrub
          # https://github.com/rubinius/rubinius/blob/v2.5.2/kernel/common/string.rb#L1913-L1972
          def remove_invalid_bytes(string)
            string.scrub(REPLACE)
          end
        else
          # http://stackoverflow.com/a/8711118/879854
          # Walk the string character by character, swapping any character
          # with an invalid encoding for REPLACE; that is a pretty good proxy
          # for the invalid byte sequence that causes an ArgumentError.
          def remove_invalid_bytes(string)
            string.chars.map { |char| char.valid_encoding? ? char : REPLACE }.join
          end
        end

        # @return [Encoding] the encoding `string` is tagged with
        def detect_source_encoding(string)
          string.encoding
        end

        # Returns the encoding compatible with both sources, or
        # Encoding.default_external when they are not compatible.
        def self.pick_encoding(source_a, source_b)
          Encoding.compatible?(source_a, source_b) || Encoding.default_external
        end
      else

        # String has no #encoding on this Ruby, so there is nothing to pick.
        def self.pick_encoding(_source_a, _source_b)
        end

        private

        # String has no #encoding on this Ruby: hand the string back untouched.
        def matching_encoding(string)
          string
        end

        # String has no #encoding on this Ruby: report the US_ASCII name.
        def detect_source_encoding(_string)
          US_ASCII
        end
      end
    end
  end
end