lib/patron/response_decoding.rb
module Patron
  # Contains methods used for decoding the HTTP response body. These are only ever used internally
  # by the Response class.
  #
  # The methods read the `@body` and `@headers` instance variables of the object
  # this module is mixed into.
  module ResponseDecoding
    private

    # Extracts the charset name from a `charset=...` or `encoding=...` parameter,
    # with or without surrounding double quotes. Case-insensitive.
    CHARSET_CONTENT_TYPE_RE = /(?:charset|encoding)="?([a-z0-9-]+)"?/i.freeze

    MISREPORTED_ENCODING_ERROR = <<-EOF
  The server stated that the response has the charset matching %{declared}, but the actual
  response body failed to decode as such (not flagged as `valid_encoding?')
  Maybe the response body has a different encoding than suggested by the server, or a binary
  response has been tagged by the server as text by mistake. If you are performing requests
  against servers that are known to report wrong or invalid charsets, use `Response#body' instead
  and handle the character set coercion externally. For instance, you may elect to parse the resulting
  HTML/XML for charset declarations.
    EOF

    INVALID_CHARSET_NAME_ERROR = <<-EOF
  The server specified an invalid charset in the Content-Type header (%{content_type}), \
  or Ruby does not support this charset. If you are performing requests against servers \
  that are known to report wrong or invalid charsets, use 'Response#body` instead \
  and handle the character set coercion at call site.
    EOF

    INTERNAL_CHARSET_MISMATCH_ERROR = <<-EOF
  The response body is %{source_encoding}, but the current \
  `Encoding.default_internal' (or the encoding for a new empty string if you never \
  set `Encoding.default_internal') - %{target_encoding} - cannot be used to represent the response body in \
  a lossless way. Your options are:
  a) using `Response#body' instead
  b) switching your Ruby process to an encoding that supports the needed repertoire
  c) using `Response#inspectable_body' to convert the body in a lossy way
    EOF

    # Decodes `@body` into the internal encoding.
    #
    # Note that `@body` is re-tagged in place (via `force_encoding`) with the
    # encoding detected from the headers before any conversion happens.
    #
    # @param strict [Boolean] when truthy, characters that cannot be represented
    #   in the internal encoding raise; otherwise they are replaced with '?'
    # @return [String] the body converted to the internal encoding
    # @raise [HeaderCharsetInvalid] if the charset name is unknown to Ruby, or the
    #   body is not valid in the declared charset
    # @raise [NonRepresentableBody] in strict mode, if the body cannot be converted
    def decode_body(strict)
      # Try to detect the body encoding from headers
      body_encoding = encoding_from_headers_or_binary

      # See if the body actually _is_ in this encoding.
      encoding_matched = @body.force_encoding(body_encoding).valid_encoding?
      unless encoding_matched
        raise HeaderCharsetInvalid, MISREPORTED_ENCODING_ERROR % {declared: body_encoding}
      end

      if strict
        convert_encoding_and_raise(@body)
      else
        @body.encode(internal_encoding, :undefined => :replace, :replace => '?')
      end
    end

    # Converts `str` to the internal encoding without any replacement characters.
    #
    # @param str [String] the string to convert
    # @return [String] the converted string
    # @raise [NonRepresentableBody] if some character has no counterpart in the
    #   internal encoding
    def convert_encoding_and_raise(str)
      internal = internal_encoding
      str.encode(internal)
    rescue Encoding::UndefinedConversionError
      # Encoding::BINARY prints as "ASCII-8BIT"; use a friendlier name in the message.
      enc = str.encoding == Encoding::BINARY ? 'binary' : str.encoding.to_s
      raise NonRepresentableBody,
        INTERNAL_CHARSET_MISMATCH_ERROR % {source_encoding: enc, target_encoding: internal}
    end

    # @return [String, nil] the charset name found in the Content-Type header,
    #   or nil when the header is absent or carries no charset/encoding parameter
    def charset_from_content_type
      @headers["Content-Type"].to_s[CHARSET_CONTENT_TYPE_RE, 1]
    end

    # Resolves the encoding declared in the Content-Type header.
    #
    # @return [Encoding] the declared encoding, or Encoding::BINARY when the
    #   header declares none
    # @raise [HeaderCharsetInvalid] if Ruby does not know the declared charset name
    def encoding_from_headers_or_binary
      charset_name = charset_from_content_type
      return Encoding::BINARY unless charset_name

      Encoding.find(charset_name)
    rescue ArgumentError # invalid charset name
      raise HeaderCharsetInvalid,
        INVALID_CHARSET_NAME_ERROR % {content_type: @headers['Content-Type'].inspect}
    end

    # @return [Encoding] the encoding that decoded bodies are converted to
    def internal_encoding
      # Use a trick here - instead of using `default_internal` we will create
      # an empty string, and then get its encoding instead. For example, this holds
      # true on 2.1+ on OSX:
      #
      #     Encoding.default_internal #=> nil
      #     ''.encoding #=> #<Encoding:UTF-8>
      Encoding.default_internal || ''.encoding
    end

    # Converts raw header data using a chain of candidate encodings.
    #
    # @param str [String] the raw header data
    # @return [String] `str` converted to the first encoding in the chain that
    #   accepts it, or `str` itself if none does
    def decode_header_data(str)
      # Header data is tricky. Strictly speaking, it _must_ be ISO-encoded. However, Content-Disposition
      # sometimes gets sent as raw UTF8 - and most browsers (except for localized IE versions on Windows)
      # treat it as such. So a fallback chain of 8859-1->UTF8->binary seems the most sane.
      tries = [Encoding::ISO8859_1, Encoding::UTF_8, Encoding::BINARY]
      tries.each do |possible_enc|
        begin
          return str.encode(possible_enc)
        rescue ::Encoding::UndefinedConversionError, ::Encoding::InvalidByteSequenceError
          # InvalidByteSequenceError is raised when the bytes are not valid in the
          # string's current encoding; treat it like any other failed attempt so the
          # chain keeps going instead of leaking the exception to the caller.
          next
        end
      end
      str # if it doesn't encode, just give back what we got
    end
  end

  private_constant :ResponseDecoding if respond_to?(:private_constant)
end