lib/sprockets/encoding_utils.rb



require 'base64'
require 'stringio'
require 'zlib'

module Sprockets
  # Internal: HTTP transport encoding and charset detecting related functions.
  # Mixed into Environment.
  module EncodingUtils
    extend self

    ## Binary encodings ##

    # Public: Use deflate to compress data.
    #
    # str - String data
    #
    # Returns a compressed String
    def deflate(str)
      deflater = Zlib::Deflate.new(
        Zlib::BEST_COMPRESSION,
        -Zlib::MAX_WBITS,
        Zlib::MAX_MEM_LEVEL,
        Zlib::DEFAULT_STRATEGY
      )
      deflater << str
      deflater.finish
    end

    # Internal: Unmarshal optionally deflated data.
    #
    # Checks leading marshal header to see if the bytes are uncompressed
    # otherwise inflate the data an unmarshal.
    #
    # str - Marshaled String
    # window_bits - Integer deflate window size. See ZLib::Inflate.new()
    #
    # Returns unmarshaled Object or raises an Exception.
    def unmarshaled_deflated(str, window_bits = -Zlib::MAX_WBITS)
      major, minor = str[0], str[1]
      if major && major.ord == Marshal::MAJOR_VERSION &&
          minor && minor.ord <= Marshal::MINOR_VERSION
        marshaled = str
      else
        begin
          marshaled = Zlib::Inflate.new(window_bits).inflate(str)
        rescue Zlib::DataError
          marshaled = str
        end
      end
      Marshal.load(marshaled)
    end

    # Public: Use gzip to compress data.
    #
    # str - String data
    #
    # Returns a compressed String
    def gzip(str)
      io = StringIO.new
      gz = Zlib::GzipWriter.new(io, Zlib::BEST_COMPRESSION)
      gz.mtime = 1
      gz << str
      gz.finish
      io.string
    end

    # Public: Use base64 to encode data.
    #
    # str - String data
    #
    # Returns a encoded String
    def base64(str)
      Base64.strict_encode64(str)
    end


    ## Charset encodings ##

    # Internal: Shorthand aliases for detecter functions.
    CHARSET_DETECT = {}

    # Internal: Mapping unicode encodings to byte order markers.
    BOM = {
      Encoding::UTF_32LE => [0xFF, 0xFE, 0x00, 0x00],
      Encoding::UTF_32BE => [0x00, 0x00, 0xFE, 0xFF],
      Encoding::UTF_8    => [0xEF, 0xBB, 0xBF],
      Encoding::UTF_16LE => [0xFF, 0xFE],
      Encoding::UTF_16BE => [0xFE, 0xFF]
    }

    # Public: Basic string detecter.
    #
    # Attempts to parse any Unicode BOM otherwise falls back to the
    # environment's external encoding.
    #
    # str - ASCII-8BIT encoded String
    #
    # Returns encoded String.
    def detect(str)
      str = detect_unicode_bom(str)

      # Attempt Charlock detection
      if str.encoding == Encoding::BINARY
        charlock_detect(str)
      end

      # Fallback to environment's external encoding
      if str.encoding == Encoding::BINARY
        str.force_encoding(Encoding.default_external)
      end

      str
    end
    CHARSET_DETECT[:default] = method(:detect)

    # Internal: Use Charlock Holmes to detect encoding.
    #
    # To enable this code path, require 'charlock_holmes'
    #
    # Returns encoded String.
    def charlock_detect(str)
      if defined? CharlockHolmes::EncodingDetector
        if detected = CharlockHolmes::EncodingDetector.detect(str)
          str.force_encoding(detected[:encoding]) if detected[:encoding]
        end
      end

      str
    end

    # Public: Detect Unicode string.
    #
    # Attempts to parse Unicode BOM and falls back to UTF-8.
    #
    # str - ASCII-8BIT encoded String
    #
    # Returns encoded String.
    def detect_unicode(str)
      str = detect_unicode_bom(str)

      # Fallback to UTF-8
      if str.encoding == Encoding::BINARY
        str.force_encoding(Encoding::UTF_8)
      end

      str
    end
    CHARSET_DETECT[:unicode] = method(:detect_unicode)

    # Public: Detect and strip BOM from possible unicode string.
    #
    # str - ASCII-8BIT encoded String
    #
    # Returns UTF 8/16/32 encoded String without BOM or the original String if
    # no BOM was present.
    def detect_unicode_bom(str)
      bom_bytes = str.byteslice(0, 4).bytes.to_a

      BOM.each do |encoding, bytes|
        if bom_bytes[0, bytes.size] == bytes
          str = str.dup
          str.force_encoding(Encoding::BINARY)
          str.slice!(0, bytes.size)
          str.force_encoding(encoding)
          return str
        end
      end

      return str
    end

    # Public: Detect and strip @charset from CSS style sheet.
    #
    # str - String.
    #
    # Returns a encoded String.
    def detect_css(str)
      str = detect_unicode_bom(str)

      if name = scan_css_charset(str)
        encoding = Encoding.find(name)
        str = str.dup
        str.force_encoding(encoding)
        len = "@charset \"#{name}\";".encode(encoding).size
        str.slice!(0, len)
        str
      end

      # Fallback to UTF-8
      if str.encoding == Encoding::BINARY
        str.force_encoding(Encoding::UTF_8)
      end

      str
    end
    CHARSET_DETECT[:css] = method(:detect_css)

    # Internal: @charset bytes
    CHARSET_START = [0x40, 0x63, 0x68, 0x61, 0x72, 0x73, 0x65, 0x74, 0x20, 0x22]
    CHARSET_SIZE  = CHARSET_START.size

    # Internal: Scan binary CSS string for @charset encoding name.
    #
    # str - ASCII-8BIT encoded String
    #
    # Returns encoding String name or nil.
    def scan_css_charset(str)
      buf = []
      i = 0

      str.each_byte.each do |byte|
        # Halt on line breaks
        break if byte == 0x0A || byte == 0x0D

        # Only ascii bytes
        next unless 0x0 < byte && byte <= 0xFF

        if i < CHARSET_SIZE
        elsif i == CHARSET_SIZE
          if buf == CHARSET_START
            buf = []
          else
            break
          end
        elsif byte == 0x22
          return buf.pack('C*')
        end

        buf << byte
        i += 1
      end

      nil
    end

    # Public: Detect charset from HTML document.
    #
    # Attempts to parse any Unicode BOM otherwise attempt Charlock detection
    # and finally falls back to the environment's external encoding.
    #
    # str - String.
    #
    # Returns a encoded String.
    def detect_html(str)
      str = detect_unicode_bom(str)

      # Attempt Charlock detection
      if str.encoding == Encoding::BINARY
        charlock_detect(str)
      end

      # Fallback to environment's external encoding
      if str.encoding == Encoding::BINARY
        str.force_encoding(Encoding.default_external)
      end

      str
    end
    CHARSET_DETECT[:html] = method(:detect_html)
  end
end