# lib/rack/multipart/parser.rb



# frozen_string_literal: true

require 'strscan'

require_relative '../utils'

module Rack
  module Multipart
    # Raised by Parser::Collector when the number of file parts in a
    # request reaches Utils.multipart_file_limit.
    class MultipartPartLimitError < Errno::EMFILE; end

    # Raised by Parser::Collector when the total number of parts in a
    # request reaches Utils.multipart_total_part_limit.
    class MultipartTotalPartLimitError < StandardError; end

    # Use specific error class when parsing multipart request
    # that ends early.
    class EmptyContentError < ::EOFError; end

    # Base class for multipart exceptions that do not subclass from
    # other exception classes for backwards compatibility.
    class Error < StandardError; end

    # Line terminator used between part header lines and around boundaries.
    EOL = "\r\n"
    # Matches a multipart content type; capture 1 is the boundary value
    # (without surrounding double quotes).
    MULTIPART = %r|\Amultipart/.*boundary=\"?([^\";,]+)\"?|ni
    # A run of characters that are neither whitespace nor one of the
    # listed separator characters.
    TOKEN = /[^\s()<>,;:\\"\/\[\]?=]+/
    # The Content-Disposition header name followed by one TOKEN
    # (the disposition type).
    CONDISP = /Content-Disposition:\s*#{TOKEN}\s*/i
    # A parameter value: a double-quoted string (which may contain \")
    # or a bare TOKEN.
    VALUE = /"(?:\\"|[^"])*"|#{TOKEN}/
    # Looser Content-Disposition match, tried by Parser#get_filename when
    # RFC2183 does not match; capture 1 is the filename value.
    BROKEN = /^#{CONDISP}.*;\s*filename=(#{VALUE})/i
    # Capture 1 is the value of a Content-Type header line.
    MULTIPART_CONTENT_TYPE = /Content-Type: (.*)#{EOL}/ni
    # Capture 1 is the (possibly quoted) name parameter of a
    # Content-Disposition header.
    MULTIPART_CONTENT_DISPOSITION = /Content-Disposition:[^:]*;\s*name=(#{VALUE})/ni
    # Capture 1 is the value of a Content-ID header, up to the end of the line.
    MULTIPART_CONTENT_ID = /Content-ID:\s*([^#{EOL}]*)/ni
    # Updated definitions from RFC 2231
    ATTRIBUTE_CHAR = %r{[^ \x00-\x1f\x7f)(><@,;:\\"/\[\]?='*%]}
    ATTRIBUTE = /#{ATTRIBUTE_CHAR}+/
    # Continuation marker of a split parameter, e.g. "*0" or "*12".
    SECTION = /\*[0-9]+/
    REGULAR_PARAMETER_NAME = /#{ATTRIBUTE}#{SECTION}?/
    # name=value; capture 1 is the name, capture 2 the value.
    REGULAR_PARAMETER = /(#{REGULAR_PARAMETER_NAME})=(#{VALUE})/
    # Name of a non-initial extended section, e.g. "filename*1*".
    EXTENDED_OTHER_NAME = /#{ATTRIBUTE}\*[1-9][0-9]*\*/
    # One percent-encoded octet or one plain attribute character.
    EXTENDED_OTHER_VALUE = /%[0-9a-fA-F]{2}|#{ATTRIBUTE_CHAR}/
    EXTENDED_OTHER_PARAMETER = /(#{EXTENDED_OTHER_NAME})=(#{EXTENDED_OTHER_VALUE}*)/
    # Name of an initial extended section, e.g. "filename*" or "filename*0*".
    EXTENDED_INITIAL_NAME = /#{ATTRIBUTE}(?:\*0)?\*/
    # Two single-quote-terminated fields followed by the encoded value; Parser#get_filename
    # treats the first field as the encoding name.
    EXTENDED_INITIAL_VALUE = /[a-zA-Z0-9\-]*'[a-zA-Z0-9\-]*'#{EXTENDED_OTHER_VALUE}*/
    EXTENDED_INITIAL_PARAMETER = /(#{EXTENDED_INITIAL_NAME})=(#{EXTENDED_INITIAL_VALUE})/
    EXTENDED_PARAMETER = /#{EXTENDED_INITIAL_PARAMETER}|#{EXTENDED_OTHER_PARAMETER}/
    # One ";"-prefixed disposition parameter, regular or extended.
    DISPPARM = /;\s*(?:#{REGULAR_PARAMETER}|#{EXTENDED_PARAMETER})\s*/
    # A whole Content-Disposition line: CONDISP followed by one or more DISPPARMs.
    RFC2183 = /^#{CONDISP}(#{DISPPARM})+$/i

    class Parser
      # Read buffer size in bytes (1 MiB).
      # NOTE(review): presumably the default bufsize handed to .parse -- confirm against callers.
      BUFSIZE = 1_048_576
      # Content type assumed for parts that do not declare one.
      TEXT_PLAIN = "text/plain"
      # Creates the Tempfile backing an uploaded file. The temp file keeps the
      # extension of the uploaded filename; NUL bytes in the name are replaced
      # with "%00" before the extension is extracted. content_type is unused.
      TEMPFILE_FACTORY = ->(filename, content_type) do
        sanitized = filename.gsub("\0", '%00')
        Tempfile.new(["RackMultipart", ::File.extname(sanitized)])
      end

      # Read-only wrapper that hands out at most content_length bytes of the
      # wrapped io.
      class BoundedIO # :nodoc:
        def initialize(io, content_length)
          @io = io
          @content_length = content_length
          @cursor = 0
        end

        # Reads up to size bytes, never going past content_length in total.
        # Returns nil once content_length bytes have been handed out and
        # raises EOFError when the wrapped io runs dry before that.
        def read(size, outbuf = nil)
          remaining = @content_length - @cursor
          return if remaining <= 0

          chunk = @io.read([remaining, size].min, outbuf)

          # Raise an error for mismatching content-length and actual contents
          raise EOFError, "bad content body" unless chunk

          @cursor += chunk.bytesize
          chunk
        end
      end

      # Parse result: params holds the parsed parameters, tmp_files the
      # bodies of the file parts (see #result).
      MultipartInfo = Struct.new :params, :tmp_files
      # Returned by .parse when there is nothing to parse (zero content
      # length, or no boundary in the content type).
      EMPTY         = MultipartInfo.new(nil, [])

      # Extracts the boundary from a multipart Content-Type value.
      # Returns nil when content_type is missing or is not matched by MULTIPART.
      def self.parse_boundary(content_type)
        matched = content_type && content_type.match(MULTIPART)
        matched[1] if matched
      end

      # Parses the multipart body available from io and returns a MultipartInfo.
      #
      # Returns EMPTY when content_length is zero or content_type carries no
      # boundary. Raises Error when the boundary is longer than 70 characters.
      # When content_length is given, reads from io are capped to that many
      # bytes through BoundedIO.
      def self.parse(io, content_length, content_type, tmpfile, bufsize, qp)
        return EMPTY if 0 == content_length

        boundary = parse_boundary(content_type)
        return EMPTY unless boundary

        # RFC 1521 Section 7.2.1 imposes a 70 character maximum for the boundary.
        # Most clients use no more than 55 characters.
        boundary_size = boundary.length
        if boundary_size > 70
          raise Error, "multipart boundary size too large (#{boundary_size} characters)"
        end

        source = content_length ? BoundedIO.new(io, content_length) : io

        new(boundary, tmpfile, bufsize, qp).tap { |parser| parser.parse(source) }.result
      end

      # Accumulates the MIME parts reported by the parser, in order, and
      # enforces the part limits configured on Utils.
      class Collector
        # One part of the multipart body.
        class MimePart < Struct.new(:body, :head, :filename, :content_type, :name)
          # Yields the value to store for this part and returns the block's
          # result: the body itself for a part without a filename, or a hash
          # describing the upload for a file part. Yields nothing (and returns
          # nil) when the filename is blank.
          def get_data
            # filename is blank which means no file has been selected
            return if filename == ""

            # No filename at all: a plain value, handed over as-is.
            return yield(body) unless filename

            body.rewind if body.respond_to?(:rewind)

            # Take the basename of the upload's original filename.
            # This handles the full Windows paths given by Internet Explorer
            # (and perhaps other broken user agents) without affecting
            # those which give the lone filename.
            basename = filename.split(/[\/\\]/).last

            yield({ filename: basename, type: content_type,
                    name: name, tempfile: body, head: head })
          end
        end

        # Part whose body is kept in an in-memory String.
        class BufferPart < MimePart
          def file?
            false
          end

          def close
          end
        end

        # Part whose body is the object produced by the tempfile factory.
        class TempfilePart < MimePart
          def file?
            true
          end

          def close
            body.close
          end
        end

        include Enumerable

        # tempfile is a callable taking (filename, content_type) and returning
        # the object that will receive a file part's body.
        def initialize(tempfile)
          @tempfile = tempfile
          @mime_parts = []
          @open_files = 0
        end

        def each
          @mime_parts.each do |mime_part|
            yield mime_part
          end
        end

        # Registers the part at mime_index. A part with a filename gets a body
        # from the tempfile factory; any other part gets a fresh String.
        # Raises when a configured part limit is reached.
        def on_mime_head(mime_index, head, filename, content_type, name)
          part =
            if filename
              file = @tempfile.call(filename, content_type)
              file.binmode if file.respond_to?(:binmode)
              @open_files += 1
              TempfilePart.new(file, head, filename, content_type, name)
            else
              BufferPart.new(String.new, head, filename, content_type, name)
            end

          @mime_parts[mime_index] = part

          check_part_limits
        end

        # Appends content to the body of the part at mime_index.
        def on_mime_body(mime_index, content)
          @mime_parts[mime_index].body << content
        end

        def on_mime_finish(mime_index)
        end

        private

        # Closes every part collected so far and raises once the number of
        # file parts or the total number of parts reaches its limit. A limit
        # that is nil or not positive is not enforced.
        def check_part_limits
          max_files = Utils.multipart_file_limit
          max_parts = Utils.multipart_total_part_limit

          if max_files && max_files > 0 && @open_files >= max_files
            @mime_parts.each(&:close)
            raise MultipartPartLimitError, 'Maximum file multiparts in content reached'
          end

          if max_parts && max_parts > 0 && @mime_parts.size >= max_parts
            @mime_parts.each(&:close)
            raise MultipartTotalPartLimitError, 'Maximum total multiparts in content reached'
          end
        end
      end

      # Current parser state: :FAST_FORWARD, :CONSUME_TOKEN, :MIME_HEAD,
      # :MIME_BODY or :DONE.
      attr_reader :state

      # Sets up a parser for one multipart body.
      #
      # boundary     - boundary string separating the parts.
      # tempfile     - callable handed to the Collector to create file bodies.
      # bufsize      - number of bytes requested from the io per read.
      # query_parser - supplies make_params and normalize_params.
      def initialize(boundary, tempfile, bufsize, query_parser)
        @bufsize      = bufsize
        @query_parser = query_parser
        @params       = query_parser.make_params
        @collector    = Collector.new(tempfile)

        @mime_index = 0
        @state      = :FAST_FORWARD

        quoted_boundary = Regexp.quote(boundary)
        @sbuf        = StringScanner.new("".dup)
        @head_regex  = /(.*?#{EOL})#{EOL}/m
        @body_regex  = /(?:#{EOL}|\A)--#{quoted_boundary}(?:#{EOL}|--)/m
        # (\r\n-- at start, either \r\n or -- at finish)
        @rx_max_size = boundary.bytesize + 6
      end

      # Drives the state machine until it reaches :DONE, reading more data
      # from io whenever a state handler answers :want_read. The same output
      # buffer is reused for every read.
      def parse(io)
        chunk = String.new
        read_data(io, chunk)

        loop do
          handler =
            case @state
            when :FAST_FORWARD  then :handle_fast_forward
            when :CONSUME_TOKEN then :handle_consume_token
            when :MIME_HEAD     then :handle_mime_head
            when :MIME_BODY     then :handle_mime_body
            end

          # No handler: the state is :DONE (or anything else), so stop.
          return unless handler

          read_data(io, chunk) if send(handler) == :want_read
        end
      end

      # Folds every collected part into the params (tagging the encoding of
      # each value first) and returns a MultipartInfo holding the params hash
      # and the bodies of all file parts.
      def result
        @collector.each do |mime_part|
          mime_part.get_data do |value|
            field = mime_part.name
            tag_multipart_encoding(mime_part.filename, mime_part.content_type, field, value)
            @query_parser.normalize_params(@params, field, value)
          end
        end

        params_hash = @params.to_params_hash
        file_bodies = @collector.select(&:file?).map(&:body)
        MultipartInfo.new(params_hash, file_bodies)
      end

      private

      # Strips one pair of surrounding double quotes (when present) and
      # removes backslash escapes. Returns a new string; str is not modified.
      def dequote(str) # From WEBrick::HTTPUtils
        unquoted = str =~ /\A"(.*)"\Z/ ? Regexp.last_match(1) : str.dup
        unquoted.gsub(/\\(.)/, '\1')
      end

      # Reads the next chunk (up to @bufsize bytes) from io and appends it to
      # the scan buffer. handle_empty_content! rejects a nil or empty read.
      def read_data(io, outbuf)
        chunk = io.read(@bufsize, outbuf)
        handle_empty_content!(chunk)
        @sbuf << chunk
      end

      # This handles the initial parser state.  We read until we find the starting
      # boundary, then we can transition to the next state. If we find the ending
      # boundary, this is an invalid multipart upload, but keep scanning for opening
      # boundary in that case. If no boundary found, we need to keep reading data
      # and retry. It's highly unlikely the initial read will not consume the
      # boundary.  The client would have to deliberately craft a request
      # with the opening boundary beyond the buffer size for that to happen.
      def handle_fast_forward
        loop do
          found = consume_boundary

          # invalid multipart upload, but retry for opening boundary
          next if found == :END_BOUNDARY

          # no boundary found, keep reading data
          return :want_read unless found == :BOUNDARY

          # found opening boundary, transition to next state
          @state = :MIME_HEAD
          return
        end
      end

      # Consumes the boundary that follows a part body and picks the next
      # state: :DONE after the end boundary, or when the buffer is exhausted
      # without a regular boundary having been found; :MIME_HEAD otherwise.
      # Returns the new state.
      def handle_consume_token
        tok = consume_boundary
        # break if we're at the end of a buffer, but not if it is the end of a field
        finished = tok == :END_BOUNDARY || (@sbuf.eos? && tok != :BOUNDARY)
        @state = finished ? :DONE : :MIME_HEAD
      end

      # Parses the header block of the next part once it is completely
      # buffered, reports it to the collector and moves on to :MIME_BODY.
      # Returns :want_read while the blank line ending the headers has not
      # arrived yet.
      def handle_mime_head
        return :want_read unless @sbuf.scan_until(@head_regex)

        head = @sbuf[1]
        content_type = head[MULTIPART_CONTENT_TYPE, 1]

        # Prefer the Content-Disposition name; fall back to the Content-ID.
        disposition_name = head[MULTIPART_CONTENT_DISPOSITION, 1]
        name = disposition_name ? dequote(disposition_name) : head[MULTIPART_CONTENT_ID, 1]

        filename = get_filename(head)

        # Still unnamed: use the filename, or else "<content type>[]".
        if name.nil? || name.empty?
          name = filename || "#{content_type || TEXT_PLAIN}[]".dup
        end

        @collector.on_mime_head @mime_index, head, filename, content_type, name
        @state = :MIME_BODY
      end

      # Collects the body of the current part.
      #
      # When the boundary ending the part is already buffered, the remaining
      # body is passed to the collector, the scan pointer is moved to the
      # start of the boundary and the state becomes :CONSUME_TOKEN. Otherwise
      # everything except the last @rx_max_size bytes (which might hold the
      # beginning of a boundary) is flushed to the collector and :want_read
      # is returned.
      def handle_mime_body
        if (body_with_boundary = @sbuf.check_until(@body_regex)) # check but do not advance the pointer yet
          # Compile the end-anchored variant once instead of on every part.
          @body_regex_at_end ||= /#{@body_regex}\z/m
          body = body_with_boundary.sub(@body_regex_at_end, '') # remove the boundary from the string
          @collector.on_mime_body @mime_index, body
          # StringScanner#pos is a byte offset, so advance by bytes (not
          # characters), plus 2 to skip \r\n after the content.
          @sbuf.pos += body.bytesize + 2
          @state = :CONSUME_TOKEN
          @mime_index += 1
        else
          # Save what we have so far
          if @rx_max_size < @sbuf.rest_size
            delta = @sbuf.rest_size - @rx_max_size
            @collector.on_mime_body @mime_index, @sbuf.peek(delta)
            @sbuf.pos += delta
            @sbuf.string = @sbuf.rest
          end
          :want_read
        end
      end

      # Scan until we find the start or end of the boundary.
      # If we find it, return the appropriate symbol for the start or
      # end of the boundary.  If we don't find the start or end of the
      # boundary, clear the buffer and return nil.
      def consume_boundary
        scanned = @sbuf.scan_until(@body_regex)

        unless scanned
          # Nothing boundary-like in the buffer: discard what is left of it.
          @sbuf.terminate
          return
        end

        # A boundary followed by a line break opens a part; otherwise the
        # match ended in "--", which closes the body.
        scanned.end_with?(EOL) ? :BOUNDARY : :END_BOUNDARY
      end

      # Extracts the uploaded file's name from a part's header block.
      #
      # A well-formed Content-Disposition line (RFC2183) is split into its
      # parameters, preferring the extended "filename*" parameter over
      # "filename". Headers that only match BROKEN fall back to the raw
      # filename value. Returns nil when no filename is present.
      #
      # The name is percent-decoded when every "%" in it starts a valid
      # two-digit hex escape, invalid bytes are scrubbed, and backslash
      # escapes are removed unless a backslash precedes a character other
      # than a backslash or double quote. For "filename*", the result is
      # tagged with the declared encoding; a charset Ruby does not know
      # falls back to binary instead of raising.
      def get_filename(head)
        filename = nil
        case head
        when RFC2183
          params = Hash[*head.scan(DISPPARM).flat_map(&:compact)]

          if filename = params['filename*']
            encoding, _, filename = filename.split("'", 3)
          elsif filename = params['filename']
            filename = $1 if filename =~ /^"(.*)"$/
          end
        when BROKEN
          filename = $1
          filename = $1 if filename =~ /^"(.*)"$/
        end

        return unless filename

        if filename.scan(/%.?.?/).all? { |s| /%[0-9a-fA-F]{2}/.match?(s) }
          filename = Utils.unescape_path(filename)
        end

        filename.scrub!

        if filename !~ /\\[^\\"]/
          filename = filename.gsub(/\\(.)/, '\1')
        end

        if encoding
          # The charset comes straight from the client; an unknown or empty
          # name must not abort the whole request.
          target_encoding =
            begin
              ::Encoding.find(encoding)
            rescue ArgumentError
              ::Encoding::BINARY
            end
          filename.force_encoding target_encoding
        end

        filename
      end

      # Marked deprecated: referencing it triggers Ruby's deprecated-constant
      # warning. tag_multipart_encoding compares against a string literal
      # and does not use this constant.
      CHARSET = "charset"
      deprecate_constant :CHARSET

      # Tags the name and the body of a part with their encoding, in place.
      #
      # The name is always tagged as UTF-8 first. File parts (filename set)
      # are left at that and nil is returned. For other parts the body and
      # the name are tagged with the charset declared on a text/plain
      # content type, UTF-8 when none is declared, or binary when the
      # declared charset is unknown to Ruby.
      #
      # Malformed content types (empty value, empty parameters, parameters
      # without "=") are tolerated: the offending pieces are ignored.
      def tag_multipart_encoding(filename, content_type, name, body)
        name = name.to_s
        encoding = Encoding::UTF_8

        name.force_encoding(encoding)

        return if filename

        if content_type
          # An empty content type splits into nothing, leaving type_subtype nil.
          type_subtype, *params = content_type.split(';')

          if type_subtype && TEXT_PLAIN == type_subtype.strip
            params.each do |param|
              k, v = param.split('=', 2)
              # Skip empty parameters (";;") and parameters without a value.
              next unless k && v

              k.strip!
              v.strip!
              v = v[1..-2] if v.start_with?('"') && v.end_with?('"')
              if k == "charset"
                encoding = begin
                  Encoding.find v
                rescue ArgumentError
                  Encoding::BINARY
                end
              end
            end
          end
        end

        name.force_encoding(encoding)
        body.force_encoding(encoding)
      end

      # Raises EmptyContentError when a read produced nothing (nil or an
      # empty string), i.e. the input ended before the parser was done.
      def handle_empty_content!(content)
        raise EmptyContentError if content.nil? || content.empty?
      end
    end
  end
end