lib/more_math/sequence.rb



require 'more_math'
require 'spruz/memoize'

module MoreMath
  # This class is used to contain elements and compute various statistical
  # values for them.
  class Sequence
    def initialize(elements)
      @elements = elements.dup.freeze
    end

    # Returns the array of elements.
    attr_reader :elements

    # Calls the +block+ for every element of this Sequence.
    def each(&block)
      @elements.each(&block)
    end
    include Enumerable

    # Returns true if this sequence is empty, otherwise false.
    def empty?
      @elements.empty?
    end

    # Returns the number of elements, on which the analysis is based.
    def size
      @elements.size
    end

    # Reset all memoized values of this sequence.
    def reset
      self.class.memoize_cache_clear
      self
    end

    # Returns the variance of the elements.
    def variance
      sum_of_squares / size
    end
    memoize_method :variance

    # Returns the sample_variance of the elements.
    def sample_variance
      size > 1 ? sum_of_squares / (size - 1.0) : 0.0
    end
    memoize_method :sample_variance

    # Returns the sum of squares (the sum of the squared deviations) of the
    # elements.
    def sum_of_squares
      @elements.inject(0.0) { |s, t| s + (t - arithmetic_mean) ** 2 }
    end
    memoize_method :sum_of_squares

    # Returns the standard deviation of the elements.
    def standard_deviation
      Math.sqrt(variance)
    end
    memoize_method :standard_deviation

    # Returns the standard deviation of the elements in percentage of the
    # arithmetic mean.
    def standard_deviation_percentage
      100.0 * standard_deviation / arithmetic_mean
    end
    memoize_method :standard_deviation_percentage

    # Returns the sample standard deviation of the elements.
    def sample_standard_deviation
      Math.sqrt(sample_variance)
    end
    memoize_method :sample_standard_deviation

    # Returns the sample standard deviation of the elements in percentage
    # of the arithmetic mean.
    def sample_standard_deviation_percentage
      100.0 * sample_standard_deviation / arithmetic_mean
    end
    memoize_method :sample_standard_deviation_percentage

    # Returns the sum of all elements.
    def sum
      @elements.inject(0.0) { |s, t| s + t }
    end
    memoize_method :sum

    # Returns the arithmetic mean of the elements.
    def arithmetic_mean
      sum / size
    end
    memoize_method :arithmetic_mean

    alias mean arithmetic_mean

    # Returns the harmonic mean of the elements. If any of the elements
    # is less than or equal to 0.0, this method returns NaN.
    def harmonic_mean
      sum = @elements.inject(0.0) { |s, t|
        if t > 0
          s + 1.0 / t
        else
          break nil
        end
      }
      sum ? size / sum : 0 / 0.0
    end
    memoize_method :harmonic_mean

    # Returns the geometric mean of the elements. If any of the
    # elements is less than 0.0, this method returns NaN.
    def geometric_mean
      sum = @elements.inject(0.0) { |s, t|
        case
        when t > 0
          s + Math.log(t)
        when t == 0
          break :null
        else
          break nil
        end
      }
      case sum
      when :null
        0.0
      when Float
        Math.exp(sum / size)
      else
        0 / 0.0
      end
    end
    memoize_method :geometric_mean

    # Returns the minimum of the elements.
    def min
      @elements.min
    end
    memoize_method :min

    # Returns the maximum of the elements.
    def max
      @elements.max
    end
    memoize_method :max

    # Return a sorted array of the elements.
    def sorted
      @elements.sort
    end
    memoize_method :sorted

    # Returns the +p+-percentile of the elements.
    # There are many methods to compute the percentile, this method uses the
    # the weighted average at x_(n + 1)p, which allows p to be in 0...100
    # (excluding the 100).
    def percentile(p = 50)
      (0...100).include?(p) or
        raise ArgumentError, "p = #{p}, but has to be in (0...100)"
      p /= 100.0
      sorted_elements = sorted
      r = p * (sorted_elements.size + 1)
      r_i = r.to_i
      r_f = r - r_i
      if r_i >= 1
        result = sorted_elements[r_i - 1]
        if r_i < sorted_elements.size
          result += r_f * (sorted_elements[r_i] - sorted_elements[r_i - 1])
        end
      else
        result = sorted_elements[0]
      end
      result
    end

    alias median percentile

    # Use an approximation of the Welch-Satterthwaite equation to compute the
    # degrees of freedom for Welch's t-test.
    def compute_welch_df(other)
      (sample_variance / size + other.sample_variance / other.size) ** 2 / (
        (sample_variance ** 2 / (size ** 2 * (size - 1))) +
        (other.sample_variance ** 2 / (other.size ** 2 * (other.size - 1))))
    end

    # Returns the t value of the Welch's t-test between this Sequence
    # instance and the +other+.
    def t_welch(other)
      signal = arithmetic_mean - other.arithmetic_mean
      noise = Math.sqrt(sample_variance / size +
        other.sample_variance / other.size)
      signal / noise
    rescue Errno::EDOM
      0.0
    end

    # Returns an estimation of the common standard deviation of the
    # elements of this and +other+.
    def common_standard_deviation(other)
      Math.sqrt(common_variance(other))
    end

    # Returns an estimation of the common variance of the elements of this
    # and +other+.
    def common_variance(other)
      (size - 1) * sample_variance + (other.size - 1) *
        other.sample_variance / (size + other.size - 2)
    end

    # Compute the # degrees of freedom for Student's t-test.
    def compute_student_df(other)
      size + other.size - 2
    end

    # Returns the t value of the Student's t-test between this Sequence
    # instance and the +other+.
    def t_student(other)
      signal = arithmetic_mean - other.arithmetic_mean
      noise = common_standard_deviation(other) *
        Math.sqrt(size ** -1 + size ** -1)
    rescue Errno::EDOM
      0.0
    end

    # Compute a sample size, that will more likely yield a mean difference
    # between this instance's elements and those of +other+. Use +alpha+
    # and +beta+ as levels for the first- and second-order errors.
    def suggested_sample_size(other, alpha = 0.05, beta = 0.05)
      alpha, beta = alpha.abs, beta.abs
      signal = arithmetic_mean - other.arithmetic_mean
      df = size + other.size - 2
      pooled_variance_estimate = (sum_of_squares + other.sum_of_squares) / df
      td = TDistribution.new df
      (((td.inverse_probability(alpha) + td.inverse_probability(beta)) *
        Math.sqrt(pooled_variance_estimate)) / signal) ** 2
    end

    # Return true, if the Sequence instance covers the +other+, that is their
    # arithmetic mean value is most likely to be equal for the +alpha+ error
    # level.
    def cover?(other, alpha = 0.05)
      t = t_welch(other)
      td = TDistribution.new(compute_welch_df(other))
      t.abs < td.inverse_probability(1 - alpha.abs / 2.0)
    end

    # Return the confidence interval for the arithmetic mean with alpha level +alpha+ of
    # the elements of this Sequence instance as a Range object.
    def confidence_interval(alpha = 0.05)
      td = TDistribution.new(size - 1)
      t = td.inverse_probability(alpha / 2).abs
      delta = t * sample_standard_deviation / Math.sqrt(size)
      (arithmetic_mean - delta)..(arithmetic_mean + delta)
    end

    # Returns the array of autovariances (of length size - 1).
    def autovariance
      Array.new(size - 1) do |k|
        s = 0.0
        0.upto(size - k - 1) do |i|
          s += (@elements[i] - arithmetic_mean) * (@elements[i + k] - arithmetic_mean)
        end
        s / size
      end
    end

    # Returns the array of autocorrelation values c_k / c_0 (of length size -
    # 1).
    def autocorrelation
      c = autovariance
      Array.new(c.size) { |k| c[k] / c[0] }
    end

    # Returns the d-value for the Durbin-Watson statistic. The value is d << 2
    # for positive, d >> 2 for negative and d around 2 for no autocorrelation.
    def durbin_watson_statistic
      e = linear_regression.residues
      e.size <= 1 and return 2.0
      (1...e.size).inject(0.0) { |s, i| s + (e[i] - e[i - 1]) ** 2 } /
        e.inject(0.0) { |s, x| s + x ** 2 }
    end

    # Returns the q value of the Ljung-Box statistic for the number of lags
    # +lags+. A higher value might indicate autocorrelation in the elements of
    # this Sequence instance. This method returns nil if there weren't enough
    # (at least lags) lags available.
    def ljung_box_statistic(lags = 20)
      r = autocorrelation
      lags >= r.size and return
      n = size
      n * (n + 2) * (1..lags).inject(0.0) { |s, i| s + r[i] ** 2 / (n - i) }
    end

    # This method tries to detect autocorrelation with the Ljung-Box
    # statistic. If enough lags can be considered it returns a hash with
    # results, otherwise nil is returned. The keys are
    # :lags:: the number of lags,
    # :alpha_level:: the alpha level for the test,
    # :q:: the value of the ljung_box_statistic,
    # :p:: the p-value computed, if p is higher than alpha no correlation was detected,
    # :detected:: true if a correlation was found.
    def detect_autocorrelation(lags = 20, alpha_level = 0.05)
      if q = ljung_box_statistic(lags)
        p = ChiSquareDistribution.new(lags).probability(q)
        return {
          :lags         => lags,
          :alpha_level  => alpha_level,
          :q            => q,
          :p            => p,
          :detected     => p >= 1 - alpha_level,
        }
      end
    end

    # Return a result hash with the number of :very_low, :low, :high, and
    # :very_high outliers, determined by the box plotting algorithm run with
    # :median and :iqr parameters. If no outliers were found or the iqr is
    # less than epsilon, nil is returned.
    def detect_outliers(factor = 3.0, epsilon = 1E-5)
      half_factor = factor / 2.0
      quartile1 = percentile(25)
      quartile3 = percentile(75)
      iqr = quartile3 - quartile1
      iqr < epsilon and return
      result = @elements.inject(Hash.new(0)) do |h, t|
        extreme =
          case t
          when -Infinity..(quartile1 - factor * iqr)
            :very_low
          when (quartile1 - factor * iqr)..(quartile1 - half_factor * iqr)
            :low
          when (quartile1 + half_factor * iqr)..(quartile3 + factor * iqr)
            :high
          when (quartile3 + factor * iqr)..Infinity
            :very_high
          end and h[extreme] += 1
        h
      end
      unless result.empty?
        result[:median] = median
        result[:iqr] = iqr
        result[:factor] = factor
        result
      end
    end

    # Returns the LinearRegression object for the equation a * x + b which
    # represents the line computed by the linear regression algorithm.
    def linear_regression
      LinearRegression.new @elements
    end
    memoize_method :linear_regression

    # Returns a Histogram instance with +bins+ as the number of bins for this
    # analysis' elements.
    def histogram(bins)
      Histogram.new(self, bins)
    end
  end
end