class Polars::LazyFrame

def describe(

Other tags:
    Example: Customize which percentiles are displayed, applying linear interpolation: -
    Example: Show default frame statistics: -

Other tags:
    Note: -
    Note: -
    Note: -

Returns:
  • (DataFrame) -

Parameters:
  • interpolation ('nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable') --
  • percentiles (Array) --
# Creates a summary of statistics for a LazyFrame, returning a DataFrame.
#
# The frame is collected as part of this call. The result has a leading
# `statistic` column followed by one column per column of this frame; its
# rows are, in order: `count`, `null_count`, `mean`, `std`, `min`, one row
# per requested percentile (labelled e.g. `"25%"`), and `max`.
#
# Statistics of numeric, boolean, nested and null columns are reported as
# floats; statistics of every other column are reported as strings.
# Statistics that do not apply to a column's dtype are null.
#
# @param percentiles [Array]
#   Percentiles to include in the summary. They are normalised by
#   `Utils.parse_percentiles` before use.
# @param interpolation ['nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable']
#   Interpolation method forwarded to the `quantile` expression.
#
# @return [DataFrame]
#
# @raise [TypeError] If the frame has no columns.
def describe(
  percentiles: [0.25, 0.5, 0.75],
  interpolation: "nearest"
)
  schema = collect_schema.to_h
  raise TypeError, "cannot describe a LazyFrame that has no columns" if schema.empty?

  # statistic labels, in the row order of the final frame
  quantiles = Utils.parse_percentiles(percentiles)
  metrics = ["count", "null_count", "mean", "std", "min"]
  metrics.concat(quantiles.map { |q| "%g%%" % [q * 100] })
  metrics << "max"

  # determine which columns will produce std/mean/percentile/etc
  # statistics in a single pass over the frame schema
  has_numeric_result = Set.new
  sort_cols = Set.new
  metric_exprs = []
  null = F.lit(nil)

  schema.each do |c, dtype|
    is_numeric = dtype.numeric?
    is_temporal = !is_numeric && dtype.temporal?
    # min/max are not computed for nested and these non-orderable dtypes
    skip_minmax = dtype.nested? || [Categorical, Enum, Null, Object, Unknown].include?(dtype)

    # counts
    count_exprs = [
      F.col(c).count.name.prefix("count:"),
      F.col(c).null_count.name.prefix("null_count:")
    ]

    # mean
    mean_expr = is_temporal || is_numeric || dtype == Boolean ? F.col(c).mean : null

    # standard deviation, min, max
    std_expr = is_numeric ? F.col(c).std : null
    min_expr = skip_minmax ? null : F.col(c).min
    max_expr = skip_minmax ? null : F.col(c).max

    # percentiles (temporal columns go through their physical representation
    # and are cast back to the original dtype afterwards)
    pct_exprs =
      quantiles.map do |p|
        pct_expr =
          if is_temporal
            F.col(c).to_physical.quantile(p, interpolation: interpolation).cast(dtype)
          elsif is_numeric
            F.col(c).quantile(p, interpolation: interpolation)
          else
            null
          end
        pct_expr.alias("#{p}:#{c}")
      end
    # only columns that actually get a quantile computed need sorting
    sort_cols << c if (is_numeric || is_temporal) && !quantiles.empty?

    if is_numeric || dtype.nested? || [Null, Boolean].include?(dtype)
      has_numeric_result << c
    end

    # add column expressions (in end-state 'metrics' list order)
    metric_exprs.concat(
      [
        *count_exprs,
        mean_expr.alias("mean:#{c}"),
        std_expr.alias("std:#{c}"),
        min_expr.alias("min:#{c}"),
        *pct_exprs,
        max_expr.alias("max:#{c}")
      ]
    )
  end

  # sort the columns that have quantiles computed on them
  # TODO: drop sort once we have efficient retrieval of multiple quantiles
  # NOTE: an empty Set is truthy in Ruby, so emptiness is checked explicitly
  frame = sort_cols.empty? ? self : with_columns(sort_cols.map { |c| F.col(c).sort })

  # calculate requested metrics in parallel, then collect the result
  df_metrics = frame.select(*metric_exprs).collect

  # reshape wide result: the single result row holds `metrics.length`
  # consecutive values per schema column, in schema order
  stats_row = df_metrics.row(0)
  summary = schema.keys.zip(stats_row.each_slice(metrics.length).to_a).to_h

  # cast by column type (numeric/bool -> float), (other -> string)
  schema.each_key do |c|
    numeric = has_numeric_result.include?(c)
    summary[c] =
      summary[c].map do |v|
        next nil if v.nil? || v.is_a?(Hash)
        next v.to_s unless numeric

        case v
        when true then 1.0
        when false then 0.0
        else v.to_f
        end
      end
  end

  # return results as a DataFrame
  df_summary = Polars.from_hash(summary)
  df_summary.insert_column(0, Polars::Series.new("statistic", metrics))
  df_summary
end