class Polars::LazyFrame
def describe(
- Example: Customize which percentiles are displayed, applying linear interpolation: -
Example: Show default frame statistics: -
Other tags:
- Note: -
Note: -
Note: -
Returns:
-
(DataFrame)-
Parameters:
-
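interpolation ('nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable') -- Interpolation method passed to the quantile calculation; defaults to "nearest".
percentiles (Array) -- Percentiles to include in the summary; defaults to [0.25, 0.5, 0.75].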
# Builds a table of summary statistics for every column of this LazyFrame.
#
# The frame is collected as part of this call (a single `select` of all
# metric expressions followed by `collect`). The result has one row per
# statistic ("count", "null_count", "mean", "std", "min", one row per
# requested percentile, "max") and one column per input column, preceded by
# a "statistic" column holding the row labels.
#
# @param percentiles [Array]
#   Percentiles to report. They are normalised by `Utils.parse_percentiles`;
#   each resulting quantile `q` is labelled with `"%g%%"` of `q * 100`
#   (e.g. 0.25 -> "25%").
# @param interpolation ['nearest', 'higher', 'lower', 'midpoint', 'linear', 'equiprobable']
#   Interpolation method forwarded to `Expr#quantile`.
#
# @return [DataFrame]
#
# @raise [TypeError] if the frame has no columns.
def describe(
  percentiles: [0.25, 0.5, 0.75],
  interpolation: "nearest"
)
  schema = collect_schema.to_h

  if schema.empty?
    msg = "cannot describe a LazyFrame that has no columns"
    raise TypeError, msg
  end

  # create list of metrics (row labels, in final output order)
  metrics = ["count", "null_count", "mean", "std", "min"]
  quantiles = Utils.parse_percentiles(percentiles)
  metrics.concat(quantiles.map { |q| "%g%%" % [q * 100] }) if quantiles.any?
  metrics.append("max")

  # dtypes for which min/max are reported as null
  skip_minmax = lambda do |dt|
    dt.nested? || [Categorical, Enum, Null, Object, Unknown].include?(dt)
  end

  # determine which columns will produce std/mean/percentile/etc
  # statistics in a single pass over the frame schema
  has_numeric_result = Set.new
  sort_cols = Set.new
  metric_exprs = []
  null = F.lit(nil)

  schema.each do |c, dtype|
    is_numeric = dtype.numeric?
    is_temporal = !is_numeric && dtype.temporal?

    # counts
    count_exprs = [
      F.col(c).count.name.prefix("count:"),
      F.col(c).null_count.name.prefix("null_count:")
    ]

    # mean
    mean_expr =
      if is_temporal || is_numeric || dtype == Boolean
        F.col(c).mean
      else
        null
      end

    # standard deviation, min, max
    expr_std = is_numeric ? F.col(c).std : null
    minmax_skipped = skip_minmax.call(dtype)
    min_expr = minmax_skipped ? null : F.col(c).min
    max_expr = minmax_skipped ? null : F.col(c).max

    # percentiles
    pct_exprs = quantiles.map do |p|
      pct_expr =
        if is_temporal
          # quantile is taken on the physical representation, then cast back
          sort_cols.add(c)
          F.col(c).to_physical.quantile(p, interpolation: interpolation).cast(dtype)
        elsif is_numeric
          sort_cols.add(c)
          F.col(c).quantile(p, interpolation: interpolation)
        else
          null
        end
      pct_expr.alias("#{p}:#{c}")
    end

    # columns whose summary values are cast to float rather than string
    if is_numeric || dtype.nested? || [Null, Boolean].include?(dtype)
      has_numeric_result.add(c)
    end

    # add column expressions (in end-state 'metrics' list order)
    metric_exprs.concat(
      [
        *count_exprs,
        mean_expr.alias("mean:#{c}"),
        expr_std.alias("std:#{c}"),
        min_expr.alias("min:#{c}"),
        *pct_exprs,
        max_expr.alias("max:#{c}")
      ]
    )
  end

  # Pre-sort the columns that need quantiles.
  # TODO: drop sort once we have efficient retrieval of multiple quantiles
  # NOTE: an empty Set is truthy in Ruby, so emptiness has to be tested
  # explicitly; otherwise with_columns([]) is issued even when nothing
  # needs sorting.
  source =
    if sort_cols.empty?
      self
    else
      with_columns(sort_cols.map { |c| F.col(c).sort })
    end

  # evaluate every metric expression in one select, then collect the result
  df_metrics = source.select(*metric_exprs).collect

  # reshape wide result: the single result row holds n_metrics consecutive
  # values per schema column
  n_metrics = metrics.length
  wide_row = df_metrics.row(0)
  column_metrics = schema.length.times.map do |n|
    wide_row[n * n_metrics, n_metrics]
  end
  summary = schema.keys.zip(column_metrics).to_h

  # cast by column type (numeric/bool -> float), (other -> string)
  schema.each_key do |c|
    numeric_result = has_numeric_result.include?(c)
    summary[c] = summary[c].map do |v|
      if v.nil? || v.is_a?(Hash)
        nil
      elsif !numeric_result
        "#{v}"
      elsif v == true
        1.0
      elsif v == false
        0.0
      else
        v.to_f
      end
    end
  end

  # return results as a DataFrame
  df_summary = Polars.from_hash(summary)
  df_summary.insert_column(0, Polars::Series.new("statistic", metrics))
  df_summary
end