module Polars
module Functions
# Select a field in the current `struct.with_fields` scope.
#
# @param name [Object]
# Name of the field(s) to select.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new({"a" => [{"x" => 5, "y" => 2}, {"x" => 3, "y" => 4}]})
# df.select(Polars.col("a").struct.with_fields(Polars.field("x") ** 2))
# # =>
# # shape: (2, 1)
# # ┌───────────┐
# # │ a │
# # │ --- │
# # │ struct[2] │
# # ╞═══════════╡
# # │ {25,2} │
# # │ {9,4} │
# # └───────────┘
def field(name)
if name.is_a?(::String)
name = [name]
end
Utils.wrap_expr(Plr.field(name))
end
# Alias for an element in evaluated in an `eval` expression.
#
# @return [Expr]
#
# @example A horizontal rank computation by taking the elements of a list
# df = Polars::DataFrame.new({"a" => [1, 8, 3], "b" => [4, 5, 2]})
# df.with_columns(
# Polars.concat_list(["a", "b"]).list.eval(Polars.element.rank).alias("rank")
# )
# # =>
# # shape: (3, 3)
# # ┌─────┬─────┬────────────┐
# # │ a ┆ b ┆ rank │
# # │ --- ┆ --- ┆ --- │
# # │ i64 ┆ i64 ┆ list[f64] │
# # ╞═════╪═════╪════════════╡
# # │ 1 ┆ 4 ┆ [1.0, 2.0] │
# # │ 8 ┆ 5 ┆ [2.0, 1.0] │
# # │ 3 ┆ 2 ┆ [2.0, 1.0] │
# # └─────┴─────┴────────────┘
def element
Utils.wrap_expr(Plr.element)
end
# Return the number of non-null values in the column.
#
# This function is syntactic sugar for `col(columns).count`.
#
# Calling this function without any arguments returns the number of rows in the
# context. **This way of using the function is deprecated.** Please use `len`
# instead.
#
# @param columns [Array]
# One or more column names.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 2, nil],
# "b" => [3, nil, nil],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.count("a"))
# # =>
# # shape: (1, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ u32 │
# # ╞═════╡
# # │ 2 │
# # └─────┘
#
# @example Return the number of non-null values in multiple columns.
# df.select(Polars.count("b", "c"))
# # =>
# # shape: (1, 2)
# # ┌─────┬─────┐
# # │ b ┆ c │
# # │ --- ┆ --- │
# # │ u32 ┆ u32 │
# # ╞═════╪═════╡
# # │ 1 ┆ 3 │
# # └─────┴─────┘
def count(*columns)
if columns.empty?
warn "`Polars.count` is deprecated. Use `Polars.length` instead."
return Utils.wrap_expr(Plr.len.alias("count"))
end
col(*columns).count
end
# Return the cumulative count of the non-null values in the column.
#
# This function is syntactic sugar for `col(columns).cum_count`.
#
# If no arguments are passed, returns the cumulative count of a context.
# Rows containing null values count towards the result.
#
# @param columns [Array]
# Name(s) of the columns to use.
# @param reverse [Boolean]
# Reverse the operation.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new({"a" => [1, 2, nil], "b" => [3, nil, nil]})
# df.select(Polars.cum_count("a"))
# # =>
# # shape: (3, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ u32 │
# # ╞═════╡
# # │ 1 │
# # │ 2 │
# # │ 2 │
# # └─────┘
def cum_count(*columns, reverse: false)
col(*columns).cum_count(reverse: reverse)
end
# Aggregate all column values into a list.
#
# This function is syntactic sugar for `col(name).implode`.
#
# @param columns [Array]
# One or more column names.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 2, 3],
# "b" => [9, 8, 7],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.implode("a"))
# # =>
# # shape: (1, 1)
# # ┌───────────┐
# # │ a │
# # │ --- │
# # │ list[i64] │
# # ╞═══════════╡
# # │ [1, 2, 3] │
# # └───────────┘
#
# @example
# df.select(Polars.implode("b", "c"))
# # =>
# # shape: (1, 2)
# # ┌───────────┬───────────────────────┐
# # │ b ┆ c │
# # │ --- ┆ --- │
# # │ list[i64] ┆ list[str] │
# # ╞═══════════╪═══════════════════════╡
# # │ [9, 8, 7] ┆ ["foo", "bar", "foo"] │
# # └───────────┴───────────────────────┘
def implode(*columns)
col(*columns).implode
end
# Get the standard deviation.
#
# This function is syntactic sugar for `col(column).std(ddof: ddof)`.
#
# @param column [Object]
# Column name.
# @param ddof [Integer]
# “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
# where N represents the number of elements.
# By default ddof is 1.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.std("a"))
# # =>
# # shape: (1, 1)
# # ┌──────────┐
# # │ a │
# # │ --- │
# # │ f64 │
# # ╞══════════╡
# # │ 3.605551 │
# # └──────────┘
#
# @example
# df["a"].std
# # => 3.605551275463989
def std(column, ddof: 1)
col(column).std(ddof: ddof)
end
# Get the variance.
#
# This function is syntactic sugar for `col(column).var(ddof: ddof)`.
#
# @param column [Object]
# Column name.
# @param ddof [Integer]
# “Delta Degrees of Freedom”: the divisor used in the calculation is N - ddof,
# where N represents the number of elements.
# By default ddof is 1.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.var("a"))
# # =>
# # shape: (1, 1)
# # ┌──────┐
# # │ a │
# # │ --- │
# # │ f64 │
# # ╞══════╡
# # │ 13.0 │
# # └──────┘
#
# @example
# df["a"].var
# # => 13.0
def var(column, ddof: 1)
col(column).var(ddof: ddof)
end
# Get the mean value.
#
# This function is syntactic sugar for `col(columns).mean`.
#
# @param columns [Array]
# One or more column names.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.mean("a"))
# # =>
# # shape: (1, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ f64 │
# # ╞═════╡
# # │ 4.0 │
# # └─────┘
#
# @example
# df.select(Polars.mean("a", "b"))
# # =>
# # shape: (1, 2)
# # ┌─────┬──────────┐
# # │ a ┆ b │
# # │ --- ┆ --- │
# # │ f64 ┆ f64 │
# # ╞═════╪══════════╡
# # │ 4.0 ┆ 3.666667 │
# # └─────┴──────────┘
def mean(*columns)
col(*columns).mean
end
# Get the median value.
#
# This function is syntactic sugar for `pl.col(columns).median`.
#
# @param columns [Array]
# One or more column names.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.median("a"))
# # =>
# # shape: (1, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ f64 │
# # ╞═════╡
# # │ 3.0 │
# # └─────┘
#
# @example
# df.select(Polars.median("a", "b"))
# # =>
# # shape: (1, 2)
# # ┌─────┬─────┐
# # │ a ┆ b │
# # │ --- ┆ --- │
# # │ f64 ┆ f64 │
# # ╞═════╪═════╡
# # │ 3.0 ┆ 4.0 │
# # └─────┴─────┘
def median(*columns)
col(*columns).median
end
# Count unique values.
#
# This function is syntactic sugar for `col(columns).n_unique`.
#
# @param columns [Array]
# One or more column names.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 1],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.n_unique("a"))
# # =>
# # shape: (1, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ u32 │
# # ╞═════╡
# # │ 2 │
# # └─────┘
#
# @example
# df.select(Polars.n_unique("b", "c"))
# # =>
# # shape: (1, 2)
# # ┌─────┬─────┐
# # │ b ┆ c │
# # │ --- ┆ --- │
# # │ u32 ┆ u32 │
# # ╞═════╪═════╡
# # │ 3 ┆ 2 │
# # └─────┴─────┘
def n_unique(*columns)
col(*columns).n_unique
end
# Approximate count of unique values.
#
# This function is syntactic sugar for `col(columns).approx_n_unique`, and
# uses the HyperLogLog++ algorithm for cardinality estimation.
#
# @param columns [Array]
# One or more column names.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 1],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.approx_n_unique("a"))
# # =>
# # shape: (1, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ u32 │
# # ╞═════╡
# # │ 2 │
# # └─────┘
#
# @example
# df.select(Polars.approx_n_unique("b", "c"))
# # =>
# # shape: (1, 2)
# # ┌─────┬─────┐
# # │ b ┆ c │
# # │ --- ┆ --- │
# # │ u32 ┆ u32 │
# # ╞═════╪═════╡
# # │ 3 ┆ 2 │
# # └─────┴─────┘
def approx_n_unique(*columns)
col(*columns).approx_n_unique
end
# Get the first value.
#
# @param columns [Array]
# One or more column names. If not provided (default), returns an expression
# to take the first column of the context instead.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "baz"]
# }
# )
# df.select(Polars.first)
# # =>
# # shape: (3, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 1 │
# # │ 8 │
# # │ 3 │
# # └─────┘
#
# @example
# df.select(Polars.first("b"))
# # =>
# # shape: (1, 1)
# # ┌─────┐
# # │ b │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 4 │
# # └─────┘
#
# @example
# df.select(Polars.first("a", "c"))
# # =>
# # shape: (1, 2)
# # ┌─────┬─────┐
# # │ a ┆ c │
# # │ --- ┆ --- │
# # │ i64 ┆ str │
# # ╞═════╪═════╡
# # │ 1 ┆ foo │
# # └─────┴─────┘
def first(*columns)
if columns.empty?
return cs.first.as_expr
end
col(*columns).first
end
# Get the last value.
#
# @param columns [Array]
# One or more column names. If set to `nil` (default), returns an expression
# to take the last column of the context instead.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "baz"]
# }
# )
# df.select(Polars.last)
# # =>
# # shape: (3, 1)
# # ┌─────┐
# # │ c │
# # │ --- │
# # │ str │
# # ╞═════╡
# # │ foo │
# # │ bar │
# # │ baz │
# # └─────┘
#
# @example
# df.select(Polars.last("a"))
# # =>
# # shape: (1, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 3 │
# # └─────┘
#
# @example
# df.select(Polars.last("b", "c"))
# # =>
# # shape: (1, 2)
# # ┌─────┬─────┐
# # │ b ┆ c │
# # │ --- ┆ --- │
# # │ i64 ┆ str │
# # ╞═════╪═════╡
# # │ 2 ┆ baz │
# # └─────┴─────┘
def last(*columns)
if columns.empty?
return cs.last.as_expr
end
col(*columns).last
end
# Get the nth column(s) of the context.
#
# @param indices [Array]
# One or more indices representing the columns to retrieve.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "baz"]
# }
# )
# df.select(Polars.nth(1))
# # =>
# # shape: (3, 1)
# # ┌─────┐
# # │ b │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 4 │
# # │ 5 │
# # │ 2 │
# # └─────┘
#
# @example
# df.select(Polars.nth(2, 0))
# # =>
# # shape: (3, 2)
# # ┌─────┬─────┐
# # │ c ┆ a │
# # │ --- ┆ --- │
# # │ str ┆ i64 │
# # ╞═════╪═════╡
# # │ foo ┆ 1 │
# # │ bar ┆ 8 │
# # │ baz ┆ 3 │
# # └─────┴─────┘
def nth(*indices, strict: true)
cs.by_index(*indices, require_all: strict).as_expr
end
# Get the first `n` rows.
#
# This function is syntactic sugar for `col(column).head(n)`.
#
# @param column [Object]
# Column name.
# @param n [Integer]
# Number of rows to return.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.head("a"))
# # =>
# # shape: (3, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 1 │
# # │ 8 │
# # │ 3 │
# # └─────┘
#
# @example
# df.select(Polars.head("a", 2))
# # =>
# # shape: (2, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 1 │
# # │ 8 │
# # └─────┘
def head(column, n = 10)
col(column).head(n)
end
# Get the last `n` rows.
#
# This function is syntactic sugar for `col(column).tail(n)`.
#
# @param column [Object]
# Column name.
# @param n [Integer]
# Number of rows to return.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.tail("a"))
# # =>
# # shape: (3, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 1 │
# # │ 8 │
# # │ 3 │
# # └─────┘
#
# @example
# df.select(Polars.tail("a", 2))
# # =>
# # shape: (2, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 8 │
# # │ 3 │
# # └─────┘
def tail(column, n = 10)
col(column).tail(n)
end
# Compute the Pearson's or Spearman rank correlation correlation between two columns.
#
# @param a [Object]
# Column name or Expression.
# @param b [Object]
# Column name or Expression.
# @param method ["pearson", "spearman"]
# Correlation method.
# @param ddof [Integer]
# "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
# where N represents the number of elements.
# By default ddof is 1.
# @param propagate_nans [Boolean]
# If `true` any `NaN` encountered will lead to `NaN` in the output.
# Defaults to `false` where `NaN` are regarded as larger than any finite number
# and thus lead to the highest rank.
# @param eager [Boolean]
# Evaluate immediately and return a `Series`; this requires that at least one
# of the given arguments is a `Series`. If set to `false` (default), return
# an expression instead.
#
# @return [Expr]
#
# @example Pearson's correlation:
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.corr("a", "b"))
# # =>
# # shape: (1, 1)
# # ┌──────────┐
# # │ a │
# # │ --- │
# # │ f64 │
# # ╞══════════╡
# # │ 0.544705 │
# # └──────────┘
#
# @example Spearman rank correlation:
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.corr("a", "b", method: "spearman"))
# # =>
# # shape: (1, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ f64 │
# # ╞═════╡
# # │ 0.5 │
# # └─────┘
#
# @example Eager evaluation:
# s1 = Polars::Series.new("a", [1, 8, 3])
# s2 = Polars::Series.new("b", [4, 5, 2])
# Polars.corr(s1, s2, eager: true)
# # =>
# # shape: (1,)
# # Series: 'a' [f64]
# # [
# # 0.544705
# # ]
#
# @example
# Polars.corr(s1, s2, method: "spearman", eager: true)
# # =>
# # shape: (1,)
# # Series: 'a' [f64]
# # [
# # 0.5
# # ]
def corr(
a,
b,
method: "pearson",
ddof: nil,
propagate_nans: false,
eager: false
)
if !ddof.nil?
Utils.issue_deprecation_warning(
"The `ddof` parameter has no effect. Do not use it."
)
end
if eager
if !(a.is_a?(Series) || b.is_a?(Series))
msg = "expected at least one Series in 'corr' inputs if 'eager: true'"
raise ArgumentError, msg
end
frame = Polars::DataFrame.new([a, b].filter_map { |e| e if e.is_a?(Series) })
exprs = [a, b].map { |e| e.is_a?(Series) ? e.name : e }
frame.select(
corr(*exprs, eager: false, method: method, propagate_nans: propagate_nans)
).to_series
else
a = Utils.parse_into_expression(a)
b = Utils.parse_into_expression(b)
if method == "pearson"
Utils.wrap_expr(Plr.pearson_corr(a, b))
elsif method == "spearman"
Utils.wrap_expr(Plr.spearman_rank_corr(a, b, propagate_nans))
else
msg = "method must be one of {{'pearson', 'spearman'}}, got #{method}"
raise ArgumentError, msg
end
end
end
# Compute the covariance between two columns/ expressions.
#
# @param a [Object]
# Column name or Expression.
# @param b [Object]
# Column name or Expression.
# @param ddof [Integer]
# "Delta Degrees of Freedom": the divisor used in the calculation is N - ddof,
# where N represents the number of elements.
# By default ddof is 1.
# @param eager [Boolean]
# Evaluate immediately and return a `Series`; this requires that at least one
# of the given arguments is a `Series`. If set to `false` (default), return
# an expression instead.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 8, 3],
# "b" => [4, 5, 2],
# "c" => ["foo", "bar", "foo"]
# }
# )
# df.select(Polars.cov("a", "b"))
# # =>
# # shape: (1, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ f64 │
# # ╞═════╡
# # │ 3.0 │
# # └─────┘
#
# @example Eager evaluation:
# s1 = Polars::Series.new("a", [1, 8, 3])
# s2 = Polars::Series.new("b", [4, 5, 2])
# Polars.cov(s1, s2, eager: true)
# # =>
# # shape: (1,)
# # Series: 'a' [f64]
# # [
# # 3.0
# # ]
def cov(a, b, ddof: 1, eager: false)
if eager
if !(a.is_a?(Series) || b.is_a?(Series))
msg = "expected at least one Series in 'cov' inputs if 'eager: true'"
raise ArgumentError, msg
end
frame = Polars::DataFrame.new([a, b].filter_map { |e| e if e.is_a?(Series) })
exprs = [a, b].map { |e| e.is_a?(Series) ? e.name : e }
frame.select(cov(*exprs, eager: false, ddof: ddof)).to_series
else
a_rbexpr = Utils.parse_into_expression(a)
b_rbexpr = Utils.parse_into_expression(b)
Utils.wrap_expr(Plr.cov(a_rbexpr, b_rbexpr, ddof))
end
end
# Map a custom function over multiple columns/expressions.
#
# Produces a single Series result.
#
# @note
# This method is much slower than the native expressions API.
# Only use it if you cannot implement your logic otherwise.
#
# @param exprs [Array]
# Expression(s) representing the input Series to the function.
# @param return_dtype [Object]
# Datatype of the output Series.
#
# It is recommended to set this whenever possible. If this is `nil`, it tries
# to infer the datatype by calling the function with dummy data and looking at
# the output.
# @param is_elementwise [Boolean]
# Set to true if the operations is elementwise for better performance
# and optimization.
#
# An elementwise operations has unit or equal length for all inputs
# and can be ran sequentially on slices without results being affected.
# @param returns_scalar [Boolean]
# If the function returns a scalar, by default it will be wrapped in
# a list in the output, since the assumption is that the function
# always returns something Series-like. If you want to keep the
# result as a scalar, set this argument to True.
#
# @return [Expr]
#
# @note
# A UDF passed to `map_batches` must be pure, meaning that it cannot modify
# or depend on state other than its arguments. We may call the function
# with arbitrary input data.
#
# @example
# test_func = lambda do |a, b, c|
# a + b + c
# end
# df = Polars::DataFrame.new(
# {
# "a" => [1, 2, 3, 4],
# "b" => [4, 5, 6, 7]
# }
# )
#
# df.with_columns(
# (
# Polars.struct(["a", "b"]).map_batches { |x| test_func.(x.struct.field("a"), x.struct.field("b"), 1) }
# ).alias("a+b+c")
# )
# # =>
# # shape: (4, 3)
# # ┌─────┬─────┬───────┐
# # │ a ┆ b ┆ a+b+c │
# # │ --- ┆ --- ┆ --- │
# # │ i64 ┆ i64 ┆ i64 │
# # ╞═════╪═════╪═══════╡
# # │ 1 ┆ 4 ┆ 6 │
# # │ 2 ┆ 5 ┆ 8 │
# # │ 3 ┆ 6 ┆ 10 │
# # │ 4 ┆ 7 ┆ 12 │
# # └─────┴─────┴───────┘
def map_batches(
exprs,
return_dtype: nil,
is_elementwise: false,
returns_scalar: false,
&function
)
rbexprs = Utils.parse_into_list_of_expressions(exprs)
return_dtype_expr =
if !return_dtype.nil?
Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
else
nil
end
Utils.wrap_expr(
Plr.map_expr(
rbexprs,
_map_batches_wrapper(function, returns_scalar: returns_scalar),
return_dtype_expr,
is_elementwise,
returns_scalar
)
)
end
# Apply a custom/user-defined function (UDF) in a GroupBy context.
#
# @note
# This method is much slower than the native expressions API.
# Only use it if you cannot implement your logic otherwise.
#
# @param exprs [Object]
# Expression(s) representing the input Series to the function.
# @param return_dtype [Object]
# Datatype of the output Series.
#
# It is recommended to set this whenever possible. If this is `nil`, it tries
# to infer the datatype by calling the function with dummy data and looking at
# the output.
# @param is_elementwise [Boolean]
# Set to true if the operations is elementwise for better performance
# and optimization.
#
# An elementwise operations has unit or equal length for all inputs
# and can be ran sequentially on slices without results being affected.
# @param returns_scalar [Boolean]
# If the function returns a single scalar as output.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "group" => [1, 1, 2],
# "a" => [1, 3, 3],
# "b" => [5, 6, 7]
# }
# )
# (
# df.group_by("group").agg(
# Polars.map_groups(["a", "b"], return_dtype: Polars::Float64) { |list_of_series| list_of_series[0] / list_of_series[0].sum + list_of_series[1] }
# .alias("my_custom_aggregation")
# )
# ).sort("group")
# # =>
# # shape: (2, 2)
# # ┌───────┬───────────────────────┐
# # │ group ┆ my_custom_aggregation │
# # │ --- ┆ --- │
# # │ i64 ┆ list[f64] │
# # ╞═══════╪═══════════════════════╡
# # │ 1 ┆ [5.25, 6.75] │
# # │ 2 ┆ [8.0] │
# # └───────┴───────────────────────┘
def map_groups(
exprs,
return_dtype: nil,
is_elementwise: false,
returns_scalar: false,
&function
)
map_batches(
exprs,
return_dtype: return_dtype,
is_elementwise: is_elementwise,
returns_scalar: returns_scalar,
&function
)
end
# Accumulate over multiple columns horizontally/row wise with a left fold.
#
# @return [Expr]
#
# @example Horizontally sum over all columns and add 1.
# df = Polars::DataFrame.new(
# {
# "a" => [1, 2, 3],
# "b" => [3, 4, 5],
# "c" => [5, 6, 7]
# }
# )
# df.select(
# Polars.fold(Polars.lit(1), Polars.col("*")) { |acc, x| acc + x }.alias("sum")
# )
# # =>
# # shape: (3, 1)
# # ┌─────┐
# # │ sum │
# # │ --- │
# # │ i32 │
# # ╞═════╡
# # │ 10 │
# # │ 13 │
# # │ 16 │
# # └─────┘
#
# @example You can also apply a condition/predicate on all columns:
# df = Polars::DataFrame.new(
# {
# "a" => [1, 2, 3],
# "b" => [0, 1, 2]
# }
# )
# df.filter(
# Polars.fold(Polars.lit(true), Polars.col("*") > 1) { |acc, x| acc & x }
# )
# # =>
# # shape: (1, 2)
# # ┌─────┬─────┐
# # │ a ┆ b │
# # │ --- ┆ --- │
# # │ i64 ┆ i64 │
# # ╞═════╪═════╡
# # │ 3 ┆ 2 │
# # └─────┴─────┘
def fold(
acc,
exprs,
returns_scalar: false,
return_dtype: nil,
&function
)
acc = Utils.parse_into_expression(acc, str_as_lit: true)
if exprs.is_a?(Expr)
exprs = [exprs]
end
rt = nil
if !return_dtype.nil?
rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
end
exprs = Utils.parse_into_list_of_expressions(exprs)
Utils.wrap_expr(
Plr.fold(
acc,
_wrap_acc_lambda(function),
exprs,
returns_scalar,
rt
)
)
end
# Accumulate over multiple columns horizontally/ row wise with a left fold.
#
# @param exprs [Object]
# Expressions to aggregate over. May also be a wildcard expression.
# @param returns_scalar [Boolean]
# Whether or not `function` applied returns a scalar. This must be set correctly
# by the user.
# @param return_dtype [Object]
# Output datatype.
# If not set, the dtype will be inferred based on the dtype of the input
# expressions.
#
# @return [Expr]
#
# @example Horizontally sum over all columns.
# df = Polars::DataFrame.new(
# {
# "a" => [1, 2, 3],
# "b" => [0, 1, 2]
# }
# )
# df.select(
# Polars.reduce(Polars.col("*")) { |acc, x| acc + x }.alias("sum")
# )
# # =>
# # shape: (3, 1)
# # ┌─────┐
# # │ sum │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 1 │
# # │ 3 │
# # │ 5 │
# # └─────┘
def reduce(
exprs,
returns_scalar: false,
return_dtype: nil,
&function
)
if exprs.is_a?(Expr)
exprs = [exprs]
end
rt = nil
if !return_dtype.nil?
rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
end
rbexprs = Utils.parse_into_list_of_expressions(exprs)
Utils.wrap_expr(
Plr.reduce(
_wrap_acc_lambda(function),
rbexprs,
returns_scalar,
rt
)
)
end
# Cumulatively accumulate over multiple columns horizontally/row wise with a left fold.
#
# Every cumulative result is added as a separate field in a Struct column.
#
# @param acc [Object]
# Accumulator Expression. This is the value that will be initialized when the fold
# starts. For a sum this could for instance be lit(0).
# @param exprs [Object]
# Expressions to aggregate over. May also be a wildcard expression.
# @param returns_scalar [Boolean]
# Whether or not `function` applied returns a scalar. This must be set correctly
# by the user.
# @param return_dtype [Object]
# Output datatype.
# If not set, the dtype will be inferred based on the dtype of the accumulator.
# @param include_init [Boolean]
# Include the initial accumulator state as struct field.
#
# @return [Object]
#
# @note
# If you simply want the first encountered expression as accumulator,
# consider using `cum_reduce`.
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 2, 3],
# "b" => [3, 4, 5],
# "c" => [5, 6, 7]
# }
# )
# df.with_columns(
# Polars.cum_fold(Polars.lit(1), Polars.all) { |acc, x| acc + x }
# )
# # =>
# # shape: (3, 4)
# # ┌─────┬─────┬─────┬───────────┐
# # │ a ┆ b ┆ c ┆ cum_fold │
# # │ --- ┆ --- ┆ --- ┆ --- │
# # │ i64 ┆ i64 ┆ i64 ┆ struct[3] │
# # ╞═════╪═════╪═════╪═══════════╡
# # │ 1 ┆ 3 ┆ 5 ┆ {2,5,10} │
# # │ 2 ┆ 4 ┆ 6 ┆ {3,7,13} │
# # │ 3 ┆ 5 ┆ 7 ┆ {4,9,16} │
# # └─────┴─────┴─────┴───────────┘
def cum_fold(
acc,
exprs,
returns_scalar: false,
return_dtype: nil,
include_init: false,
&function
)
acc = Utils.parse_into_expression(acc, str_as_lit: true)
if exprs.is_a?(Expr)
exprs = [exprs]
end
rt = nil
if !return_dtype.nil?
rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
end
exprs = Utils.parse_into_list_of_expressions(exprs)
Utils.wrap_expr(
Plr.cum_fold(
acc,
_wrap_acc_lambda(function),
exprs,
returns_scalar,
rt,
include_init
).alias("cum_fold")
)
end
# Cumulatively reduce horizontally across columns with a left fold.
#
# Every cumulative result is added as a separate field in a Struct column.
#
# @param exprs [Object]
# Expressions to aggregate over. May also be a wildcard expression.
# @param returns_scalar [Boolean]
# Whether or not `function` applied returns a scalar. This must be set correctly
# by the user.
# @param return_dtype [Object]
# Output datatype.
# If not set, the dtype will be inferred based on the dtype of the input
# expressions.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, 2, 3],
# "b" => [3, 4, 5],
# "c" => [5, 6, 7]
# }
# )
# df.with_columns(Polars.cum_reduce(Polars.all) { |acc, x| acc + x })
# # =>
# # shape: (3, 4)
# # ┌─────┬─────┬─────┬────────────┐
# # │ a ┆ b ┆ c ┆ cum_reduce │
# # │ --- ┆ --- ┆ --- ┆ --- │
# # │ i64 ┆ i64 ┆ i64 ┆ struct[3] │
# # ╞═════╪═════╪═════╪════════════╡
# # │ 1 ┆ 3 ┆ 5 ┆ {1,4,9} │
# # │ 2 ┆ 4 ┆ 6 ┆ {2,6,12} │
# # │ 3 ┆ 5 ┆ 7 ┆ {3,8,15} │
# # └─────┴─────┴─────┴────────────┘
def cum_reduce(
exprs,
returns_scalar: false,
return_dtype: nil,
&function
)
if exprs.is_a?(Expr)
exprs = [exprs]
end
rt = nil
if !return_dtype.nil?
rt = Utils.parse_into_datatype_expr(return_dtype)._rbdatatype_expr
end
rbexprs = Utils.parse_into_list_of_expressions(exprs)
Utils.wrap_expr(
Plr.cum_reduce(
_wrap_acc_lambda(function),
rbexprs,
returns_scalar,
rt
).alias("cum_reduce")
)
end
# Compute two argument arctan in radians.
#
# Returns the angle (in radians) in the plane between the
# positive x-axis and the ray from the origin to (x,y).
#
# @param y [Object]
# Column name or Expression.
# @param x [Object]
# Column name or Expression.
#
# @return [Expr]
#
# @example
# c = Math.sqrt(2) / 2
# df = Polars::DataFrame.new(
# {
# "y" => [c, -c, c, -c],
# "x" => [c, c, -c, -c]
# }
# )
# df.with_columns(Polars.arctan2("y", "x").alias("atan2"))
# # =>
# # shape: (4, 3)
# # ┌───────────┬───────────┬───────────┐
# # │ y ┆ x ┆ atan2 │
# # │ --- ┆ --- ┆ --- │
# # │ f64 ┆ f64 ┆ f64 │
# # ╞═══════════╪═══════════╪═══════════╡
# # │ 0.707107 ┆ 0.707107 ┆ 0.785398 │
# # │ -0.707107 ┆ 0.707107 ┆ -0.785398 │
# # │ 0.707107 ┆ -0.707107 ┆ 2.356194 │
# # │ -0.707107 ┆ -0.707107 ┆ -2.356194 │
# # └───────────┴───────────┴───────────┘
def arctan2(y, x)
if Utils.strlike?(y)
y = col(y)
end
if Utils.strlike?(x)
x = col(x)
end
Utils.wrap_expr(Plr.arctan2(y._rbexpr, x._rbexpr))
end
# Exclude certain columns from a wildcard/regex selection.
#
# @param columns [Object]
# The name or datatype of the column(s) to exclude. Accepts regular expression
# input. Regular expressions should start with `^` and end with `$`.
# @param more_columns [Array]
# Additional names or datatypes of columns to exclude, specified as positional
# arguments.
#
# @return [Object]
#
# @example
# df = Polars::DataFrame.new(
# {
# "aa" => [1, 2, 3],
# "ba" => ["a", "b", nil],
# "cc" => [nil, 2.5, 1.5]
# }
# )
# # =>
# # shape: (3, 3)
# # ┌─────┬──────┬──────┐
# # │ aa ┆ ba ┆ cc │
# # │ --- ┆ --- ┆ --- │
# # │ i64 ┆ str ┆ f64 │
# # ╞═════╪══════╪══════╡
# # │ 1 ┆ a ┆ null │
# # │ 2 ┆ b ┆ 2.5 │
# # │ 3 ┆ null ┆ 1.5 │
# # └─────┴──────┴──────┘
#
# @example Exclude by column name(s):
# df.select(Polars.exclude("ba"))
# # =>
# # shape: (3, 2)
# # ┌─────┬──────┐
# # │ aa ┆ cc │
# # │ --- ┆ --- │
# # │ i64 ┆ f64 │
# # ╞═════╪══════╡
# # │ 1 ┆ null │
# # │ 2 ┆ 2.5 │
# # │ 3 ┆ 1.5 │
# # └─────┴──────┘
#
# @example Exclude by regex, e.g. removing all columns whose names end with the letter "a":
# df.select(Polars.exclude("^.*a$"))
# # =>
# # shape: (3, 1)
# # ┌──────┐
# # │ cc │
# # │ --- │
# # │ f64 │
# # ╞══════╡
# # │ null │
# # │ 2.5 │
# # │ 1.5 │
# # └──────┘
def exclude(columns, *more_columns)
col("*").exclude(columns, *more_columns)
end
# Syntactic sugar for `Polars.col("foo").agg_groups`.
#
# @return [Object]
def groups(column)
col(column).agg_groups
end
# Syntactic sugar for `Polars.col("foo").quantile(...)`.
#
# @param column [String]
# Column name.
# @param quantile [Float]
# Quantile between 0.0 and 1.0.
# @param interpolation ["nearest", "higher", "lower", "midpoint", "linear"]
# Interpolation method.
#
# @return [Expr]
def quantile(column, quantile, interpolation: "nearest")
col(column).quantile(quantile, interpolation: interpolation)
end
# Find the indexes that would sort the columns.
#
# Argsort by multiple columns. The first column will be used for the ordering.
# If there are duplicates in the first column, the second column will be used to
# determine the ordering and so on.
#
# @param exprs [Object]
# Columns use to determine the ordering.
# @param more_exprs [Array]
# Additional columns to arg sort by, specified as positional arguments.
# @param descending [Boolean]
# Default is ascending.
# @param nulls_last [Boolean]
# Place null values last.
# @param multithreaded [Boolean]
# Sort using multiple threads.
# @param maintain_order [Boolean]
# Whether the order should be maintained if elements are equal.
#
# @return [Expr]
#
# @example Pass a single column name to compute the arg sort by that column.
# df = Polars::DataFrame.new(
# {
# "a" => [0, 1, 1, 0],
# "b" => [3, 2, 3, 2],
# "c" => [1, 2, 3, 4]
# }
# )
# df.select(Polars.arg_sort_by("a"))
# # =>
# # shape: (4, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ u32 │
# # ╞═════╡
# # │ 0 │
# # │ 3 │
# # │ 1 │
# # │ 2 │
# # └─────┘
#
# @example Compute the arg sort by multiple columns by either passing a list of columns, or by specifying each column as a positional argument.
# df.select(Polars.arg_sort_by(["a", "b"], descending: true))
# # =>
# # shape: (4, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ u32 │
# # ╞═════╡
# # │ 2 │
# # │ 1 │
# # │ 0 │
# # │ 3 │
# # └─────┘
#
# @example Use gather to apply the arg sort to other columns.
# df.select(Polars.col("c").gather(Polars.arg_sort_by("a")))
# # =>
# # shape: (4, 1)
# # ┌─────┐
# # │ c │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 1 │
# # │ 4 │
# # │ 2 │
# # │ 3 │
# # └─────┘
def arg_sort_by(
exprs,
*more_exprs,
descending: false,
nulls_last: false,
multithreaded: true,
maintain_order: false
)
exprs = Utils.parse_into_list_of_expressions(exprs, *more_exprs)
descending = Utils.extend_bool(descending, exprs.length, "descending", "exprs")
nulls_last = Utils.extend_bool(nulls_last, exprs.length, "nulls_last", "exprs")
Utils.wrap_expr(Plr.arg_sort_by(exprs, descending, nulls_last, multithreaded, maintain_order))
end
# Collect multiple LazyFrames at the same time.
#
# This runs all the computation graphs in parallel on Polars threadpool.
#
# @param lazy_frames [Boolean]
# A list of LazyFrames to collect.
# @param optimizations
# The optimization passes done during query optimization.
#
# This has no effect if `lazy` is set to `true`.
# @param engine [String]
# Select the engine used to process the query, optional.
# At the moment, if set to `"auto"` (default), the query is run
# using the polars streaming engine. Polars will also
# attempt to use the engine set by the `POLARS_ENGINE_AFFINITY`
# environment variable. If it cannot run the query using the
# selected engine, the query is run using the polars streaming
# engine.
# @param lazy [Boolean]
# Return as LazyFrame that can be collected later.
# This is only correct if all inputs sink to disk.
#
# This functionality is considered **unstable**. It may be changed
# at any point without it being considered a breaking change.
#
# @return [Array]
def collect_all(
lazy_frames,
optimizations: DEFAULT_QUERY_OPT_FLAGS,
engine: "auto",
lazy: false
)
lfs = lazy_frames.map { |lf| lf._ldf }
if lazy
msg = "the `lazy` parameter of `collect_all` is considered unstable."
Utils.issue_unstable_warning(msg)
ldf = Plr.collect_all_lazy(lfs, optimizations._rboptflags)
lf = LazyFrame._from_pyldf(ldf)
return lf
end
engine = LazyFrame._select_engine(engine)
out = Plr.collect_all(lfs, engine, optimizations._rboptflags)
# wrap the rbdataframes into dataframe
result = out.map { |rbdf| Utils.wrap_df(rbdf) }
result
end
# Explain multiple LazyFrames as if passed to `collect_all`.
#
# Common Subplan Elimination is applied on the combined plan, meaning
# that diverging queries will run only once.
#
# @param lazy_frames [Array]
# A list of LazyFrames to collect.
# @param optimizations [Object]
# The optimization passes done during query optimization.
#
# @return [String]
def explain_all(
lazy_frames,
optimizations: DEFAULT_QUERY_OPT_FLAGS
)
lfs = lazy_frames.map { |lf| lf._ldf }
Plr.explain_all(lfs, optimizations._rboptflags)
end
# Run polars expressions without a context.
#
# This is syntactic sugar for running `df.select` on an empty DataFrame.
#
# @param exprs [Array]
# Column(s) to select, specified as positional arguments.
# Accepts expression input. Strings are parsed as column names,
# other non-expression inputs are parsed as literals.
# @param eager [Boolean]
# Evaluate immediately and return a `DataFrame` (default); if set to `false`,
# return a `LazyFrame` instead.
# @param named_exprs [Hash]
# Additional columns to select, specified as keyword arguments.
# The columns will be renamed to the keyword used.
#
# @return [DataFrame]
#
# @example
# foo = Polars::Series.new("foo", [1, 2, 3])
# bar = Polars::Series.new("bar", [3, 2, 1])
# Polars.select(min: Polars.min_horizontal(foo, bar))
# # =>
# # shape: (3, 1)
# # ┌─────┐
# # │ min │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 1 │
# # │ 2 │
# # │ 1 │
# # └─────┘
def select(*exprs, eager: true, **named_exprs)
empty_frame = eager ? Polars::DataFrame.new : Polars::LazyFrame.new
empty_frame.select(*exprs, **named_exprs)
end
# Return indices where `condition` evaluates `true`.
#
# @param condition [Expr]
# Boolean expression to evaluate
# @param eager [Boolean]
# Whether to apply this function eagerly (as opposed to lazily).
#
# @return [Expr, Series]
#
# @example
# df = Polars::DataFrame.new({"a" => [1, 2, 3, 4, 5]})
# df.select(
# [
# Polars.arg_where(Polars.col("a") % 2 == 0)
# ]
# ).to_series
# # =>
# # shape: (2,)
# # Series: 'a' [u32]
# # [
# # 1
# # 3
# # ]
def arg_where(condition, eager: false)
if eager
if !condition.is_a?(Series)
raise ArgumentError, "expected 'Series' in 'arg_where' if 'eager: true', got #{condition.class.name}"
end
condition.to_frame.select(arg_where(Polars.col(condition.name))).to_series
else
condition = Utils.parse_into_expression(condition, str_as_lit: true)
Utils.wrap_expr(Plr.arg_where(condition))
end
end
# Folds the columns from left to right, keeping the first non-null value.
#
# @param exprs [Array]
# Columns to coalesce. Accepts expression input. Strings are parsed as column
# names, other non-expression inputs are parsed as literals.
# @param more_exprs [Hash]
# Additional columns to coalesce, specified as positional arguments.
# @param eager [Boolean]
# Evaluate immediately and return a `Series`; this requires that at least one
# of the given arguments is a `Series`. If set to `false` (default), return
# an expression instead.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new(
# {
# "a" => [1, nil, nil, nil],
# "b" => [1, 2, nil, nil],
# "c" => [5, nil, 3, nil]
# }
# )
# df.with_columns(Polars.coalesce(["a", "b", "c", 10]).alias("d"))
# # =>
# # shape: (4, 4)
# # ┌──────┬──────┬──────┬─────┐
# # │ a ┆ b ┆ c ┆ d │
# # │ --- ┆ --- ┆ --- ┆ --- │
# # │ i64 ┆ i64 ┆ i64 ┆ i64 │
# # ╞══════╪══════╪══════╪═════╡
# # │ 1 ┆ 1 ┆ 5 ┆ 1 │
# # │ null ┆ 2 ┆ null ┆ 2 │
# # │ null ┆ null ┆ 3 ┆ 3 │
# # │ null ┆ null ┆ null ┆ 10 │
# # └──────┴──────┴──────┴─────┘
#
# @example
# df.with_columns(Polars.coalesce(Polars.col(["a", "b", "c"]), 10.0).alias("d"))
# # =>
# # shape: (4, 4)
# # ┌──────┬──────┬──────┬──────┐
# # │ a ┆ b ┆ c ┆ d │
# # │ --- ┆ --- ┆ --- ┆ --- │
# # │ i64 ┆ i64 ┆ i64 ┆ f64 │
# # ╞══════╪══════╪══════╪══════╡
# # │ 1 ┆ 1 ┆ 5 ┆ 1.0 │
# # │ null ┆ 2 ┆ null ┆ 2.0 │
# # │ null ┆ null ┆ 3 ┆ 3.0 │
# # │ null ┆ null ┆ null ┆ 10.0 │
# # └──────┴──────┴──────┴──────┘
#
# @example
# s1 = Polars::Series.new("a", [nil, 2, nil])
# s2 = Polars::Series.new("b", [1, nil, 3])
# Polars.coalesce(s1, s2, eager: true)
# # =>
# # shape: (3,)
# # Series: 'a' [i64]
# # [
# # 1
# # 2
# # 3
# # ]
def coalesce(exprs, *more_exprs, eager: false)
if eager
exprs = [exprs] + more_exprs
series = exprs.filter_map { |e| e if e.is_a?(Series) }
if !series.any?
msg = "expected at least one Series in 'coalesce' if 'eager: true'"
raise ArgumentError, msg
end
exprs = exprs.map { |e| e.is_a?(Series) ? e.name : e }
Polars::DataFrame.new(series).select(coalesce(exprs, eager: false)).to_series
else
exprs = Utils.parse_into_list_of_expressions(exprs, *more_exprs)
Utils.wrap_expr(Plr.coalesce(exprs))
end
end
# Utility function that parses an epoch timestamp (or Unix time) to Polars Date(time).
#
# Depending on the `unit` provided, this function will return a different dtype:
# - time_unit: "d" returns pl.Date
# - time_unit: "s" returns pl.Datetime["us"] (pl.Datetime's default)
# - time_unit: "ms" returns pl.Datetime["ms"]
# - time_unit: "us" returns pl.Datetime["us"]
# - time_unit: "ns" returns pl.Datetime["ns"]
#
# @param column [Object]
# Series or expression to parse integers to pl.Datetime.
# @param time_unit [String]
# The unit of the timesteps since epoch time.
#
# @return [Object]
#
# @example
# df = Polars::DataFrame.new({"timestamp" => [1666683077, 1666683099]}).lazy
# df.select(Polars.from_epoch(Polars.col("timestamp"), time_unit: "s")).collect
# # =>
# # shape: (2, 1)
# # ┌─────────────────────┐
# # │ timestamp │
# # │ --- │
# # │ datetime[μs] │
# # ╞═════════════════════╡
# # │ 2022-10-25 07:31:17 │
# # │ 2022-10-25 07:31:39 │
# # └─────────────────────┘
def from_epoch(column, time_unit: "s")
if Utils.strlike?(column)
column = F.col(column)
elsif !column.is_a?(Series) && !column.is_a?(Expr)
column = Series.new(column)
end
if time_unit == "d"
column.cast(Date)
elsif time_unit == "s"
(column.cast(Int64) * 1_000_000).cast(Datetime.new("us"))
elsif Utils::DTYPE_TEMPORAL_UNITS.include?(time_unit)
column.cast(Datetime.new(time_unit))
else
raise ArgumentError, "`time_unit` must be one of {{'ns', 'us', 'ms', 's', 'd'}}, got #{time_unit.inspect}."
end
end
# Compute the rolling covariance between two columns/ expressions.
#
# The window at a given row includes the row itself and the
# `window_size - 1` elements before it.
#
# @param a [Object]
# Column name or Expression.
# @param b [Object]
# Column name or Expression.
# @param window_size [Integer]
# The length of the window.
# @param min_samples [Integer]
# The number of values in the window that should be non-null before computing
# a result. If nil, it will be set equal to window size.
# @param ddof [Integer]
# Delta degrees of freedom. The divisor used in calculations
# is `N - ddof`, where `N` represents the number of elements.
#
# @return [Expr]
def rolling_cov(
a,
b,
window_size:,
min_samples: nil,
ddof: 1
)
if min_samples.nil?
min_samples = window_size
end
if Utils.strlike?(a)
a = F.col(a)
end
if Utils.strlike?(b)
b = F.col(b)
end
Utils.wrap_expr(
Plr.rolling_cov(a._rbexpr, b._rbexpr, window_size, min_samples, ddof)
)
end
# Compute the rolling correlation between two columns/ expressions.
#
# The window at a given row includes the row itself and the
# `window_size - 1` elements before it.
#
# @param a [Object]
# Column name or Expression.
# @param b [Object]
# Column name or Expression.
# @param window_size [Integer]
# The length of the window.
# @param min_samples [Integer]
# The number of values in the window that should be non-null before computing
# a result. If nil, it will be set equal to window size.
# @param ddof [Integer]
# Delta degrees of freedom. The divisor used in calculations
# is `N - ddof`, where `N` represents the number of elements.
#
# @return [Expr]
def rolling_corr(
a,
b,
window_size:,
min_samples: nil,
ddof: 1
)
if min_samples.nil?
min_samples = window_size
end
if Utils.strlike?(a)
a = F.col(a)
end
if Utils.strlike?(b)
b = F.col(b)
end
Utils.wrap_expr(
Plr.rolling_corr(a._rbexpr, b._rbexpr, window_size, min_samples, ddof)
)
end
# Parse one or more SQL expressions to polars expression(s).
#
# @param sql [Object]
# One or more SQL expressions.
#
# @return [Expr]
#
# @example Parse a single SQL expression:
# df = Polars::DataFrame.new({"a" => [2, 1]})
# expr = Polars.sql_expr("MAX(a)")
# df.select(expr)
# # =>
# # shape: (1, 1)
# # ┌─────┐
# # │ a │
# # │ --- │
# # │ i64 │
# # ╞═════╡
# # │ 2 │
# # └─────┘
#
# @example Parse multiple SQL expressions:
# df.with_columns(
# *Polars.sql_expr(["POWER(a,a) AS a_a", "CAST(a AS TEXT) AS a_txt"])
# )
# # =>
# # shape: (2, 3)
# # ┌─────┬─────┬───────┐
# # │ a ┆ a_a ┆ a_txt │
# # │ --- ┆ --- ┆ --- │
# # │ i64 ┆ i64 ┆ str │
# # ╞═════╪═════╪═══════╡
# # │ 2 ┆ 4 ┆ 2 │
# # │ 1 ┆ 1 ┆ 1 │
# # └─────┴─────┴───────┘
def sql_expr(sql)
if sql.is_a?(::String)
Utils.wrap_expr(Plr.sql_expr(sql))
else
sql.map { |q| Utils.wrap_expr(Plr.sql_expr(q)) }
end
end
# Generates a sequence of integers.
#
# The length of the returned sequence will match the context length, and the
# datatype will match the one returned by `get_index_dtype()`.
#
# If you would like to generate sequences with custom offsets / length /
# step size / datatypes, it is recommended to use `int_range` instead.
#
# @note
# This functionality is considered **unstable**. It may be changed
# at any point without it being considered a breaking change.
#
# @param name [String]
# Name of the returned column.
#
# @return [Expr]
#
# @example
# df = Polars::DataFrame.new({"x" => ["A", "A", "B", "B", "B"]})
# df.with_columns(Polars.row_index, Polars.row_index("another_index"))
# # =>
# # shape: (5, 3)
# # ┌─────┬───────┬───────────────┐
# # │ x ┆ index ┆ another_index │
# # │ --- ┆ --- ┆ --- │
# # │ str ┆ u32 ┆ u32 │
# # ╞═════╪═══════╪═══════════════╡
# # │ A ┆ 0 ┆ 0 │
# # │ A ┆ 1 ┆ 1 │
# # │ B ┆ 2 ┆ 2 │
# # │ B ┆ 3 ┆ 3 │
# # │ B ┆ 4 ┆ 4 │
# # └─────┴───────┴───────────────┘
#
# @example
# df.group_by("x").agg(Polars.row_index).sort("x")
# # =>
# # shape: (2, 2)
# # ┌─────┬───────────┐
# # │ x ┆ index │
# # │ --- ┆ --- │
# # │ str ┆ list[u32] │
# # ╞═════╪═══════════╡
# # │ A ┆ [0, 1] │
# # │ B ┆ [0, 1, 2] │
# # └─────┴───────────┘
#
# @example
# df.select(Polars.row_index)
# # =>
# # shape: (5, 1)
# # ┌───────┐
# # │ index │
# # │ --- │
# # │ u32 │
# # ╞═══════╡
# # │ 0 │
# # │ 1 │
# # │ 2 │
# # │ 3 │
# # │ 4 │
# # └───────┘
def row_index(name = "index")
# Notes
# * Dispatching to `int_range` means that we cannot accept an offset
# parameter, as unlike `DataFrame.with_row_index`, `int_range` will simply
# truncate instead of raising an error.
F.int_range(
F.len,
dtype: get_index_type
).alias(name)
end
private
def _wrap_acc_lambda(function)
lambda do |t|
a, b = t
function.(Utils.wrap_s(a), Utils.wrap_s(b))._s
end
end
def _map_batches_wrapper(function, returns_scalar:)
lambda do |sl, *args, **kwargs|
return_dtype = kwargs[:return_dtype]
slp = sl.map { |s| Utils.wrap_s(s) }
rv = nil
begin
rv = function.(slp, *args, **kwargs)
rescue ArgumentError => e
if e.message == "wrong number of arguments (given 2, expected 1)"
kwargs.delete(:return_dtype)
rv = function.(slp, *args, **kwargs)
else
raise
end
end
if rv.is_a?(Series)
rv._s
elsif returns_scalar
Series.new([rv], dtype: return_dtype)._s
else
msg = "`map` with `returns_scalar: false` must return a Series; found #{rv.inspect}.\n\nIf `returns_scalar` is set to `true`, a returned value can be a scalar value."
raise TypeError, msg
end
end
end
end
end