Skip to content

Commit d235e7a

Browse files
phofl authored and noatamir committed
CLN: Use cython algo for groupby var with ddof != 1 (pandas-dev#48152)
* CLN: Use cython algo for groupby var with ddof != 1
* Address review
1 parent 72725b3 commit d235e7a

File tree

3 files changed

+16
-16
lines changed

3 files changed

+16
-16
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -896,6 +896,7 @@ Performance improvements
896896
- Performance improvement in :meth:`.GroupBy.transform` for user-defined functions when only a single group exists (:issue:`44977`)
897897
- Performance improvement in :meth:`.GroupBy.apply` when grouping on a non-unique unsorted index (:issue:`46527`)
898898
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`45681`, :issue:`46040`, :issue:`46330`)
899+
- Performance improvement in :meth:`.GroupBy.var` with ``ddof`` other than one (:issue:`48152`)
899900
- Performance improvement in :meth:`DataFrame.to_records` when the index is a :class:`MultiIndex` (:issue:`47263`)
900901
- Performance improvement in :attr:`MultiIndex.values` when the MultiIndex contains levels of type DatetimeIndex, TimedeltaIndex or ExtensionDtypes (:issue:`46288`)
901902
- Performance improvement in :func:`merge` when left and/or right are empty (:issue:`45838`)

pandas/core/groupby/groupby.py

+14-15
Original file line numberDiff line numberDiff line change
@@ -1744,6 +1744,7 @@ def _cython_agg_general(
17441744
numeric_only: bool | lib.NoDefault,
17451745
min_count: int = -1,
17461746
ignore_failures: bool = True,
1747+
**kwargs,
17471748
):
17481749
# Note: we never get here with how="ohlc" for DataFrameGroupBy;
17491750
# that goes through SeriesGroupBy
@@ -1768,7 +1769,12 @@ def _cython_agg_general(
17681769
def array_func(values: ArrayLike) -> ArrayLike:
17691770
try:
17701771
result = self.grouper._cython_operation(
1771-
"aggregate", values, how, axis=data.ndim - 1, min_count=min_count
1772+
"aggregate",
1773+
values,
1774+
how,
1775+
axis=data.ndim - 1,
1776+
min_count=min_count,
1777+
**kwargs,
17721778
)
17731779
except NotImplementedError:
17741780
# generally if we have numeric_only=False
@@ -2311,20 +2317,13 @@ def var(
23112317

23122318
return self._numba_agg_general(sliding_var, engine_kwargs, ddof)
23132319
else:
2314-
numeric_only_bool = self._resolve_numeric_only("var", numeric_only, axis=0)
2315-
if ddof == 1:
2316-
return self._cython_agg_general(
2317-
"var",
2318-
alt=lambda x: Series(x).var(ddof=ddof),
2319-
numeric_only=numeric_only,
2320-
ignore_failures=numeric_only is lib.no_default,
2321-
)
2322-
else:
2323-
func = lambda x: x.var(ddof=ddof)
2324-
with self._group_selection_context():
2325-
return self._python_agg_general(
2326-
func, raise_on_typeerror=not numeric_only_bool
2327-
)
2320+
return self._cython_agg_general(
2321+
"var",
2322+
alt=lambda x: Series(x).var(ddof=ddof),
2323+
numeric_only=numeric_only,
2324+
ignore_failures=numeric_only is lib.no_default,
2325+
ddof=ddof,
2326+
)
23282327

23292328
@final
23302329
@Substitution(name="groupby")

pandas/core/groupby/ops.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -600,7 +600,7 @@ def _call_cython_op(
600600
elif self.how == "ohlc":
601601
func(result, counts, values, comp_ids, min_count, mask, result_mask)
602602
else:
603-
func(result, counts, values, comp_ids, min_count)
603+
func(result, counts, values, comp_ids, min_count, **kwargs)
604604
else:
605605
# TODO: min_count
606606
if self.uses_mask():

0 commit comments

Comments
 (0)