Skip to content

Commit 355cbb1

Browse files
mroeschke authored and Kevin D Smith committed
PERF: ExpandingGroupby (pandas-dev#37064)
1 parent 57f1968 commit 355cbb1

File tree

7 files changed

+167
-182
lines changed

7 files changed

+167
-182
lines changed

asv_bench/benchmarks/rolling.py

+9
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,21 @@ class ExpandingMethods:
7676

7777
def setup(self, constructor, dtype, method):
7878
N = 10 ** 5
79+
N_groupby = 100
7980
arr = (100 * np.random.random(N)).astype(dtype)
8081
self.expanding = getattr(pd, constructor)(arr).expanding()
82+
self.expanding_groupby = (
83+
pd.DataFrame({"A": arr[:N_groupby], "B": range(N_groupby)})
84+
.groupby("B")
85+
.expanding()
86+
)
8187

8288
def time_expanding(self, constructor, dtype, method):
8389
getattr(self.expanding, method)()
8490

91+
def time_expanding_groupby(self, constructor, dtype, method):
92+
getattr(self.expanding_groupby, method)()
93+
8594

8695
class EWMMethods:
8796

doc/source/whatsnew/v1.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ Performance improvements
314314
avoiding creating these again, if created on either. This can speed up operations that depend on creating copies of existing indexes (:issue:`36840`)
315315
- Performance improvement in :meth:`RollingGroupby.count` (:issue:`35625`)
316316
- Small performance decrease to :meth:`Rolling.min` and :meth:`Rolling.max` for fixed windows (:issue:`36567`)
317+
- Performance improvement in :class:`ExpandingGroupby` (:issue:`37064`)
317318

318319
.. ---------------------------------------------------------------------------
319320

pandas/_libs/window/aggregations.pyx

+1-1
Original file line numberDiff line numberDiff line change
@@ -302,7 +302,7 @@ cdef inline float64_t calc_var(int64_t minp, int ddof, float64_t nobs,
302302
result = ssqdm_x / (nobs - <float64_t>ddof)
303303
# Fix for numerical imprecision.
304304
# Can be result < 0 once Kahan Summation is implemented
305-
if result < 1e-15:
305+
if result < 1e-14:
306306
result = 0
307307
else:
308308
result = NaN

pandas/core/window/common.py

-65
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,11 @@
11
"""Common utility functions for rolling operations"""
22
from collections import defaultdict
3-
from typing import Callable, Optional
43
import warnings
54

65
import numpy as np
76

87
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
98

10-
from pandas.core.groupby.base import GotItemMixin
119
from pandas.core.indexes.api import MultiIndex
1210
from pandas.core.shared_docs import _shared_docs
1311

@@ -27,69 +25,6 @@
2725
"""
2826

2927

30-
def _dispatch(name: str, *args, **kwargs):
31-
"""
32-
Dispatch to apply.
33-
"""
34-
35-
def outer(self, *args, **kwargs):
36-
def f(x):
37-
x = self._shallow_copy(x, groupby=self._groupby)
38-
return getattr(x, name)(*args, **kwargs)
39-
40-
return self._groupby.apply(f)
41-
42-
outer.__name__ = name
43-
return outer
44-
45-
46-
class WindowGroupByMixin(GotItemMixin):
47-
"""
48-
Provide the groupby facilities.
49-
"""
50-
51-
def __init__(self, obj, *args, **kwargs):
52-
kwargs.pop("parent", None)
53-
groupby = kwargs.pop("groupby", None)
54-
if groupby is None:
55-
groupby, obj = obj, obj._selected_obj
56-
self._groupby = groupby
57-
self._groupby.mutated = True
58-
self._groupby.grouper.mutated = True
59-
super().__init__(obj, *args, **kwargs)
60-
61-
corr = _dispatch("corr", other=None, pairwise=None)
62-
cov = _dispatch("cov", other=None, pairwise=None)
63-
64-
def _apply(
65-
self,
66-
func: Callable,
67-
require_min_periods: int = 0,
68-
floor: int = 1,
69-
is_weighted: bool = False,
70-
name: Optional[str] = None,
71-
use_numba_cache: bool = False,
72-
**kwargs,
73-
):
74-
"""
75-
Dispatch to apply; we are stripping all of the _apply kwargs and
76-
performing the original function call on the grouped object.
77-
"""
78-
kwargs.pop("floor", None)
79-
kwargs.pop("original_func", None)
80-
81-
# TODO: can we de-duplicate with _dispatch?
82-
def f(x, name=name, *args):
83-
x = self._shallow_copy(x)
84-
85-
if isinstance(name, str):
86-
return getattr(x, name)(*args, **kwargs)
87-
88-
return x.apply(name, *args, **kwargs)
89-
90-
return self._groupby.apply(f)
91-
92-
9328
def flex_binary_moment(arg1, arg2, f, pairwise=False):
9429

9530
if not (

pandas/core/window/expanding.py

+22-6
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,9 @@
44
from pandas.compat.numpy import function as nv
55
from pandas.util._decorators import Appender, Substitution, doc
66

7-
from pandas.core.window.common import WindowGroupByMixin, _doc_template, _shared_docs
8-
from pandas.core.window.rolling import RollingAndExpandingMixin
7+
from pandas.core.window.common import _doc_template, _shared_docs
8+
from pandas.core.window.indexers import ExpandingIndexer, GroupbyIndexer
9+
from pandas.core.window.rolling import BaseWindowGroupby, RollingAndExpandingMixin
910

1011

1112
class Expanding(RollingAndExpandingMixin):
@@ -253,11 +254,26 @@ def corr(self, other=None, pairwise=None, **kwargs):
253254
return super().corr(other=other, pairwise=pairwise, **kwargs)
254255

255256

256-
class ExpandingGroupby(WindowGroupByMixin, Expanding):
257+
class ExpandingGroupby(BaseWindowGroupby, Expanding):
257258
"""
258259
Provide a expanding groupby implementation.
259260
"""
260261

261-
@property
262-
def _constructor(self):
263-
return Expanding
262+
def _get_window_indexer(self, window: int) -> GroupbyIndexer:
263+
"""
264+
Return an indexer class that will compute the window start and end bounds
265+
266+
Parameters
267+
----------
268+
window : int
269+
window size for FixedWindowIndexer (unused)
270+
271+
Returns
272+
-------
273+
GroupbyIndexer
274+
"""
275+
window_indexer = GroupbyIndexer(
276+
groupby_indicies=self._groupby.indices,
277+
window_indexer=ExpandingIndexer,
278+
)
279+
return window_indexer

pandas/core/window/indexers.py

+21-9
Original file line numberDiff line numberDiff line change
@@ -259,26 +259,38 @@ def get_window_bounds(
259259
return start, end
260260

261261

262-
class GroupbyRollingIndexer(BaseIndexer):
262+
class GroupbyIndexer(BaseIndexer):
263263
"""Calculate bounds to compute groupby rolling, mimicking df.groupby().rolling()"""
264264

265265
def __init__(
266266
self,
267-
index_array: Optional[np.ndarray],
268-
window_size: int,
269-
groupby_indicies: Dict,
270-
rolling_indexer: Type[BaseIndexer],
271-
indexer_kwargs: Optional[Dict],
267+
index_array: Optional[np.ndarray] = None,
268+
window_size: int = 0,
269+
groupby_indicies: Optional[Dict] = None,
270+
window_indexer: Type[BaseIndexer] = BaseIndexer,
271+
indexer_kwargs: Optional[Dict] = None,
272272
**kwargs,
273273
):
274274
"""
275275
Parameters
276276
----------
277+
index_array : np.ndarray or None
278+
np.ndarray of the index of the original object that we are performing
279+
a chained groupby operation over. This index has been pre-sorted relative to
280+
the groups
281+
window_size : int
282+
window size during the windowing operation
283+
groupby_indicies : dict or None
284+
dict of {group label: [positional index of rows belonging to the group]}
285+
window_indexer : BaseIndexer
286+
BaseIndexer class determining the start and end bounds of each group
287+
indexer_kwargs : dict or None
288+
Custom kwargs to be passed to window_indexer
277289
**kwargs :
278290
keyword arguments that will be available when get_window_bounds is called
279291
"""
280-
self.groupby_indicies = groupby_indicies
281-
self.rolling_indexer = rolling_indexer
292+
self.groupby_indicies = groupby_indicies or {}
293+
self.window_indexer = window_indexer
282294
self.indexer_kwargs = indexer_kwargs or {}
283295
super().__init__(
284296
index_array, self.indexer_kwargs.pop("window_size", window_size), **kwargs
@@ -303,7 +315,7 @@ def get_window_bounds(
303315
index_array = self.index_array.take(ensure_platform_int(indices))
304316
else:
305317
index_array = self.index_array
306-
indexer = self.rolling_indexer(
318+
indexer = self.window_indexer(
307319
index_array=index_array,
308320
window_size=self.window_size,
309321
**self.indexer_kwargs,

0 commit comments

Comments
 (0)