Skip to content

Commit b2671cc

Browse files
authored
PERF: Rolling/Expanding.cov/corr (#39591)
1 parent cb1486c commit b2671cc

File tree

5 files changed

+169
-86
lines changed

5 files changed

+169
-86
lines changed

asv_bench/benchmarks/rolling.py

+10
Original file line numberDiff line numberDiff line change
@@ -140,8 +140,11 @@ class Pairwise:
140140

141141
def setup(self, window, method, pairwise):
142142
N = 10 ** 4
143+
n_groups = 20
144+
groups = [i for _ in range(N // n_groups) for i in range(n_groups)]
143145
arr = np.random.random(N)
144146
self.df = pd.DataFrame(arr)
147+
self.df_group = pd.DataFrame({"A": groups, "B": arr}).groupby("A")
145148

146149
def time_pairwise(self, window, method, pairwise):
147150
if window is None:
@@ -150,6 +153,13 @@ def time_pairwise(self, window, method, pairwise):
150153
r = self.df.rolling(window=window)
151154
getattr(r, method)(self.df, pairwise=pairwise)
152155

156+
def time_groupby(self, window, method, pairwise):
157+
if window is None:
158+
r = self.df_group.expanding()
159+
else:
160+
r = self.df_group.rolling(window=window)
161+
getattr(r, method)(self.df, pairwise=pairwise)
162+
153163

154164
class Quantile:
155165
params = (

doc/source/whatsnew/v1.3.0.rst

+4-1
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,8 @@ Performance improvements
249249
- Performance improvement in :meth:`Series.mean` for nullable data types (:issue:`34814`)
250250
- Performance improvement in :meth:`Series.isin` for nullable data types (:issue:`38340`)
251251
- Performance improvement in :meth:`DataFrame.corr` for method=kendall (:issue:`28329`)
252-
- Performance improvement in :meth:`core.window.Rolling.corr` and :meth:`core.window.Rolling.cov` (:issue:`39388`)
252+
- Performance improvement in :meth:`core.window.rolling.Rolling.corr` and :meth:`core.window.rolling.Rolling.cov` (:issue:`39388`)
253+
- Performance improvement in :meth:`core.window.rolling.RollingGroupby.corr`, :meth:`core.window.rolling.RollingGroupby.cov`, :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` (:issue:`39591`)
253254

254255
.. ---------------------------------------------------------------------------
255256
@@ -407,6 +408,8 @@ Groupby/resample/rolling
407408
- Bug in :meth:`.Resampler.aggregate` and :meth:`DataFrame.transform` raising ``TypeError`` instead of ``SpecificationError`` when missing keys had mixed dtypes (:issue:`39025`)
408409
- Bug in :meth:`.DataFrameGroupBy.idxmin` and :meth:`.DataFrameGroupBy.idxmax` with ``ExtensionDtype`` columns (:issue:`38733`)
409410
- Bug in :meth:`Series.resample` would raise when the index was a :class:`PeriodIndex` consisting of ``NaT`` (:issue:`39227`)
411+
- Bug in :meth:`core.window.rolling.RollingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.corr` where the groupby column would return 0 instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`)
412+
- Bug in :meth:`core.window.expanding.ExpandingGroupby.corr` and :meth:`core.window.expanding.ExpandingGroupby.cov` where 1 would be returned instead of ``np.nan`` when providing ``other`` that was longer than each group (:issue:`39591`)
410413

411414
Reshaping
412415
^^^^^^^^^

pandas/core/window/ewm.py

+41-51
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@
33
import datetime
44
from functools import partial
55
from textwrap import dedent
6-
from typing import TYPE_CHECKING, Optional, Union
6+
from typing import Optional, Union
77
import warnings
88

99
import numpy as np
1010

1111
from pandas._libs.tslibs import Timedelta
1212
import pandas._libs.window.aggregations as window_aggregations
13-
from pandas._typing import FrameOrSeries, TimedeltaConvertibleTypes
13+
from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, TimedeltaConvertibleTypes
1414
from pandas.compat.numpy import function as nv
1515
from pandas.util._decorators import doc
1616

@@ -19,7 +19,7 @@
1919

2020
import pandas.core.common as common
2121
from pandas.core.util.numba_ import maybe_use_numba
22-
from pandas.core.window.common import flex_binary_moment, zsqrt
22+
from pandas.core.window.common import zsqrt
2323
from pandas.core.window.doc import (
2424
_shared_docs,
2525
args_compat,
@@ -35,10 +35,7 @@
3535
GroupbyIndexer,
3636
)
3737
from pandas.core.window.numba_ import generate_numba_groupby_ewma_func
38-
from pandas.core.window.rolling import BaseWindow, BaseWindowGroupby, dispatch
39-
40-
if TYPE_CHECKING:
41-
from pandas import Series
38+
from pandas.core.window.rolling import BaseWindow, BaseWindowGroupby
4239

4340

4441
def get_center_of_mass(
@@ -74,13 +71,20 @@ def get_center_of_mass(
7471
return float(comass)
7572

7673

77-
def wrap_result(obj: Series, result: np.ndarray) -> Series:
74+
def dispatch(name: str, *args, **kwargs):
7875
"""
79-
Wrap a single 1D result.
76+
Dispatch to groupby apply.
8077
"""
81-
obj = obj._selected_obj
8278

83-
return obj._constructor(result, obj.index, name=obj.name)
79+
def outer(self, *args, **kwargs):
80+
def f(x):
81+
x = self._shallow_copy(x, groupby=self._groupby)
82+
return getattr(x, name)(*args, **kwargs)
83+
84+
return self._groupby.apply(f)
85+
86+
outer.__name__ = name
87+
return outer
8488

8589

8690
class ExponentialMovingWindow(BaseWindow):
@@ -443,36 +447,30 @@ def var_func(values, begin, end, min_periods):
443447
)
444448
def cov(
445449
self,
446-
other: Optional[Union[np.ndarray, FrameOrSeries]] = None,
450+
other: Optional[FrameOrSeriesUnion] = None,
447451
pairwise: Optional[bool] = None,
448452
bias: bool = False,
449453
**kwargs,
450454
):
451-
if other is None:
452-
other = self._selected_obj
453-
# only default unset
454-
pairwise = True if pairwise is None else pairwise
455-
other = self._shallow_copy(other)
456-
457-
def _get_cov(X, Y):
458-
X = self._shallow_copy(X)
459-
Y = self._shallow_copy(Y)
460-
cov = window_aggregations.ewmcov(
461-
X._prep_values(),
455+
from pandas import Series
456+
457+
def cov_func(x, y):
458+
x_array = self._prep_values(x)
459+
y_array = self._prep_values(y)
460+
result = window_aggregations.ewmcov(
461+
x_array,
462462
np.array([0], dtype=np.int64),
463463
np.array([0], dtype=np.int64),
464464
self.min_periods,
465-
Y._prep_values(),
465+
y_array,
466466
self.com,
467467
self.adjust,
468468
self.ignore_na,
469469
bias,
470470
)
471-
return wrap_result(X, cov)
471+
return Series(result, index=x.index, name=x.name)
472472

473-
return flex_binary_moment(
474-
self._selected_obj, other._selected_obj, _get_cov, pairwise=bool(pairwise)
475-
)
473+
return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func)
476474

477475
@doc(
478476
template_header,
@@ -502,45 +500,37 @@ def _get_cov(X, Y):
502500
)
503501
def corr(
504502
self,
505-
other: Optional[Union[np.ndarray, FrameOrSeries]] = None,
503+
other: Optional[FrameOrSeriesUnion] = None,
506504
pairwise: Optional[bool] = None,
507505
**kwargs,
508506
):
509-
if other is None:
510-
other = self._selected_obj
511-
# only default unset
512-
pairwise = True if pairwise is None else pairwise
513-
other = self._shallow_copy(other)
507+
from pandas import Series
514508

515-
def _get_corr(X, Y):
516-
X = self._shallow_copy(X)
517-
Y = self._shallow_copy(Y)
509+
def cov_func(x, y):
510+
x_array = self._prep_values(x)
511+
y_array = self._prep_values(y)
518512

519-
def _cov(x, y):
513+
def _cov(X, Y):
520514
return window_aggregations.ewmcov(
521-
x,
515+
X,
522516
np.array([0], dtype=np.int64),
523517
np.array([0], dtype=np.int64),
524518
self.min_periods,
525-
y,
519+
Y,
526520
self.com,
527521
self.adjust,
528522
self.ignore_na,
529523
1,
530524
)
531525

532-
x_values = X._prep_values()
533-
y_values = Y._prep_values()
534526
with np.errstate(all="ignore"):
535-
cov = _cov(x_values, y_values)
536-
x_var = _cov(x_values, x_values)
537-
y_var = _cov(y_values, y_values)
538-
corr = cov / zsqrt(x_var * y_var)
539-
return wrap_result(X, corr)
540-
541-
return flex_binary_moment(
542-
self._selected_obj, other._selected_obj, _get_corr, pairwise=bool(pairwise)
543-
)
527+
cov = _cov(x_array, y_array)
528+
x_var = _cov(x_array, x_array)
529+
y_var = _cov(y_array, y_array)
530+
result = cov / zsqrt(x_var * y_var)
531+
return Series(result, index=x.index, name=x.name)
532+
533+
return self._apply_pairwise(self._selected_obj, other, pairwise, cov_func)
544534

545535

546536
class ExponentialMovingWindowGroupby(BaseWindowGroupby, ExponentialMovingWindow):

0 commit comments

Comments
 (0)