Skip to content

ENH: Add method='table' for EWM.mean #42273

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 4 commits into from
Jun 29, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -296,5 +296,8 @@ def time_apply(self, method):
table_method_func, raw=True, engine="numba"
)

def time_ewm_mean(self, method):
self.df.ewm(1, method=method).mean(engine="numba")


from .pandas_vb_common import setup # noqa: F401 isort:skip
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v1.3.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ For example:
Other enhancements
^^^^^^^^^^^^^^^^^^

- :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, and :meth:`Series.expanding` now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`15095`, :issue:`38995`)
- :meth:`DataFrame.rolling`, :meth:`Series.rolling`, :meth:`DataFrame.expanding`, :meth:`Series.ewm`, :meth:`DataFrame.ewm`, :meth:`Series.expanding` now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview <window.overview>` for performance and functional benefits (:issue:`15095`, :issue:`38995`, :issue:`42273`)
- :class:`.ExponentialMovingWindow` now support a ``online`` method that can perform ``mean`` calculations in an online fashion. See :ref:`Window Overview <window.overview>` (:issue:`41673`)
- Added :meth:`MultiIndex.dtypes` (:issue:`37062`)
- Added ``end`` and ``end_day`` options for the ``origin`` argument in :meth:`DataFrame.resample` (:issue:`37804`)
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10905,6 +10905,7 @@ def ewm(
ignore_na: bool_t = False,
axis: Axis = 0,
times: str | np.ndarray | FrameOrSeries | None = None,
method: str = "single",
) -> ExponentialMovingWindow:
axis = self._get_axis_number(axis)
# error: Value of type variable "FrameOrSeries" of "ExponentialMovingWindow"
Expand All @@ -10920,6 +10921,7 @@ def ewm(
ignore_na=ignore_na,
axis=axis,
times=times,
method=method,
)

# ----------------------------------------------------------------------
Expand Down
34 changes: 28 additions & 6 deletions pandas/core/window/ewm.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,10 @@
ExponentialMovingWindowIndexer,
GroupbyIndexer,
)
from pandas.core.window.numba_ import generate_numba_ewma_func
from pandas.core.window.numba_ import (
generate_ewma_numba_table_func,
generate_numba_ewma_func,
)
from pandas.core.window.online import (
EWMMeanState,
generate_online_numba_ewma_func,
Expand Down Expand Up @@ -200,6 +203,16 @@ class ExponentialMovingWindow(BaseWindow):
If 1-D array like, a sequence with the same shape as the observations.

Only applicable to ``mean()``.
method : str {'single', 'table'}, default 'single'
Execute the rolling operation per single column or row (``'single'``)
or over the entire object (``'table'``).

This argument is only implemented when specifying ``engine='numba'``
in the method call.

Only applicable to ``mean()``

.. versionadded:: 1.3.0

Returns
-------
Expand Down Expand Up @@ -258,6 +271,7 @@ class ExponentialMovingWindow(BaseWindow):
"ignore_na",
"axis",
"times",
"method",
]

def __init__(
Expand All @@ -272,6 +286,7 @@ def __init__(
ignore_na: bool = False,
axis: Axis = 0,
times: str | np.ndarray | FrameOrSeries | None = None,
method: str = "single",
*,
selection=None,
):
Expand All @@ -281,7 +296,7 @@ def __init__(
on=None,
center=False,
closed=None,
method="single",
method=method,
axis=axis,
selection=selection,
)
Expand Down Expand Up @@ -437,12 +452,19 @@ def aggregate(self, func, *args, **kwargs):
)
def mean(self, *args, engine=None, engine_kwargs=None, **kwargs):
if maybe_use_numba(engine):
ewma_func = generate_numba_ewma_func(
engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas
)
if self.method == "single":
ewma_func = generate_numba_ewma_func(
engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas
)
numba_cache_key = (lambda x: x, "ewma")
else:
ewma_func = generate_ewma_numba_table_func(
engine_kwargs, self._com, self.adjust, self.ignore_na, self._deltas
)
numba_cache_key = (lambda x: x, "ewma_table")
return self._apply(
ewma_func,
numba_cache_key=(lambda x: x, "ewma"),
numba_cache_key=numba_cache_key,
)
elif engine in ("cython", None):
if engine_kwargs is not None:
Expand Down
80 changes: 80 additions & 0 deletions pandas/core/window/numba_.py
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,83 @@ def nan_agg_with_axis(table):
return result

return nan_agg_with_axis


def generate_ewma_numba_table_func(
engine_kwargs: dict[str, bool] | None,
com: float,
adjust: bool,
ignore_na: bool,
deltas: np.ndarray,
):
"""
Generate a numba jitted ewma function applied table wise specified
by values from engine_kwargs.

Parameters
----------
engine_kwargs : dict
dictionary of arguments to be passed into numba.jit
com : float
adjust : bool
ignore_na : bool
deltas : numpy.ndarray

Returns
-------
Numba function
"""
nopython, nogil, parallel = get_jit_arguments(engine_kwargs)

cache_key = (lambda x: x, "ewma_table")
if cache_key in NUMBA_FUNC_CACHE:
return NUMBA_FUNC_CACHE[cache_key]

numba = import_optional_dependency("numba")

@numba.jit(nopython=nopython, nogil=nogil, parallel=parallel)
def ewma_table(
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@stuartarchibald if you have time, could you review if this function looks if I have any bad practices in this function. Some context:

  1. Generally this is run with @numba.jit(nopython=True, nogil=False, parallel=False)
  2. This function has variables from a global scope com, adjust, ignore_na, deltas
  3. I cache this function in a dictionary this first time it runs (in NUMBA_FUNC_CACHE) and later calls will return the cached fuction.

When using this function, sometimes I get a segfault:

(pandas-dev) matthewroeschke@x86_64-apple-darwin13 pandas-mroeschke % ipython
Python 3.8.6 | packaged by conda-forge | (default, Dec 26 2020, 04:50:20)
Type 'copyright', 'credits' or 'license' for more information
IPython 7.25.0 -- An enhanced Interactive Python. Type '?' for help.

In [1]: df = DataFrame(np.random.rand(3, 30))

In [2]: ewm = df.ewm(com=1, method="table")

In [3]: ewm.mean(engine="numba")
Out[3]:
         0         1         2         3         4   ...        25        26        27        28        29
0  0.770949  0.396205  0.728331  0.417464  0.715142  ...  0.237903  0.383182  0.021471  0.837933  0.590543
1  0.417231  0.369916  0.793127  0.961175  0.531131  ...  0.145222  0.152615  0.714216  0.515145  0.238283
2  0.441215  0.220925  0.868065  0.736689  0.858115  ...  0.165413  0.977209  0.549496  0.472534  0.555430

[3 rows x 30 columns]

In [4]: ewm.mean(engine="numba")
Out[4]:
         0         1         2         3         4   ...        25        26        27        28        29
0  0.770949  0.396205  0.728331  0.417464  0.715142  ...  0.237903  0.383182  0.021471  0.837933  0.590543
1  0.417231  0.369916  0.793127  0.417464  0.531131  ...  0.145222  0.152615  0.714216  0.515145  0.238283
2  0.441215  0.220925  0.868065  0.417464  0.858115  ...  0.165413  0.977209  0.549496  0.472534  0.555430

[3 rows x 30 columns]

In [5]: ewm.mean(engine="numba")
Out[5]:
         0         1         2         3         4   ...        25        26        27        28        29
0  0.770949  0.396205  0.728331  0.417464  0.715142  ...  0.237903  0.383182  0.021471  0.837933  0.590543
1  0.417231  0.369916  0.793127  0.961175  0.531131  ...  0.145222  0.152615  0.714216  0.515145  0.238283
2  0.441215  0.220925  0.868065  0.736689  0.858115  ...  0.165413  0.977209  0.549496  0.472534  0.555430

[3 rows x 30 columns]

In [6]: ewm.mean(engine="numba")
python(3345,0x111f53dc0) malloc: Incorrect checksum for freed object 0x7fd3255c55a8: probably modified after being freed.
Corrupt value: 0x3ff0000000000000
python(3345,0x111f53dc0) malloc: *** set a breakpoint in malloc_error_break to debug
zsh: abort      ipython

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mroeschke I'll take a look. Did you already try running it a) without the @jit decorator and b) with @jit(boundscheck=True) ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry for the noise @stuartarchibald. I found a bug in my algorithm. I should have tested without the @jit decorator first.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@mroeschke no worries. Numba also has this environment variable to switch off JIT compilation without needing to modify code: https://numba.readthedocs.io/en/stable/reference/envvars.html#envvar-NUMBA_DISABLE_JIT, it might be useful for helping with development/testing.

values: np.ndarray,
begin: np.ndarray,
end: np.ndarray,
minimum_periods: int,
) -> np.ndarray:
alpha = 1.0 / (1.0 + com)
old_wt_factor = 1.0 - alpha
new_wt = 1.0 if adjust else alpha
old_wt = np.ones(values.shape[0])

result = np.empty(values.shape)
weighted_avg = values[0].copy()
nobs = (~np.isnan(weighted_avg)).astype(np.int64)
result[0] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)

for i in range(1, len(values)):
cur = values[i]
is_observations = ~np.isnan(cur)
nobs += is_observations.astype(np.int64)
for j in numba.prange(len(cur)):
if not np.isnan(weighted_avg[j]):
if is_observations[j] or not ignore_na:

# note that len(deltas) = len(vals) - 1 and deltas[i] is to be
# used in conjunction with vals[i+1]
old_wt[j] *= old_wt_factor ** deltas[j - 1]
if is_observations[j]:
# avoid numerical errors on constant series
if weighted_avg[j] != cur[j]:
weighted_avg[j] = (
(old_wt[j] * weighted_avg[j]) + (new_wt * cur[j])
) / (old_wt[j] + new_wt)
if adjust:
old_wt[j] += new_wt
else:
old_wt[j] = 1.0
elif is_observations[j]:
weighted_avg[j] = cur[j]

result[i] = np.where(nobs >= minimum_periods, weighted_avg, np.nan)

return result

return ewma_table
13 changes: 13 additions & 0 deletions pandas/tests/window/test_numba.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,3 +304,16 @@ def test_table_method_expanding_methods(
engine_kwargs=engine_kwargs, engine="numba"
)
tm.assert_frame_equal(result, expected)

def test_table_method_ewm(self, axis, nogil, parallel, nopython):
engine_kwargs = {"nogil": nogil, "parallel": parallel, "nopython": nopython}

df = DataFrame(np.eye(3))

result = df.ewm(com=1, method="table", axis=axis).mean(
engine_kwargs=engine_kwargs, engine="numba"
)
expected = df.ewm(com=1, method="single", axis=axis).mean(
engine_kwargs=engine_kwargs, engine="numba"
)
tm.assert_frame_equal(result, expected)