
BUG: rolling_apply(..., center=True) should not append NaNs #8275

Merged (1 commit) on Sep 17, 2014
3 changes: 2 additions & 1 deletion doc/source/v0.15.0.txt
@@ -88,7 +88,8 @@ API changes
:func:`rolling_window`, and :func:`rolling_apply` with ``center=True`` previously would return a result of the same
structure as the input ``arg`` with ``NaN`` in the final ``(window-1)/2`` entries.
Now the final ``(window-1)/2`` entries of the result are calculated as if the input ``arg`` were followed
by ``(window-1)/2`` ``NaN`` values. (:issue:`7925`)
by ``(window-1)/2`` ``NaN`` values (or with shrinking windows, in the case of :func:`rolling_apply`).
(:issue:`7925`, :issue:`8269`)

Prior behavior (note final value is ``NaN``):

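The collapsed prior/new behaviour examples aside, here is a minimal sketch of the change using the 0.15.0-era pd.rolling_apply API (min_periods=1 so the truncated edge windows are not masked by the default min_periods=window); the expected value in the comment follows from the description above, not from the PR itself:

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])

# window=3, center=True -> the final (3 - 1) / 2 = 1 entry is affected.
# Previously result[4] was always NaN; under the new behaviour it is
# computed from the truncated tail window [4.0, 5.0], giving 4.5.
result = pd.rolling_apply(s, 3, np.mean, min_periods=1, center=True)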
33 changes: 19 additions & 14 deletions pandas/algos.pyx
@@ -1846,8 +1846,9 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int win,

return output

def roll_generic(ndarray[float64_t, cast=True] input, int win,
int minp, object func, object args, object kwargs):
def roll_generic(ndarray[float64_t, cast=True] input,
int win, int minp, int offset,
object func, object args, object kwargs):
cdef ndarray[double_t] output, counts, bufarr
cdef Py_ssize_t i, n
cdef float64_t *buf
@@ -1856,37 +1857,41 @@ def roll_generic(ndarray[float64_t, cast=True] input, int win,
if not input.flags.c_contiguous:
input = input.copy('C')

buf = <float64_t*> input.data

n = len(input)
if n == 0:
return input

minp = _check_minp(win, minp, n, floor=0)
output = np.empty(n, dtype=float)
counts = roll_sum(np.isfinite(input).astype(float), win, minp)
counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), np.array([0.] * offset))), win, minp)[offset:]

bufarr = np.empty(win, dtype=float)
oldbuf = <float64_t*> bufarr.data

n = len(input)
for i from 0 <= i < int_min(win, n):
# truncated windows at the beginning, through first full-length window
for i from 0 <= i < (int_min(win, n) - offset):
if counts[i] >= minp:
output[i] = func(input[int_max(i - win + 1, 0) : i + 1], *args,
**kwargs)
output[i] = func(input[0 : (i + offset + 1)], *args, **kwargs)
else:
output[i] = NaN

for i from win <= i < n:
# remaining full-length windows
buf = <float64_t*> input.data
bufarr = np.empty(win, dtype=float)
oldbuf = <float64_t*> bufarr.data
for i from (win - offset) <= i < (n - offset):
buf = buf + 1
bufarr.data = <char*> buf
if counts[i] >= minp:
output[i] = func(bufarr, *args, **kwargs)
else:
output[i] = NaN

bufarr.data = <char*> oldbuf

# truncated windows at the end
for i from int_max(n - offset, 0) <= i < n:
if counts[i] >= minp:
output[i] = func(input[int_max(i + offset - win + 1, 0) : n], *args, **kwargs)
else:
output[i] = NaN

return output


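As a rough guide to the Cython above, the offset-shifted windowing can be sketched in plain Python. This is a simplified, editorial sketch: roll_generic_sketch is an invented name, and the minp/_check_minp handling is reduced to a finite-value count.

import numpy as np

def roll_generic_sketch(values, win, minp, offset, func):
    # For output position i the window covers
    # values[max(i + offset - win + 1, 0) : min(i + offset + 1, n)],
    # i.e. the usual trailing window shifted forward by `offset` and
    # truncated at both ends rather than padded with NaN.
    n = len(values)
    out = np.full(n, np.nan)
    for i in range(n):
        start = max(i + offset - win + 1, 0)
        end = min(i + offset + 1, n)
        window = values[start:end]
        if np.isfinite(window).sum() >= minp:  # mirrors the counts[i] >= minp test
            out[i] = func(window)
    return out

With offset = 0 this reduces to the old trailing-window behaviour; with offset = (win - 1) // 2 the three loops in the Cython correspond to the clamped start, the unclamped middle, and the clamped end of this single loop.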
5 changes: 3 additions & 2 deletions pandas/stats/moments.py
@@ -738,11 +738,12 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None,
frequency by resampling the data. This is done with the default parameters
of :meth:`~pandas.Series.resample` (i.e. using the `mean`).
"""
offset = int((window - 1) / 2.) if center else 0
def call_cython(arg, window, minp, args, kwargs):
minp = _use_window(minp, window)
return algos.roll_generic(arg, window, minp, func, args, kwargs)
return algos.roll_generic(arg, window, minp, offset, func, args, kwargs)
return _rolling_moment(arg, window, call_cython, min_periods, freq=freq,
center=center, args=args, kwargs=kwargs)
center=False, args=args, kwargs=kwargs)
Contributor (stahlous):

So, roll_generic is hard-coded to always return "centered" results, correct? It looks like the center argument should be removed from the rolling_apply function declaration since you're hard-coding center=False in your call to _rolling_moment (and the API change should probably be noted).

Otherwise, this looks OK to me.

Contributor Author:

No, it is not hard-coded to return centered results; center affects the value of offset.


Contributor (stahlous):

Ah. Duh. Looks good then.



def rolling_window(arg, window=None, win_type=None, min_periods=None,
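To make the exchange above concrete: center only determines the offset handed to roll_generic, whose output is then already centered, which is why the call to _rolling_moment passes center=False (a second shift would double-center the result). A small illustration of the offset arithmetic; _centering_offset is an invented helper name, not code from the PR:

def _centering_offset(window, center):
    # Mirrors `offset = int((window - 1) / 2.) if center else 0` in rolling_apply.
    return int((window - 1) / 2.) if center else 0

assert _centering_offset(5, center=True) == 2   # two truncated windows at the tail
assert _centering_offset(4, center=True) == 1   # even windows take one more past value
assert _centering_offset(7, center=False) == 0  # uncentered: plain trailing windows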
129 changes: 94 additions & 35 deletions pandas/stats/tests/test_moments.py
@@ -5,6 +5,7 @@
from datetime import datetime
from numpy.random import randn
import numpy as np
from distutils.version import LooseVersion

from pandas import Series, DataFrame, Panel, bdate_range, isnull, notnull
from pandas.util.testing import (
@@ -877,6 +878,45 @@ def _ewma(s, com, min_periods, adjust, ignore_na):
var_debiasing_factors=lambda x: _variance_debiasing_factors(x, com=com, adjust=adjust, ignore_na=ignore_na))

def test_expanding_consistency(self):
base_functions = [
(mom.expanding_count, lambda v: Series(v).count(), None),
(mom.expanding_max, lambda v: Series(v).max(), None),
(mom.expanding_min, lambda v: Series(v).min(), None),
(mom.expanding_sum, lambda v: Series(v).sum(), None),
(mom.expanding_mean, lambda v: Series(v).mean(), None),
(mom.expanding_std, lambda v: Series(v).std(), 1),
(mom.expanding_cov, lambda v: Series(v).cov(Series(v)), None),
(mom.expanding_corr, lambda v: Series(v).corr(Series(v)), None),
(mom.expanding_var, lambda v: Series(v).var(), 1),
#(mom.expanding_skew, lambda v: Series(v).skew(), 3), # restore once GH 8086 is fixed
#(mom.expanding_kurt, lambda v: Series(v).kurt(), 4), # restore once GH 8086 is fixed
#(lambda x, min_periods: mom.expanding_quantile(x, 0.3, min_periods=min_periods),
# lambda v: Series(v).quantile(0.3), None), # restore once GH 8084 is fixed
(mom.expanding_median, lambda v: Series(v).median(), None),
(mom.expanding_max, np.nanmax, 1),
(mom.expanding_min, np.nanmin, 1),
(mom.expanding_sum, np.nansum, 1),
]
if np.__version__ >= LooseVersion('1.8.0'):
base_functions += [
(mom.expanding_mean, np.nanmean, 1),
(mom.expanding_std, lambda v: np.nanstd(v, ddof=1), 1),
(mom.expanding_var, lambda v: np.nanvar(v, ddof=1), 1),
]
if np.__version__ >= LooseVersion('1.9.0'):
base_functions += [
(mom.expanding_median, np.nanmedian, 1),
]
no_nan_functions = [
(mom.expanding_max, np.max, None),
(mom.expanding_min, np.min, None),
(mom.expanding_sum, np.sum, None),
(mom.expanding_mean, np.mean, None),
(mom.expanding_std, lambda v: np.std(v, ddof=1), 1),
(mom.expanding_var, lambda v: np.var(v, ddof=1), 1),
(mom.expanding_median, np.median, None),
]

for min_periods in [0, 1, 2, 3, 4]:

# test consistency between different expanding_* moments
@@ -895,25 +935,15 @@ def test_expanding_consistency(self):
var_debiasing_factors=lambda x: mom.expanding_count(x) / (mom.expanding_count(x) - 1.).replace(0., np.nan)
)

# test consistency between expanding_xyz() and expanding_apply of Series/DataFrame.xyz()
# test consistency between expanding_xyz() and either (a) expanding_apply of Series.xyz(),
# or (b) expanding_apply of np.nanxyz()
for x in self._test_data():
assert_equal = assert_series_equal if isinstance(x, Series) else assert_frame_equal
for (expanding_f, f, require_min_periods) in [
(mom.expanding_count, lambda v: Series(v).count(), None),
(mom.expanding_max, lambda v: Series(v).max(), None),
(mom.expanding_min, lambda v: Series(v).min(), None),
(mom.expanding_sum, lambda v: Series(v).sum(), None),
(mom.expanding_mean, lambda v: Series(v).mean(), None),
(mom.expanding_std, lambda v: Series(v).std(), 1),
(mom.expanding_cov, lambda v: Series(v).cov(Series(v)), None),
(mom.expanding_corr, lambda v: Series(v).corr(Series(v)), None),
(mom.expanding_var, lambda v: Series(v).var(), 1),
#(mom.expanding_skew, lambda v: Series(v).skew(), 3), # restore once GH 8086 is fixed
#(mom.expanding_kurt, lambda v: Series(v).kurt(), 4), # restore once GH 8086 is fixed
#(lambda x, min_periods: mom.expanding_quantile(x, 0.3, min_periods=min_periods),
# lambda v: Series(v).quantile(0.3), None), # restore once GH 8084 is fixed
(mom.expanding_median, lambda v: Series(v).median(), None),
]:
functions = base_functions
# GH 8269
if x.notnull().all().all():
functions = base_functions + no_nan_functions
for (expanding_f, f, require_min_periods) in functions:
if require_min_periods and (min_periods is not None) and (min_periods < require_min_periods):
continue

@@ -938,7 +968,46 @@ def test_expanding_consistency(self):
assert_panel_equal(expanding_f_result, expected)

def test_rolling_consistency(self):
for window in [1, 3, 10, 20]:
base_functions = [
(mom.rolling_count, lambda v: Series(v).count(), None),
(mom.rolling_max, lambda v: Series(v).max(), None),
(mom.rolling_min, lambda v: Series(v).min(), None),
(mom.rolling_sum, lambda v: Series(v).sum(), None),
(mom.rolling_mean, lambda v: Series(v).mean(), None),
(mom.rolling_std, lambda v: Series(v).std(), 1),
(mom.rolling_cov, lambda v: Series(v).cov(Series(v)), None),
(mom.rolling_corr, lambda v: Series(v).corr(Series(v)), None),
(mom.rolling_var, lambda v: Series(v).var(), 1),
#(mom.rolling_skew, lambda v: Series(v).skew(), 3), # restore once GH 8086 is fixed
# (mom.rolling_kurt, lambda v: Series(v).kurt(), 4), # restore once GH 8086 is fixed
#(lambda x, window, min_periods, center: mom.rolling_quantile(x, window, 0.3, min_periods=min_periods, center=center),
# lambda v: Series(v).quantile(0.3), None), # restore once GH 8084 is fixed
(mom.rolling_median, lambda v: Series(v).median(), None),
(mom.rolling_max, np.nanmax, 1),
(mom.rolling_min, np.nanmin, 1),
(mom.rolling_sum, np.nansum, 1),
]
if np.__version__ >= LooseVersion('1.8.0'):
base_functions += [
(mom.rolling_mean, np.nanmean, 1),
(mom.rolling_std, lambda v: np.nanstd(v, ddof=1), 1),
(mom.rolling_var, lambda v: np.nanvar(v, ddof=1), 1),
]
if np.__version__ >= LooseVersion('1.9.0'):
base_functions += [
(mom.rolling_median, np.nanmedian, 1),
]
no_nan_functions = [
(mom.rolling_max, np.max, None),
(mom.rolling_min, np.min, None),
(mom.rolling_sum, np.sum, None),
(mom.rolling_mean, np.mean, None),
(mom.rolling_std, lambda v: np.std(v, ddof=1), 1),
(mom.rolling_var, lambda v: np.var(v, ddof=1), 1),
(mom.rolling_median, np.median, None),
]

for window in [1, 2, 3, 10, 20]:
for min_periods in set([0, 1, 2, 3, 4, window]):
if min_periods and (min_periods > window):
continue
@@ -962,25 +1031,15 @@ def test_rolling_consistency(self):
(mom.rolling_count(x, window=window, center=center) - 1.).replace(0., np.nan)),
)

# test consistency between rolling_xyz and rolling_apply of Series/DataFrame.xyz
# test consistency between rolling_xyz() and either (a) rolling_apply of Series.xyz(),
# or (b) rolling_apply of np.nanxyz()
for x in self._test_data():
assert_equal = assert_series_equal if isinstance(x, Series) else assert_frame_equal
for (rolling_f, f, require_min_periods) in [
(mom.rolling_count, lambda v: Series(v).count(), None),
(mom.rolling_max, lambda v: Series(v).max(), None),
(mom.rolling_min, lambda v: Series(v).min(), None),
(mom.rolling_sum, lambda v: Series(v).sum(), None),
(mom.rolling_mean, lambda v: Series(v).mean(), None),
(mom.rolling_std, lambda v: Series(v).std(), 1),
(mom.rolling_cov, lambda v: Series(v).cov(Series(v)), None),
(mom.rolling_corr, lambda v: Series(v).corr(Series(v)), None),
(mom.rolling_var, lambda v: Series(v).var(), 1),
#(mom.rolling_skew, lambda v: Series(v).skew(), 3), # restore once GH 8086 is fixed
# (mom.rolling_kurt, lambda v: Series(v).kurt(), 4), # restore once GH 8086 is fixed
#(lambda x, window, min_periods, center: mom.rolling_quantile(x, window, 0.3, min_periods=min_periods, center=center),
# lambda v: Series(v).quantile(0.3), None), # restore once GH 8084 is fixed
(mom.rolling_median, lambda v: Series(v).median(), None),
]:
functions = base_functions
# GH 8269
if x.notnull().all().all():
functions = base_functions + no_nan_functions
for (rolling_f, f, require_min_periods) in functions:
if require_min_periods and (min_periods is not None) and (min_periods < require_min_periods):
continue

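The added no_nan_functions branch amounts to the following kind of check (an editorial condensation, not the test code itself): on NaN-free input, a rolling_xyz result should agree with rolling_apply of the plain NumPy reduction, including the centered tail entries that previously came back as NaN (GH 8269).

import numpy as np
import pandas as pd
import pandas.stats.moments as mom
from pandas.util.testing import assert_series_equal

s = pd.Series(np.arange(10, dtype=float))  # no NaNs, so np.sum is a fair reference
expected = mom.rolling_apply(s, 3, np.sum, min_periods=1, center=True)
result = mom.rolling_sum(s, 3, min_periods=1, center=True)
assert_series_equal(result, expected)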