Skip to content

DEPR: min_periods=None behavior for Rolling.count #36649

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 20 commits into from
Oct 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.2.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,7 @@ Deprecations
- Deprecated indexing :class:`DataFrame` rows with datetime-like strings ``df[string]``, use ``df.loc[string]`` instead (:issue:`36179`)
- Deprecated casting an object-dtype index of ``datetime`` objects to :class:`DatetimeIndex` in the :class:`Series` constructor (:issue:`23598`)
- Deprecated :meth:`Index.is_all_dates` (:issue:`27744`)
- :meth:`Rolling.count` with ``min_periods=None`` will default to the size of the window in a future version (:issue:`31302`)

.. ---------------------------------------------------------------------------

Expand Down Expand Up @@ -404,6 +405,7 @@ Groupby/resample/rolling
- Bug in :meth:`DataFrame.groupby` where the column index name was not always maintained for ``any``, ``all``, ``bfill``, ``ffill``, ``shift`` (:issue:`29764`)
- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
- Bug in :meth:`Rolling.sum()` returned wrong values when dtypes were mixed between float and integer and axis was equal to one (:issue:`20649`, :issue:`35596`)
- Bug in :meth:`Rolling.count` returned ``np.nan`` with :class:`pandas.api.indexers.FixedForwardWindowIndexer` as window, ``min_periods=0`` and only missing values in window (:issue:`35579`)

Reshaping
^^^^^^^^^
Expand Down
56 changes: 0 additions & 56 deletions pandas/_libs/window/aggregations.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -89,62 +89,6 @@ cdef bint is_monotonic_start_end_bounds(
# Physical description: 366 p.
# Series: Prentice-Hall Series in Automatic Computation

# ----------------------------------------------------------------------
# Rolling count
# this implementation is only used when the index is not None, in other words, for freq-aware windows


def roll_count(
    ndarray[float64_t] values,
    ndarray[int64_t] start,
    ndarray[int64_t] end,
    int64_t minp,
):
    """
    Count the non-NaN observations inside each window of ``values``.

    Window ``i`` covers the half-open slice ``values[start[i]:end[i]]``.

    Parameters
    ----------
    values : ndarray[float64_t]
        Data to count over; entries rejected by ``notnan`` are not counted.
    start : ndarray[int64_t]
        Inclusive start position of every window.
    end : ndarray[int64_t]
        Exclusive end position of every window.
    minp : int64_t
        Minimum count a window needs; windows with a smaller count
        yield NaN.

    Returns
    -------
    ndarray[float64_t]
        One entry per element of ``values``: the window's count, or NaN
        when the count is below ``minp``.

    Notes
    -----
    The count is carried over from one window to the next. If ``start``
    or ``end`` ever decreases, the corresponding ``range`` below is empty,
    so the running count is only correct for bounds that never move
    backwards.
    """
    cdef:
        float64_t val, count_x = 0.0
        int64_t s, e, N = len(values)
        Py_ssize_t i, j
        ndarray[float64_t] output

    output = np.empty(N, dtype=float)

    with nogil:

        for i in range(0, N):
            s = start[i]
            e = end[i]

            if i == 0:

                # setup: count the first window from scratch
                count_x = 0.0
                for j in range(s, e):
                    val = values[j]
                    if notnan(val):
                        count_x += 1.0

            else:

                # calculate deletes: positions that left the window on the
                # left side since the previous iteration
                for j in range(start[i - 1], s):
                    val = values[j]
                    if notnan(val):
                        count_x -= 1.0

                # calculate adds: positions that entered the window on the
                # right side since the previous iteration
                for j in range(end[i - 1], e):
                    val = values[j]
                    if notnan(val):
                        count_x += 1.0

            # mask windows that do not reach the required minimum
            if count_x >= minp:
                output[i] = count_x
            else:
                output[i] = NaN

    return output


# ----------------------------------------------------------------------
# Rolling sum

Expand Down
66 changes: 26 additions & 40 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
Type,
Union,
)
import warnings

import numpy as np

Expand Down Expand Up @@ -469,31 +470,39 @@ def _get_window_indexer(self, window: int) -> BaseIndexer:
return VariableWindowIndexer(index_array=self._on.asi8, window_size=window)
return FixedWindowIndexer(window_size=window)

def _apply_series(self, homogeneous_func: Callable[..., ArrayLike]) -> "Series":
def _apply_series(
self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None
) -> "Series":
"""
Series version of _apply_blockwise
"""
obj = self._create_data(self._selected_obj)

try:
values = self._prep_values(obj.values)
# GH 12541: Special case for count where we support date-like types
input = obj.values if name != "count" else notna(obj.values).astype(int)
values = self._prep_values(input)
except (TypeError, NotImplementedError) as err:
raise DataError("No numeric types to aggregate") from err

result = homogeneous_func(values)
return obj._constructor(result, index=obj.index, name=obj.name)

def _apply_blockwise(
self, homogeneous_func: Callable[..., ArrayLike]
self, homogeneous_func: Callable[..., ArrayLike], name: Optional[str] = None
) -> FrameOrSeriesUnion:
"""
Apply the given function to the DataFrame broken down into homogeneous
sub-frames.
"""
if self._selected_obj.ndim == 1:
return self._apply_series(homogeneous_func)
return self._apply_series(homogeneous_func, name)

obj = self._create_data(self._selected_obj)
if name == "count":
# GH 12541: Special case for count where we support date-like types
obj = notna(obj).astype(int)
obj._mgr = obj._mgr.consolidate()
mgr = obj._mgr

def hfunc(bvalues: ArrayLike) -> ArrayLike:
Expand Down Expand Up @@ -606,7 +615,7 @@ def calc(x):

return result

return self._apply_blockwise(homogeneous_func)
return self._apply_blockwise(homogeneous_func, name)

def aggregate(self, func, *args, **kwargs):
result, how = self._aggregate(func, *args, **kwargs)
Expand Down Expand Up @@ -1265,33 +1274,8 @@ class RollingAndExpandingMixin(BaseWindow):
)

def count(self):
# GH 32865. Using count with custom BaseIndexer subclass
# implementations shouldn't end up here
assert not isinstance(self.window, BaseIndexer)

obj = self._create_data(self._selected_obj)

def hfunc(values: np.ndarray) -> np.ndarray:
result = notna(values)
result = result.astype(int)
frame = type(obj)(result.T)
result = self._constructor(
frame,
window=self._get_window(),
min_periods=self.min_periods or 0,
center=self.center,
axis=self.axis,
closed=self.closed,
).sum()
return result.values.T

new_mgr = obj._mgr.apply(hfunc)
out = obj._constructor(new_mgr)
if obj.ndim == 1:
out.name = obj.name
else:
self._insert_on_column(out, obj)
return out
window_func = self._get_cython_func_type("roll_sum")
return self._apply(window_func, center=self.center, name="count")

_shared_docs["apply"] = dedent(
r"""
Expand Down Expand Up @@ -2050,14 +2034,16 @@ def aggregate(self, func, *args, **kwargs):
@Substitution(name="rolling")
@Appender(_shared_docs["count"])
def count(self):

# different impl for freq counting
# GH 32865. Use a custom count function implementation
# when using a BaseIndexer subclass as a window
if self.is_freq_type or isinstance(self.window, BaseIndexer):
window_func = self._get_roll_func("roll_count")
return self._apply(window_func, center=self.center, name="count")

if self.min_periods is None:
warnings.warn(
(
"min_periods=None will default to the size of window "
"consistent with other methods in a future version. "
"Specify min_periods=0 instead."
),
FutureWarning,
)
self.min_periods = 0
return super().count()

@Substitution(name="rolling")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -452,7 +452,7 @@ def test_moment_functions_zero_length():
df2_expected = df2

functions = [
lambda x: x.rolling(window=10).count(),
lambda x: x.rolling(window=10, min_periods=0).count(),
lambda x: x.rolling(window=10, min_periods=5).cov(x, pairwise=False),
lambda x: x.rolling(window=10, min_periods=5).corr(x, pairwise=False),
lambda x: x.rolling(window=10, min_periods=5).max(),
Expand Down
14 changes: 12 additions & 2 deletions pandas/tests/window/moments/test_moments_rolling_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
[
[np.mean, "mean", {}],
[np.nansum, "sum", {}],
[lambda x: np.isfinite(x).astype(float).sum(), "count", {}],
pytest.param(
lambda x: np.isfinite(x).astype(float).sum(),
"count",
{},
marks=pytest.mark.filterwarnings("ignore:min_periods:FutureWarning"),
),
[np.median, "median", {}],
[np.min, "min", {}],
[np.max, "max", {}],
Expand All @@ -33,7 +38,12 @@ def test_series(series, compare_func, roll_func, kwargs):
[
[np.mean, "mean", {}],
[np.nansum, "sum", {}],
[lambda x: np.isfinite(x).astype(float).sum(), "count", {}],
pytest.param(
lambda x: np.isfinite(x).astype(float).sum(),
"count",
{},
marks=pytest.mark.filterwarnings("ignore:min_periods:FutureWarning"),
),
[np.median, "median", {}],
[np.min, "min", {}],
[np.max, "max", {}],
Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/window/test_base_indexer.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ def get_window_bounds(self, num_values, min_periods, center, closed):
),
],
)
@pytest.mark.filterwarnings("ignore:min_periods:FutureWarning")
def test_rolling_forward_window(constructor, func, np_func, expected, np_kwargs):
# GH 32865
values = np.arange(10.0)
Expand Down Expand Up @@ -253,3 +254,12 @@ def test_non_fixed_variable_window_indexer(closed, expected_data):
result = df.rolling(indexer, closed=closed).sum()
expected = DataFrame(expected_data, index=index)
tm.assert_frame_equal(result, expected)


def test_fixed_forward_indexer_count():
    """
    Check ``count`` over a fixed forward-looking window with ``min_periods=0``.

    Windows that hold only missing values must report a count of 0.0.
    """
    # GH: 35579
    frame = DataFrame({"b": [None, None, None, 7]})
    expected = DataFrame({"b": [0.0, 0.0, 1.0, 1.0]})

    forward_window = FixedForwardWindowIndexer(window_size=2)
    roller = frame.rolling(window=forward_window, min_periods=0)
    result = roller.count()

    tm.assert_frame_equal(result, expected)
Loading