Skip to content

Backport PR #46567 on branch 1.4.x (BUG: groupby().rolling(freq) with monotonic dates within groups #46065 ) #46592

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v1.4.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ Bug fixes
- Fix some cases for subclasses that define their ``_constructor`` properties as general callables (:issue:`46018`)
- Fixed "longtable" formatting in :meth:`.Styler.to_latex` when ``column_format`` is given in extended format (:issue:`46037`)
- Fixed incorrect rendering in :meth:`.Styler.format` with ``hyperlinks="html"`` when the url contains a colon or other special characters (:issue:`46389`)
- Improved error message in :class:`~pandas.core.window.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`)
- Fixed :meth:`Groupby.rolling` with a frequency window that would raise a ``ValueError`` even if the datetimes within each group were monotonic (:issue:`46061`)

.. ---------------------------------------------------------------------------

Expand Down
52 changes: 30 additions & 22 deletions pandas/core/window/rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -837,12 +837,6 @@ def _gotitem(self, key, ndim, subset=None):
subset = self.obj.set_index(self._on)
return super()._gotitem(key, ndim, subset=subset)

def _validate_monotonic(self):
"""
Validate that "on" is monotonic; already validated at a higher level.
"""
pass


class Window(BaseWindow):
"""
Expand Down Expand Up @@ -1687,7 +1681,7 @@ def _validate(self):
or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex))
) and isinstance(self.window, (str, BaseOffset, timedelta)):

self._validate_monotonic()
self._validate_datetimelike_monotonic()

# this will raise ValueError on non-fixed freqs
try:
Expand All @@ -1712,18 +1706,24 @@ def _validate(self):
elif not is_integer(self.window) or self.window < 0:
raise ValueError("window must be an integer 0 or greater")

def _validate_monotonic(self):
def _validate_datetimelike_monotonic(self):
"""
Validate monotonic (increasing or decreasing).
Validate self._on is monotonic (increasing or decreasing) and has
no NaT values for frequency windows.
"""
if self._on.hasnans:
self._raise_monotonic_error("values must not have NaT")
if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing):
self._raise_monotonic_error()
self._raise_monotonic_error("values must be monotonic")

def _raise_monotonic_error(self):
formatted = self.on
if self.on is None:
formatted = "index"
raise ValueError(f"{formatted} must be monotonic")
def _raise_monotonic_error(self, msg: str):
on = self.on
if on is None:
if self.axis == 0:
on = "index"
else:
on = "column"
raise ValueError(f"{on} {msg}")

@doc(
_shared_docs["aggregate"],
Expand Down Expand Up @@ -2631,12 +2631,20 @@ def _get_window_indexer(self) -> GroupbyIndexer:
)
return window_indexer

def _validate_monotonic(self):
def _validate_datetimelike_monotonic(self):
"""
Validate that on is monotonic;
Validate that each group in self._on is monotonic
"""
if (
not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing)
or self._on.hasnans
):
self._raise_monotonic_error()
# GH 46061
if self._on.hasnans:
self._raise_monotonic_error("values must not have NaT")
for group_indices in self._grouper.indices.values():
group_on = self._on.take(group_indices)
if not (
group_on.is_monotonic_increasing or group_on.is_monotonic_decreasing
):
on = "index" if self.on is None else self.on
raise ValueError(
f"Each group within {on} must be monotonic. "
f"Sort the values in {on} first."
)
79 changes: 78 additions & 1 deletion pandas/tests/window/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,7 +651,7 @@ def test_groupby_rolling_nans_in_index(self, rollings, key):
)
if key == "index":
df = df.set_index("a")
with pytest.raises(ValueError, match=f"{key} must be monotonic"):
with pytest.raises(ValueError, match=f"{key} values must not have NaT"):
df.groupby("c").rolling("60min", **rollings)

@pytest.mark.parametrize("group_keys", [True, False])
Expand Down Expand Up @@ -895,6 +895,83 @@ def test_nan_and_zero_endpoints(self):
)
tm.assert_series_equal(result, expected)

def test_groupby_rolling_non_monotonic(self):
# GH 43909

shuffled = [3, 0, 1, 2]
sec = 1_000
df = DataFrame(
[{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("c").rolling(on="t", window="3s")

def test_groupby_monotonic(self):

# GH 15130
# we don't need to validate monotonicity when grouping

# GH 43909 we should raise an error here to match
# behaviour of non-groupby rolling.

data = [
["David", "1/1/2015", 100],
["David", "1/5/2015", 500],
["David", "5/30/2015", 50],
["David", "7/25/2015", 50],
["Ryan", "1/4/2014", 100],
["Ryan", "1/19/2015", 500],
["Ryan", "3/31/2016", 50],
["Joe", "7/1/2015", 100],
["Joe", "9/9/2015", 500],
["Joe", "10/15/2015", 50],
]

df = DataFrame(data=data, columns=["name", "date", "amount"])
df["date"] = to_datetime(df["date"])
df = df.sort_values("date")

expected = (
df.set_index("date")
.groupby("name")
.apply(lambda x: x.rolling("180D")["amount"].sum())
)
result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
tm.assert_series_equal(result, expected)

def test_datelike_on_monotonic_within_each_group(self):
# GH 13966 (similar to #15130, closed by #15175)

# superseded by 43909
# GH 46061: OK if the on is monotonic relative to each each group

dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
df = DataFrame(
{
"A": [1] * 20 + [2] * 12 + [3] * 8,
"B": np.concatenate((dates, dates)),
"C": np.arange(40),
}
)

expected = (
df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
)
result = df.groupby("A").rolling("4s", on="B").C.mean()
tm.assert_series_equal(result, expected)

def test_datelike_on_not_monotonic_within_each_group(self):
# GH 46061
df = DataFrame(
{
"A": [1] * 3 + [2] * 3,
"B": [Timestamp(year, 1, 1) for year in [2020, 2021, 2019]] * 2,
"C": range(6),
}
)
with pytest.raises(ValueError, match="Each group within B must be monotonic."):
df.groupby("A").rolling("365D", on="B")


class TestExpanding:
def setup_method(self):
Expand Down
12 changes: 0 additions & 12 deletions pandas/tests/window/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -1419,18 +1419,6 @@ def test_groupby_rolling_nan_included():
tm.assert_frame_equal(result, expected)


def test_groupby_rolling_non_monotonic():
# GH 43909

shuffled = [3, 0, 1, 2]
sec = 1_000
df = DataFrame(
[{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("c").rolling(on="t", window="3s")


@pytest.mark.parametrize("method", ["skew", "kurt"])
def test_rolling_skew_kurt_numerical_stability(method):
# GH#6929
Expand Down
72 changes: 11 additions & 61 deletions pandas/tests/window/test_timeseries_window.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
DataFrame,
Index,
MultiIndex,
NaT,
Series,
Timestamp,
date_range,
to_datetime,
)
import pandas._testing as tm

Expand Down Expand Up @@ -134,7 +134,7 @@ def test_non_monotonic_on(self):

assert not df.index.is_monotonic

msg = "index must be monotonic"
msg = "index values must be monotonic"
with pytest.raises(ValueError, match=msg):
df.rolling("2s").sum()

Expand Down Expand Up @@ -643,65 +643,6 @@ def agg_by_day(x):

tm.assert_frame_equal(result, expected)

def test_groupby_monotonic(self):

# GH 15130
# we don't need to validate monotonicity when grouping

# GH 43909 we should raise an error here to match
# behaviour of non-groupby rolling.

data = [
["David", "1/1/2015", 100],
["David", "1/5/2015", 500],
["David", "5/30/2015", 50],
["David", "7/25/2015", 50],
["Ryan", "1/4/2014", 100],
["Ryan", "1/19/2015", 500],
["Ryan", "3/31/2016", 50],
["Joe", "7/1/2015", 100],
["Joe", "9/9/2015", 500],
["Joe", "10/15/2015", 50],
]

df = DataFrame(data=data, columns=["name", "date", "amount"])
df["date"] = to_datetime(df["date"])
df = df.sort_values("date")

expected = (
df.set_index("date")
.groupby("name")
.apply(lambda x: x.rolling("180D")["amount"].sum())
)
result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
tm.assert_series_equal(result, expected)

def test_non_monotonic_raises(self):
# GH 13966 (similar to #15130, closed by #15175)

# superseded by 43909

dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
df = DataFrame(
{
"A": [1] * 20 + [2] * 12 + [3] * 8,
"B": np.concatenate((dates, dates)),
"C": np.arange(40),
}
)

expected = (
df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
)
with pytest.raises(ValueError, match=r".* must be monotonic"):
df.groupby("A").rolling(
"4s", on="B"
).C.mean() # should raise for non-monotonic t series

df2 = df.sort_values("B")
result = df2.groupby("A").rolling("4s", on="B").C.mean()
tm.assert_series_equal(result, expected)

def test_rolling_cov_offset(self):
# GH16058

Expand Down Expand Up @@ -757,3 +698,12 @@ def test_rolling_on_multi_index_level(self):
{"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index
)
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("msg, axis", [["column", 1], ["index", 0]])
def test_nat_axis_error(msg, axis):
idx = [Timestamp("2020"), NaT]
kwargs = {"columns" if axis == 1 else "index": idx}
df = DataFrame(np.eye(2), **kwargs)
with pytest.raises(ValueError, match=f"{msg} values must not have NaT"):
df.rolling("D", axis=axis).mean()