Skip to content

Commit 9c6d84c

Browse files
Backport PR #46567 on branch 1.4.x (BUG: groupby().rolling(freq) with monotonic dates within groups #46065 ) (#46592)
* ENH: Improve error message when rolling over NaT value (#46087) * BUG: groupby().rolling(freq) with monotonic dates within groups #46065 (#46567) (cherry picked from commit d2aa44f) Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 3ff438e commit 9c6d84c

File tree

5 files changed

+121
-96
lines changed

5 files changed

+121
-96
lines changed

doc/source/whatsnew/v1.4.2.rst

+2
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ Bug fixes
3232
- Fix some cases for subclasses that define their ``_constructor`` properties as general callables (:issue:`46018`)
3333
- Fixed "longtable" formatting in :meth:`.Styler.to_latex` when ``column_format`` is given in extended format (:issue:`46037`)
3434
- Fixed incorrect rendering in :meth:`.Styler.format` with ``hyperlinks="html"`` when the url contains a colon or other special characters (:issue:`46389`)
35+
- Improved error message in :class:`~pandas.core.window.Rolling` when ``window`` is a frequency and ``NaT`` is in the rolling axis (:issue:`46087`)
36+
- Fixed :meth:`Groupby.rolling` with a frequency window that would raise a ``ValueError`` even if the datetimes within each group were monotonic (:issue:`46061`)
3537

3638
.. ---------------------------------------------------------------------------
3739

pandas/core/window/rolling.py

+30-22
Original file line numberDiff line numberDiff line change
@@ -837,12 +837,6 @@ def _gotitem(self, key, ndim, subset=None):
837837
subset = self.obj.set_index(self._on)
838838
return super()._gotitem(key, ndim, subset=subset)
839839

840-
def _validate_monotonic(self):
841-
"""
842-
Validate that "on" is monotonic; already validated at a higher level.
843-
"""
844-
pass
845-
846840

847841
class Window(BaseWindow):
848842
"""
@@ -1687,7 +1681,7 @@ def _validate(self):
16871681
or isinstance(self._on, (DatetimeIndex, TimedeltaIndex, PeriodIndex))
16881682
) and isinstance(self.window, (str, BaseOffset, timedelta)):
16891683

1690-
self._validate_monotonic()
1684+
self._validate_datetimelike_monotonic()
16911685

16921686
# this will raise ValueError on non-fixed freqs
16931687
try:
@@ -1712,18 +1706,24 @@ def _validate(self):
17121706
elif not is_integer(self.window) or self.window < 0:
17131707
raise ValueError("window must be an integer 0 or greater")
17141708

1715-
def _validate_monotonic(self):
1709+
def _validate_datetimelike_monotonic(self):
17161710
"""
1717-
Validate monotonic (increasing or decreasing).
1711+
Validate self._on is monotonic (increasing or decreasing) and has
1712+
no NaT values for frequency windows.
17181713
"""
1714+
if self._on.hasnans:
1715+
self._raise_monotonic_error("values must not have NaT")
17191716
if not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing):
1720-
self._raise_monotonic_error()
1717+
self._raise_monotonic_error("values must be monotonic")
17211718

1722-
def _raise_monotonic_error(self):
1723-
formatted = self.on
1724-
if self.on is None:
1725-
formatted = "index"
1726-
raise ValueError(f"{formatted} must be monotonic")
1719+
def _raise_monotonic_error(self, msg: str):
1720+
on = self.on
1721+
if on is None:
1722+
if self.axis == 0:
1723+
on = "index"
1724+
else:
1725+
on = "column"
1726+
raise ValueError(f"{on} {msg}")
17271727

17281728
@doc(
17291729
_shared_docs["aggregate"],
@@ -2631,12 +2631,20 @@ def _get_window_indexer(self) -> GroupbyIndexer:
26312631
)
26322632
return window_indexer
26332633

2634-
def _validate_monotonic(self):
2634+
def _validate_datetimelike_monotonic(self):
26352635
"""
2636-
Validate that on is monotonic;
2636+
Validate that each group in self._on is monotonic
26372637
"""
2638-
if (
2639-
not (self._on.is_monotonic_increasing or self._on.is_monotonic_decreasing)
2640-
or self._on.hasnans
2641-
):
2642-
self._raise_monotonic_error()
2638+
# GH 46061
2639+
if self._on.hasnans:
2640+
self._raise_monotonic_error("values must not have NaT")
2641+
for group_indices in self._grouper.indices.values():
2642+
group_on = self._on.take(group_indices)
2643+
if not (
2644+
group_on.is_monotonic_increasing or group_on.is_monotonic_decreasing
2645+
):
2646+
on = "index" if self.on is None else self.on
2647+
raise ValueError(
2648+
f"Each group within {on} must be monotonic. "
2649+
f"Sort the values in {on} first."
2650+
)

pandas/tests/window/test_groupby.py

+78-1
Original file line numberDiff line numberDiff line change
@@ -651,7 +651,7 @@ def test_groupby_rolling_nans_in_index(self, rollings, key):
651651
)
652652
if key == "index":
653653
df = df.set_index("a")
654-
with pytest.raises(ValueError, match=f"{key} must be monotonic"):
654+
with pytest.raises(ValueError, match=f"{key} values must not have NaT"):
655655
df.groupby("c").rolling("60min", **rollings)
656656

657657
@pytest.mark.parametrize("group_keys", [True, False])
@@ -895,6 +895,83 @@ def test_nan_and_zero_endpoints(self):
895895
)
896896
tm.assert_series_equal(result, expected)
897897

898+
def test_groupby_rolling_non_monotonic(self):
899+
# GH 43909
900+
901+
shuffled = [3, 0, 1, 2]
902+
sec = 1_000
903+
df = DataFrame(
904+
[{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
905+
)
906+
with pytest.raises(ValueError, match=r".* must be monotonic"):
907+
df.groupby("c").rolling(on="t", window="3s")
908+
909+
def test_groupby_monotonic(self):
910+
911+
# GH 15130
912+
# we don't need to validate monotonicity when grouping
913+
914+
# GH 43909 we should raise an error here to match
915+
# behaviour of non-groupby rolling.
916+
917+
data = [
918+
["David", "1/1/2015", 100],
919+
["David", "1/5/2015", 500],
920+
["David", "5/30/2015", 50],
921+
["David", "7/25/2015", 50],
922+
["Ryan", "1/4/2014", 100],
923+
["Ryan", "1/19/2015", 500],
924+
["Ryan", "3/31/2016", 50],
925+
["Joe", "7/1/2015", 100],
926+
["Joe", "9/9/2015", 500],
927+
["Joe", "10/15/2015", 50],
928+
]
929+
930+
df = DataFrame(data=data, columns=["name", "date", "amount"])
931+
df["date"] = to_datetime(df["date"])
932+
df = df.sort_values("date")
933+
934+
expected = (
935+
df.set_index("date")
936+
.groupby("name")
937+
.apply(lambda x: x.rolling("180D")["amount"].sum())
938+
)
939+
result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
940+
tm.assert_series_equal(result, expected)
941+
942+
def test_datelike_on_monotonic_within_each_group(self):
943+
# GH 13966 (similar to #15130, closed by #15175)
944+
945+
# superseded by 43909
946+
# GH 46061: OK if the on is monotonic relative to each each group
947+
948+
dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
949+
df = DataFrame(
950+
{
951+
"A": [1] * 20 + [2] * 12 + [3] * 8,
952+
"B": np.concatenate((dates, dates)),
953+
"C": np.arange(40),
954+
}
955+
)
956+
957+
expected = (
958+
df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
959+
)
960+
result = df.groupby("A").rolling("4s", on="B").C.mean()
961+
tm.assert_series_equal(result, expected)
962+
963+
def test_datelike_on_not_monotonic_within_each_group(self):
964+
# GH 46061
965+
df = DataFrame(
966+
{
967+
"A": [1] * 3 + [2] * 3,
968+
"B": [Timestamp(year, 1, 1) for year in [2020, 2021, 2019]] * 2,
969+
"C": range(6),
970+
}
971+
)
972+
with pytest.raises(ValueError, match="Each group within B must be monotonic."):
973+
df.groupby("A").rolling("365D", on="B")
974+
898975

899976
class TestExpanding:
900977
def setup_method(self):

pandas/tests/window/test_rolling.py

-12
Original file line numberDiff line numberDiff line change
@@ -1419,18 +1419,6 @@ def test_groupby_rolling_nan_included():
14191419
tm.assert_frame_equal(result, expected)
14201420

14211421

1422-
def test_groupby_rolling_non_monotonic():
1423-
# GH 43909
1424-
1425-
shuffled = [3, 0, 1, 2]
1426-
sec = 1_000
1427-
df = DataFrame(
1428-
[{"t": Timestamp(2 * x * sec), "x": x + 1, "c": 42} for x in shuffled]
1429-
)
1430-
with pytest.raises(ValueError, match=r".* must be monotonic"):
1431-
df.groupby("c").rolling(on="t", window="3s")
1432-
1433-
14341422
@pytest.mark.parametrize("method", ["skew", "kurt"])
14351423
def test_rolling_skew_kurt_numerical_stability(method):
14361424
# GH#6929

pandas/tests/window/test_timeseries_window.py

+11-61
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
DataFrame,
66
Index,
77
MultiIndex,
8+
NaT,
89
Series,
910
Timestamp,
1011
date_range,
11-
to_datetime,
1212
)
1313
import pandas._testing as tm
1414

@@ -134,7 +134,7 @@ def test_non_monotonic_on(self):
134134

135135
assert not df.index.is_monotonic
136136

137-
msg = "index must be monotonic"
137+
msg = "index values must be monotonic"
138138
with pytest.raises(ValueError, match=msg):
139139
df.rolling("2s").sum()
140140

@@ -643,65 +643,6 @@ def agg_by_day(x):
643643

644644
tm.assert_frame_equal(result, expected)
645645

646-
def test_groupby_monotonic(self):
647-
648-
# GH 15130
649-
# we don't need to validate monotonicity when grouping
650-
651-
# GH 43909 we should raise an error here to match
652-
# behaviour of non-groupby rolling.
653-
654-
data = [
655-
["David", "1/1/2015", 100],
656-
["David", "1/5/2015", 500],
657-
["David", "5/30/2015", 50],
658-
["David", "7/25/2015", 50],
659-
["Ryan", "1/4/2014", 100],
660-
["Ryan", "1/19/2015", 500],
661-
["Ryan", "3/31/2016", 50],
662-
["Joe", "7/1/2015", 100],
663-
["Joe", "9/9/2015", 500],
664-
["Joe", "10/15/2015", 50],
665-
]
666-
667-
df = DataFrame(data=data, columns=["name", "date", "amount"])
668-
df["date"] = to_datetime(df["date"])
669-
df = df.sort_values("date")
670-
671-
expected = (
672-
df.set_index("date")
673-
.groupby("name")
674-
.apply(lambda x: x.rolling("180D")["amount"].sum())
675-
)
676-
result = df.groupby("name").rolling("180D", on="date")["amount"].sum()
677-
tm.assert_series_equal(result, expected)
678-
679-
def test_non_monotonic_raises(self):
680-
# GH 13966 (similar to #15130, closed by #15175)
681-
682-
# superseded by 43909
683-
684-
dates = date_range(start="2016-01-01 09:30:00", periods=20, freq="s")
685-
df = DataFrame(
686-
{
687-
"A": [1] * 20 + [2] * 12 + [3] * 8,
688-
"B": np.concatenate((dates, dates)),
689-
"C": np.arange(40),
690-
}
691-
)
692-
693-
expected = (
694-
df.set_index("B").groupby("A").apply(lambda x: x.rolling("4s")["C"].mean())
695-
)
696-
with pytest.raises(ValueError, match=r".* must be monotonic"):
697-
df.groupby("A").rolling(
698-
"4s", on="B"
699-
).C.mean() # should raise for non-monotonic t series
700-
701-
df2 = df.sort_values("B")
702-
result = df2.groupby("A").rolling("4s", on="B").C.mean()
703-
tm.assert_series_equal(result, expected)
704-
705646
def test_rolling_cov_offset(self):
706647
# GH16058
707648

@@ -757,3 +698,12 @@ def test_rolling_on_multi_index_level(self):
757698
{"column": [0.0, 1.0, 3.0, 6.0, 10.0, 15.0]}, index=df.index
758699
)
759700
tm.assert_frame_equal(result, expected)
701+
702+
703+
@pytest.mark.parametrize("msg, axis", [["column", 1], ["index", 0]])
704+
def test_nat_axis_error(msg, axis):
705+
idx = [Timestamp("2020"), NaT]
706+
kwargs = {"columns" if axis == 1 else "index": idx}
707+
df = DataFrame(np.eye(2), **kwargs)
708+
with pytest.raises(ValueError, match=f"{msg} values must not have NaT"):
709+
df.rolling("D", axis=axis).mean()

0 commit comments

Comments
 (0)