diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index 17abb3debe3e7..5fc4c5e0b04aa 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -168,6 +168,7 @@ Performance improvements Bug fixes ~~~~~~~~~ - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) +- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) Categorical ^^^^^^^^^^^ diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 02934346130a5..7b306c5e681e0 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -138,6 +138,8 @@ def calculate_variable_window_bounds( break # end bound is previous end # or current index + elif index[end[i - 1]] == end_bound and not right_closed: + end[i] = end[i - 1] + 1 elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0: end[i] = i + 1 else: diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 694a420ad2494..c13ec51ff3851 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -262,7 +262,9 @@ def get_window_bounds( # end bound is previous end # or current index end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign - if end_diff <= zero: + if end_diff == zero and not right_closed: + end[i] = end[i - 1] + 1 + elif end_diff <= zero: end[i] = i + 1 else: end[i] = end[i - 1] diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index ab00e18fc4812..46ab00c3e2284 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -466,20 +466,23 @@ def test_groupby_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) result = ( df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", @@ -490,10 +493,14 @@ def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) @@ -503,10 +510,9 @@ def test_groupby_subset_rolling_subset_with_closed(self): .sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index f4d903dc19fb7..a02f132e540ac 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -304,6 +304,76 @@ def test_datetimelike_nonunique_index_centering( tm.assert_equal(result, expected) +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]), + ("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]), + ], +) +def test_variable_window_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-04", + "2011-01-04", + "2011-01-05", + "2011-01-06", + ] + ) + + df = frame_or_series(range(10), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + result = df.rolling("2D", closed=closed).sum() + + tm.assert_equal(result, expected) + + +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]), + ("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]), + ], +) +def test_variable_offset_window_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-04", + "2011-01-04", + "2011-01-05", + "2011-01-06", + ] + ) + + df = frame_or_series(range(10), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + offset = BusinessDay(2) + indexer = VariableOffsetWindowIndexer(index=index, offset=offset) + result = df.rolling(indexer, closed=closed, min_periods=1).sum() + + tm.assert_equal(result, expected) + + def test_even_number_window_alignment(): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3))