From 8b751a7755b2d1e728c550ee0eb03dc66da28a50 Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Fri, 1 Sep 2023 00:34:11 +0100 Subject: [PATCH 1/6] Add bugfix for rolling window with nonunique datetimelike index --- pandas/_libs/window/indexers.pyx | 7 ++++++- pandas/tests/window/test_groupby.py | 30 ++++++++++++++--------------- pandas/tests/window/test_rolling.py | 24 +++++++++++++++++++++++ 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index 02934346130a5..f5ac1f9a81fd9 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -138,7 +138,12 @@ def calculate_variable_window_bounds( break # end bound is previous end # or current index - elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0: + elif index[end[i - 1]] == end_bound: + if right_closed: + end[i] = i + 1 + else: + end[i] = end[i - 1] + 1 + elif (index[end[i - 1]] - end_bound) * index_growth_sign < 0: end[i] = i + 1 else: end[i] = end[i - 1] diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index ab00e18fc4812..9277231f465fb 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -466,20 +466,19 @@ def test_groupby_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [Timestamp(date) for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]] * 2 } ) result = ( df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", @@ -490,10 +489,10 @@ def test_groupby_subset_rolling_subset_with_closed(self): # GH 35549 df = DataFrame( { - "column1": range(6), - "column2": range(6), - "group": 3 * ["A", "B"], - "date": [Timestamp("2019-01-01")] * 6, + "column1": range(8), + "column2": range(8), + "group": ["A"] * 4 + ["B"] * 4, + "date": [Timestamp(date) for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]] * 2 } ) @@ -503,10 +502,9 @@ def test_groupby_subset_rolling_subset_with_closed(self): .sum() ) expected = Series( - [np.nan, 0.0, 2.0, np.nan, 1.0, 4.0], - index=MultiIndex.from_tuples( - [("A", Timestamp("2019-01-01"))] * 3 - + [("B", Timestamp("2019-01-01"))] * 3, + [np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0], + index=MultiIndex.from_frame( + df[["group", "date"]], names=["group", "date"], ), name="column1", diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index f4d903dc19fb7..34d324d6fef27 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -304,6 +304,30 @@ def test_datetimelike_nonunique_index_centering( tm.assert_equal(result, expected) +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14]), + ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5]), + ("right", [0, 1, 3, 6, 10, 14, 11, 18]), + ("both", [0, 1, 3, 6, 10, 15, 20, 27]), + ], +) +def test_datetimelike_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex([ + "2011-01-01", "2011-01-01", "2011-01-02", "2011-01-02", + "2011-01-02", "2011-01-03", "2011-01-04", "2011-01-04", + ]) + + df = frame_or_series(range(8), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + result = df.rolling('2D', closed=closed).sum() + + tm.assert_equal(result, expected) + + def test_even_number_window_alignment(): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3)) From 8d9ce73de92fc0130fd3d0ea076595ef302a7871 Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Fri, 1 Sep 2023 00:51:40 +0100 Subject: [PATCH 2/6] Run black --- pandas/tests/window/test_groupby.py | 12 ++++++++++-- pandas/tests/window/test_rolling.py | 18 +++++++++++++----- 2 files changed, 23 insertions(+), 7 deletions(-) diff --git a/pandas/tests/window/test_groupby.py b/pandas/tests/window/test_groupby.py index 9277231f465fb..46ab00c3e2284 100644 --- a/pandas/tests/window/test_groupby.py +++ b/pandas/tests/window/test_groupby.py @@ -469,7 +469,11 @@ def test_groupby_rolling_subset_with_closed(self): "column1": range(8), "column2": range(8), "group": ["A"] * 4 + ["B"] * 4, - "date": [Timestamp(date) for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]] * 2 + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) result = ( @@ -492,7 +496,11 @@ def test_groupby_subset_rolling_subset_with_closed(self): "column1": range(8), "column2": range(8), "group": ["A"] * 4 + ["B"] * 4, - "date": [Timestamp(date) for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]] * 2 + "date": [ + Timestamp(date) + for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"] + ] + * 2, } ) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 34d324d6fef27..59f670848f13a 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -315,15 +315,23 @@ def test_datetimelike_nonunique_index_centering( ) def test_datetimelike_nonunique(closed, expected, frame_or_series): # GH 20712 - index = DatetimeIndex([ - "2011-01-01", "2011-01-01", "2011-01-02", "2011-01-02", - "2011-01-02", "2011-01-03", "2011-01-04", "2011-01-04", - ]) + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-04", + "2011-01-04", + ] + ) df = frame_or_series(range(8), index=index, dtype=float) expected = frame_or_series(expected, index=index, dtype=float) - result = df.rolling('2D', closed=closed).sum() + result = df.rolling("2D", closed=closed).sum() tm.assert_equal(result, expected) From dc61063b9f0b0498f97b7a4faa25e0c690f8be1e Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Fri, 1 Sep 2023 00:54:47 +0100 Subject: [PATCH 3/6] Add entry to whatsnew --- doc/source/whatsnew/v2.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.2.0.rst b/doc/source/whatsnew/v2.2.0.rst index e49436c4ed549..2c5b9d2531555 100644 --- a/doc/source/whatsnew/v2.2.0.rst +++ b/doc/source/whatsnew/v2.2.0.rst @@ -166,6 +166,7 @@ Performance improvements Bug fixes ~~~~~~~~~ - Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`) +- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`) Categorical ^^^^^^^^^^^ From 4f35883dafe1b1ac36f6ab7b22532aa84f4d5f52 Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Sat, 2 Sep 2023 03:13:09 +0100 Subject: [PATCH 4/6] Fix VariableOffsetWindowIndexer --- pandas/core/indexers/objects.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/indexers/objects.py b/pandas/core/indexers/objects.py index 694a420ad2494..c13ec51ff3851 100644 --- a/pandas/core/indexers/objects.py +++ b/pandas/core/indexers/objects.py @@ -262,7 +262,9 @@ def get_window_bounds( # end bound is previous end # or current index end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign - if end_diff <= zero: + if end_diff == zero and not right_closed: + end[i] = end[i - 1] + 1 + elif end_diff <= zero: end[i] = i + 1 else: end[i] = end[i - 1] From 8cb84993dc4d719a4e720727eb03b4bffff72027 Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Sat, 2 Sep 2023 03:14:24 +0100 Subject: [PATCH 5/6] Simplify change in indexers.pyx --- pandas/_libs/window/indexers.pyx | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pandas/_libs/window/indexers.pyx b/pandas/_libs/window/indexers.pyx index f5ac1f9a81fd9..7b306c5e681e0 100644 --- a/pandas/_libs/window/indexers.pyx +++ b/pandas/_libs/window/indexers.pyx @@ -138,12 +138,9 @@ def calculate_variable_window_bounds( break # end bound is previous end # or current index - elif index[end[i - 1]] == end_bound: - if right_closed: - end[i] = i + 1 - else: - end[i] = end[i - 1] + 1 - elif (index[end[i - 1]] - end_bound) * index_growth_sign < 0: + elif index[end[i - 1]] == end_bound and not right_closed: + end[i] = end[i - 1] + 1 + elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0: end[i] = i + 1 else: end[i] = end[i - 1] From 8b6684b177a39c8deca19563308a20ea8752a78d Mon Sep 17 00:00:00 2001 From: Abdullah Ihsan Secer Date: Sat, 2 Sep 2023 03:15:24 +0100 Subject: [PATCH 6/6] Add test --- pandas/tests/window/test_rolling.py | 50 +++++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/pandas/tests/window/test_rolling.py b/pandas/tests/window/test_rolling.py index 59f670848f13a..a02f132e540ac 100644 --- a/pandas/tests/window/test_rolling.py +++ b/pandas/tests/window/test_rolling.py @@ -307,13 +307,13 @@ def test_datetimelike_nonunique_index_centering( @pytest.mark.parametrize( "closed,expected", [ - ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14]), - ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5]), - ("right", [0, 1, 3, 6, 10, 14, 11, 18]), - ("both", [0, 1, 3, 6, 10, 15, 20, 27]), + ("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]), + ("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]), ], ) -def test_datetimelike_nonunique(closed, expected, frame_or_series): +def test_variable_window_nonunique(closed, expected, frame_or_series): # GH 20712 index = DatetimeIndex( [ @@ -325,10 +325,12 @@ def test_datetimelike_nonunique(closed, expected, frame_or_series): "2011-01-03", "2011-01-04", "2011-01-04", + "2011-01-05", + "2011-01-06", ] ) - df = frame_or_series(range(8), index=index, dtype=float) + df = frame_or_series(range(10), index=index, dtype=float) expected = frame_or_series(expected, index=index, dtype=float) result = df.rolling("2D", closed=closed).sum() @@ -336,6 +338,42 @@ def test_datetimelike_nonunique(closed, expected, frame_or_series): tm.assert_equal(result, expected) +@pytest.mark.parametrize( + "closed,expected", + [ + ("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]), + ("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]), + ("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]), + ("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]), + ], +) +def test_variable_offset_window_nonunique(closed, expected, frame_or_series): + # GH 20712 + index = DatetimeIndex( + [ + "2011-01-01", + "2011-01-01", + "2011-01-02", + "2011-01-02", + "2011-01-02", + "2011-01-03", + "2011-01-04", + "2011-01-04", + "2011-01-05", + "2011-01-06", + ] + ) + + df = frame_or_series(range(10), index=index, dtype=float) + expected = frame_or_series(expected, index=index, dtype=float) + + offset = BusinessDay(2) + indexer = VariableOffsetWindowIndexer(index=index, offset=offset) + result = df.rolling(indexer, closed=closed, min_periods=1).sum() + + tm.assert_equal(result, expected) + + def test_even_number_window_alignment(): # see discussion in GH 38780 s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3))