Skip to content

Commit e7a1f9d

Browse files
authored
BUG: Fix Rolling where duplicate datetimelike indexes are treated as consecutive rather than equal with closed='left' and closed='neither' (#54917)
* Add bugfix for rolling window with nonunique datetimelike index
* Run black
* Add entry to whatsnew
* Fix VariableOffsetWindowIndexer
* Simplify change in indexers.pyx
* Add test
1 parent e1ec244 commit e7a1f9d

File tree

5 files changed

+98
-17
lines changed

5 files changed

+98
-17
lines changed

doc/source/whatsnew/v2.2.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -168,6 +168,7 @@ Performance improvements
168168
Bug fixes
169169
~~~~~~~~~
170170
- Bug in :class:`AbstractHolidayCalendar` where timezone data was not propagated when computing holiday observances (:issue:`54580`)
171+
- Bug in :class:`pandas.core.window.Rolling` where duplicate datetimelike indexes are treated as consecutive rather than equal with ``closed='left'`` and ``closed='neither'`` (:issue:`20712`)
171172

172173
Categorical
173174
^^^^^^^^^^^

pandas/_libs/window/indexers.pyx

+2
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,8 @@ def calculate_variable_window_bounds(
138138
break
139139
# end bound is previous end
140140
# or current index
141+
elif index[end[i - 1]] == end_bound and not right_closed:
142+
end[i] = end[i - 1] + 1
141143
elif (index[end[i - 1]] - end_bound) * index_growth_sign <= 0:
142144
end[i] = i + 1
143145
else:

pandas/core/indexers/objects.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,9 @@ def get_window_bounds(
262262
# end bound is previous end
263263
# or current index
264264
end_diff = (self.index[end[i - 1]] - end_bound) * index_growth_sign
265-
if end_diff <= zero:
265+
if end_diff == zero and not right_closed:
266+
end[i] = end[i - 1] + 1
267+
elif end_diff <= zero:
266268
end[i] = i + 1
267269
else:
268270
end[i] = end[i - 1]

pandas/tests/window/test_groupby.py

+22-16
Original file line numberDiff line numberDiff line change
@@ -466,20 +466,23 @@ def test_groupby_rolling_subset_with_closed(self):
466466
# GH 35549
467467
df = DataFrame(
468468
{
469-
"column1": range(6),
470-
"column2": range(6),
471-
"group": 3 * ["A", "B"],
472-
"date": [Timestamp("2019-01-01")] * 6,
469+
"column1": range(8),
470+
"column2": range(8),
471+
"group": ["A"] * 4 + ["B"] * 4,
472+
"date": [
473+
Timestamp(date)
474+
for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
475+
]
476+
* 2,
473477
}
474478
)
475479
result = (
476480
df.groupby("group").rolling("1D", on="date", closed="left")["column1"].sum()
477481
)
478482
expected = Series(
479-
[np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
480-
index=MultiIndex.from_tuples(
481-
[("A", Timestamp("2019-01-01"))] * 3
482-
+ [("B", Timestamp("2019-01-01"))] * 3,
483+
[np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
484+
index=MultiIndex.from_frame(
485+
df[["group", "date"]],
483486
names=["group", "date"],
484487
),
485488
name="column1",
@@ -490,10 +493,14 @@ def test_groupby_subset_rolling_subset_with_closed(self):
490493
# GH 35549
491494
df = DataFrame(
492495
{
493-
"column1": range(6),
494-
"column2": range(6),
495-
"group": 3 * ["A", "B"],
496-
"date": [Timestamp("2019-01-01")] * 6,
496+
"column1": range(8),
497+
"column2": range(8),
498+
"group": ["A"] * 4 + ["B"] * 4,
499+
"date": [
500+
Timestamp(date)
501+
for date in ["2019-01-01", "2019-01-01", "2019-01-02", "2019-01-02"]
502+
]
503+
* 2,
497504
}
498505
)
499506

@@ -503,10 +510,9 @@ def test_groupby_subset_rolling_subset_with_closed(self):
503510
.sum()
504511
)
505512
expected = Series(
506-
[np.nan, 0.0, 2.0, np.nan, 1.0, 4.0],
507-
index=MultiIndex.from_tuples(
508-
[("A", Timestamp("2019-01-01"))] * 3
509-
+ [("B", Timestamp("2019-01-01"))] * 3,
513+
[np.nan, np.nan, 1.0, 1.0, np.nan, np.nan, 9.0, 9.0],
514+
index=MultiIndex.from_frame(
515+
df[["group", "date"]],
510516
names=["group", "date"],
511517
),
512518
name="column1",

pandas/tests/window/test_rolling.py

+70
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,76 @@ def test_datetimelike_nonunique_index_centering(
304304
tm.assert_equal(result, expected)
305305

306306

307+
@pytest.mark.parametrize(
308+
"closed,expected",
309+
[
310+
("left", [np.nan, np.nan, 1, 1, 1, 10, 14, 14, 18, 21]),
311+
("neither", [np.nan, np.nan, 1, 1, 1, 9, 5, 5, 13, 8]),
312+
("right", [0, 1, 3, 6, 10, 14, 11, 18, 21, 17]),
313+
("both", [0, 1, 3, 6, 10, 15, 20, 27, 26, 30]),
314+
],
315+
)
316+
def test_variable_window_nonunique(closed, expected, frame_or_series):
317+
# GH 20712
318+
index = DatetimeIndex(
319+
[
320+
"2011-01-01",
321+
"2011-01-01",
322+
"2011-01-02",
323+
"2011-01-02",
324+
"2011-01-02",
325+
"2011-01-03",
326+
"2011-01-04",
327+
"2011-01-04",
328+
"2011-01-05",
329+
"2011-01-06",
330+
]
331+
)
332+
333+
df = frame_or_series(range(10), index=index, dtype=float)
334+
expected = frame_or_series(expected, index=index, dtype=float)
335+
336+
result = df.rolling("2D", closed=closed).sum()
337+
338+
tm.assert_equal(result, expected)
339+
340+
341+
@pytest.mark.parametrize(
342+
"closed,expected",
343+
[
344+
("left", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 18, 21]),
345+
("neither", [np.nan, np.nan, 1, 1, 1, 10, 15, 15, 13, 8]),
346+
("right", [0, 1, 3, 6, 10, 15, 21, 28, 21, 17]),
347+
("both", [0, 1, 3, 6, 10, 15, 21, 28, 26, 30]),
348+
],
349+
)
350+
def test_variable_offset_window_nonunique(closed, expected, frame_or_series):
351+
# GH 20712
352+
index = DatetimeIndex(
353+
[
354+
"2011-01-01",
355+
"2011-01-01",
356+
"2011-01-02",
357+
"2011-01-02",
358+
"2011-01-02",
359+
"2011-01-03",
360+
"2011-01-04",
361+
"2011-01-04",
362+
"2011-01-05",
363+
"2011-01-06",
364+
]
365+
)
366+
367+
df = frame_or_series(range(10), index=index, dtype=float)
368+
expected = frame_or_series(expected, index=index, dtype=float)
369+
370+
offset = BusinessDay(2)
371+
indexer = VariableOffsetWindowIndexer(index=index, offset=offset)
372+
result = df.rolling(indexer, closed=closed, min_periods=1).sum()
373+
374+
tm.assert_equal(result, expected)
375+
376+
307377
def test_even_number_window_alignment():
308378
# see discussion in GH 38780
309379
s = Series(range(3), index=date_range(start="2020-01-01", freq="D", periods=3))

0 commit comments

Comments
 (0)