Skip to content

Commit 4b9a480

Browse files
committed
BUG: Fixes to FixedForwardWindowIndexer and GroupbyIndexer (#43267)
1 parent 02fdbde commit 4b9a480

File tree

7 files changed

+202
-27
lines changed

7 files changed

+202
-27
lines changed

Diff for: doc/source/whatsnew/v1.3.4.rst

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Fixed regressions
2323

2424
Bug fixes
2525
~~~~~~~~~
26+
- Bug in :meth:`pandas.DataFrame.groupby.rolling` and :class:`pandas.api.indexers.FixedForwardWindowIndexer` leading to segfaults and window endpoints being mixed across groups (:issue:`43267`)
2627
-
2728
-
2829

Diff for: pandas/core/indexers/objects.py

+21-19
Original file line numberDiff line numberDiff line change
@@ -266,9 +266,8 @@ def get_window_bounds(
266266
)
267267

268268
start = np.arange(num_values, dtype="int64")
269-
end_s = start[: -self.window_size] + self.window_size
270-
end_e = np.full(self.window_size, num_values, dtype="int64")
271-
end = np.concatenate([end_s, end_e])
269+
end = start + self.window_size
270+
end[end > num_values] = num_values
272271

273272
return start, end
274273

@@ -280,7 +279,7 @@ def __init__(
280279
self,
281280
index_array: np.ndarray | None = None,
282281
window_size: int = 0,
283-
groupby_indicies: dict | None = None,
282+
groupby_indices: dict | None = None,
284283
window_indexer: type[BaseIndexer] = BaseIndexer,
285284
indexer_kwargs: dict | None = None,
286285
**kwargs,
@@ -294,7 +293,7 @@ def __init__(
294293
the groups
295294
window_size : int
296295
window size during the windowing operation
297-
groupby_indicies : dict or None
296+
groupby_indices : dict or None
298297
dict of {group label: [positional index of rows belonging to the group]}
299298
window_indexer : BaseIndexer
300299
BaseIndexer class determining the start and end bounds of each group
@@ -303,11 +302,13 @@ def __init__(
303302
**kwargs :
304303
keyword arguments that will be available when get_window_bounds is called
305304
"""
306-
self.groupby_indicies = groupby_indicies or {}
305+
self.groupby_indices = groupby_indices or {}
307306
self.window_indexer = window_indexer
308-
self.indexer_kwargs = indexer_kwargs or {}
307+
self.indexer_kwargs = indexer_kwargs.copy() if indexer_kwargs else {}
309308
super().__init__(
310-
index_array, self.indexer_kwargs.pop("window_size", window_size), **kwargs
309+
index_array=index_array,
310+
window_size=self.indexer_kwargs.pop("window_size", window_size),
311+
**kwargs,
311312
)
312313

313314
@Appender(get_window_bounds_doc)
@@ -323,8 +324,8 @@ def get_window_bounds(
323324
# 3) Append the window bounds in group order
324325
start_arrays = []
325326
end_arrays = []
326-
window_indicies_start = 0
327-
for key, indices in self.groupby_indicies.items():
327+
window_indices_start = 0
328+
for key, indices in self.groupby_indices.items():
328329
index_array: np.ndarray | None
329330

330331
if self.index_array is not None:
@@ -341,18 +342,19 @@ def get_window_bounds(
341342
)
342343
start = start.astype(np.int64)
343344
end = end.astype(np.int64)
344-
# Cannot use groupby_indicies as they might not be monotonic with the object
345+
assert len(start) == len(end), 'these should be equal in length from get_window_bounds'
346+
# Cannot use groupby_indices as they might not be monotonic with the object
345347
# we're rolling over
346-
window_indicies = np.arange(
347-
window_indicies_start, window_indicies_start + len(indices)
348+
window_indices = np.arange(
349+
window_indices_start, window_indices_start + len(indices)
348350
)
349-
window_indicies_start += len(indices)
351+
window_indices_start += len(indices)
350352
# Extend as we'll be slicing window like [start, end)
351-
window_indicies = np.append(
352-
window_indicies, [window_indicies[-1] + 1]
353-
).astype(np.int64)
354-
start_arrays.append(window_indicies.take(ensure_platform_int(start)))
355-
end_arrays.append(window_indicies.take(ensure_platform_int(end)))
353+
window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype(
354+
np.int64, copy=False
355+
)
356+
start_arrays.append(window_indices.take(ensure_platform_int(start)))
357+
end_arrays.append(window_indices.take(ensure_platform_int(end)))
356358
start = np.concatenate(start_arrays)
357359
end = np.concatenate(end_arrays)
358360
return start, end

Diff for: pandas/core/window/ewm.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -735,7 +735,7 @@ def _get_window_indexer(self) -> GroupbyIndexer:
735735
GroupbyIndexer
736736
"""
737737
window_indexer = GroupbyIndexer(
738-
groupby_indicies=self._grouper.indices,
738+
groupby_indices=self._grouper.indices,
739739
window_indexer=ExponentialMovingWindowIndexer,
740740
)
741741
return window_indexer

Diff for: pandas/core/window/expanding.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,7 @@ def _get_window_indexer(self) -> GroupbyIndexer:
684684
GroupbyIndexer
685685
"""
686686
window_indexer = GroupbyIndexer(
687-
groupby_indicies=self._grouper.indices,
687+
groupby_indices=self._grouper.indices,
688688
window_indexer=ExpandingIndexer,
689689
)
690690
return window_indexer

Diff for: pandas/core/window/rolling.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -300,8 +300,8 @@ def __iter__(self):
300300
center=self.center,
301301
closed=self.closed,
302302
)
303-
# From get_window_bounds, those two should be equal in length of array
304-
assert len(start) == len(end)
303+
304+
assert len(start) == len(end), 'these should be equal in length from get_window_bounds'
305305

306306
for s, e in zip(start, end):
307307
result = obj.iloc[slice(s, e)]
@@ -533,6 +533,7 @@ def _apply(
533533
y : type of input
534534
"""
535535
window_indexer = self._get_window_indexer()
536+
536537
min_periods = (
537538
self.min_periods
538539
if self.min_periods is not None
@@ -546,12 +547,15 @@ def homogeneous_func(values: np.ndarray):
546547
return values.copy()
547548

548549
def calc(x):
550+
549551
start, end = window_indexer.get_window_bounds(
550552
num_values=len(x),
551553
min_periods=min_periods,
552554
center=self.center,
553555
closed=self.closed,
554556
)
557+
assert len(start) == len(end), 'these should be equal in length from get_window_bounds'
558+
555559
return func(x, start, end, min_periods, *numba_args)
556560

557561
with np.errstate(all="ignore"):
@@ -1434,6 +1438,9 @@ def cov_func(x, y):
14341438
center=self.center,
14351439
closed=self.closed,
14361440
)
1441+
1442+
assert len(start) == len(end), 'these should be equal in length from get_window_bounds'
1443+
14371444
with np.errstate(all="ignore"):
14381445
mean_x_y = window_aggregations.roll_mean(
14391446
x_array * y_array, start, end, min_periods
@@ -1473,6 +1480,9 @@ def corr_func(x, y):
14731480
center=self.center,
14741481
closed=self.closed,
14751482
)
1483+
1484+
assert len(start) == len(end), 'these should be equal in length from get_window_bounds'
1485+
14761486
with np.errstate(all="ignore"):
14771487
mean_x_y = window_aggregations.roll_mean(
14781488
x_array * y_array, start, end, min_periods
@@ -2346,26 +2356,29 @@ def _get_window_indexer(self) -> GroupbyIndexer:
23462356
rolling_indexer: type[BaseIndexer]
23472357
indexer_kwargs: dict[str, Any] | None = None
23482358
index_array = self._index_array
2359+
23492360
if isinstance(self.window, BaseIndexer):
23502361
rolling_indexer = type(self.window)
2351-
indexer_kwargs = self.window.__dict__
2362+
indexer_kwargs = self.window.__dict__.copy()
23522363
assert isinstance(indexer_kwargs, dict) # for mypy
23532364
# We'll be using the index of each group later
23542365
indexer_kwargs.pop("index_array", None)
2355-
window = 0
2366+
window = self.window
23562367
elif self._win_freq_i8 is not None:
23572368
rolling_indexer = VariableWindowIndexer
23582369
window = self._win_freq_i8
23592370
else:
23602371
rolling_indexer = FixedWindowIndexer
23612372
window = self.window
2373+
23622374
window_indexer = GroupbyIndexer(
23632375
index_array=index_array,
23642376
window_size=window,
2365-
groupby_indicies=self._grouper.indices,
2377+
groupby_indices=self._grouper.indices,
23662378
window_indexer=rolling_indexer,
23672379
indexer_kwargs=indexer_kwargs,
23682380
)
2381+
23692382
return window_indexer
23702383

23712384
def _validate_monotonic(self):

Diff for: pandas/tests/groupby/test_missing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ def test_min_count(func, min_count, value):
146146
tm.assert_frame_equal(result, expected)
147147

148148

149-
def test_indicies_with_missing():
149+
def test_indices_with_missing():
150150
# GH 9304
151151
df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
152152
g = df.groupby(["a", "b"])

Diff for: pandas/tests/window/test_base_indexer.py

+159
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33

44
from pandas import (
55
DataFrame,
6+
MultiIndex,
67
Series,
8+
concat,
79
date_range,
810
)
911
import pandas._testing as tm
@@ -13,6 +15,7 @@
1315
)
1416
from pandas.core.indexers.objects import (
1517
ExpandingIndexer,
18+
FixedWindowIndexer,
1619
VariableOffsetWindowIndexer,
1720
)
1821

@@ -293,3 +296,159 @@ def get_window_bounds(self, num_values, min_periods, center, closed):
293296
result = getattr(df.rolling(indexer), func)(*args)
294297
expected = DataFrame({"values": values})
295298
tm.assert_frame_equal(result, expected)
299+
300+
301+
@pytest.mark.parametrize(
302+
"indexer_class", [FixedWindowIndexer, FixedForwardWindowIndexer, ExpandingIndexer]
303+
)
304+
@pytest.mark.parametrize("window_size", [1, 2, 12])
305+
@pytest.mark.parametrize(
306+
"df_data",
307+
[
308+
{"a": [1, 1], "b": [0, 1]},
309+
{"a": [1, 2], "b": [0, 1]},
310+
{"a": [1] * 16, "b": [np.nan, 1, 2, np.nan] + list(range(4, 16))},
311+
],
312+
)
313+
def test_indexers_are_reusable_after_groupby_rolling(
314+
indexer_class, window_size, df_data
315+
):
316+
# GH 43267
317+
df = DataFrame(df_data)
318+
num_trials = 3
319+
indexer = indexer_class(window_size=window_size)
320+
original_window_size = indexer.window_size
321+
for i in range(num_trials):
322+
df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
323+
assert indexer.window_size == original_window_size
324+
325+
326+
@pytest.mark.parametrize(
327+
"window_size, num_values, expected_start, expected_end",
328+
[
329+
(1, 1, [0], [1]),
330+
(1, 2, [0, 1], [1, 2]),
331+
(2, 1, [0], [1]),
332+
(2, 2, [0, 1], [2, 2]),
333+
(5, 12, range(12), list(range(5, 12)) + [12] * 5),
334+
(12, 5, range(5), [5] * 5),
335+
(0, 0, np.array([]), np.array([])),
336+
(1, 0, np.array([]), np.array([])),
337+
(0, 1, [0], [0]),
338+
],
339+
)
340+
def test_fixed_forward_indexer_bounds(
341+
window_size, num_values, expected_start, expected_end
342+
):
343+
# GH 43267
344+
indexer = FixedForwardWindowIndexer(window_size=window_size)
345+
start, end = indexer.get_window_bounds(num_values=num_values)
346+
347+
tm.assert_numpy_array_equal(start, np.array(expected_start), check_dtype=False)
348+
tm.assert_numpy_array_equal(end, np.array(expected_end), check_dtype=False)
349+
assert len(start) == len(end)
350+
351+
352+
@pytest.mark.parametrize(
353+
"df, window_size, expected",
354+
[
355+
(
356+
DataFrame({"b": [0, 1, 2], "a": [1, 2, 2]}),
357+
2,
358+
Series(
359+
[0, 1.5, 2.0],
360+
index=MultiIndex.from_arrays([[1, 2, 2], range(3)], names=["a", None]),
361+
name="b",
362+
dtype=np.float64,
363+
),
364+
),
365+
(
366+
DataFrame(
367+
{
368+
"b": [np.nan, 1, 2, np.nan] + list(range(4, 18)),
369+
"a": [1] * 7 + [2] * 11,
370+
"c": range(18),
371+
}
372+
),
373+
12,
374+
Series(
375+
[
376+
3.6,
377+
3.6,
378+
4.25,
379+
5.0,
380+
5.0,
381+
5.5,
382+
6.0,
383+
12.0,
384+
12.5,
385+
13.0,
386+
13.5,
387+
14.0,
388+
14.5,
389+
15.0,
390+
15.5,
391+
16.0,
392+
16.5,
393+
17.0,
394+
],
395+
index=MultiIndex.from_arrays(
396+
[[1] * 7 + [2] * 11, range(18)], names=["a", None]
397+
),
398+
name="b",
399+
dtype=np.float64,
400+
),
401+
),
402+
],
403+
)
404+
def test_rolling_groupby_with_fixed_forward_specific(df, window_size, expected):
405+
# GH 43267
406+
indexer = FixedForwardWindowIndexer(window_size=window_size)
407+
result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
408+
tm.assert_series_equal(result, expected)
409+
410+
411+
@pytest.mark.parametrize(
412+
"group_keys",
413+
[
414+
(1,),
415+
(1, 2),
416+
(2, 1),
417+
(1, 1, 2),
418+
(1, 2, 1),
419+
(1, 1, 2, 2),
420+
(1, 2, 3, 2, 3),
421+
(1, 1, 2) * 4,
422+
(1, 2, 3) * 5,
423+
],
424+
)
425+
@pytest.mark.parametrize("window_size", [1, 2, 3, 4, 5, 8, 20])
426+
def test_rolling_groupby_with_fixed_forward_many(group_keys, window_size):
427+
# GH 43267
428+
df = DataFrame(
429+
dict(
430+
a=np.array(list(group_keys)),
431+
b=np.arange(len(group_keys), dtype=np.float64) + 17,
432+
c=np.arange(len(group_keys), dtype=np.int64),
433+
)
434+
)
435+
436+
indexer = FixedForwardWindowIndexer(window_size=window_size)
437+
result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).sum()
438+
result.index.names = ["a", "c"]
439+
440+
groups = df.groupby("a")[["a", "b"]]
441+
manual = concat(
442+
[
443+
g.assign(
444+
b=[
445+
g["b"].iloc[i : i + window_size].sum(min_count=1)
446+
for i in range(len(g))
447+
]
448+
)
449+
for _, g in groups
450+
]
451+
)
452+
manual = manual.set_index(["a", "c"])["b"]
453+
454+
tm.assert_series_equal(result, manual)

0 commit comments

Comments
 (0)