Skip to content

Commit 9fc7317

Browse files
committed
BUG: Fixes to FixedForwardWindowIndexer and GroupbyIndexer (#43267)
1 parent beb7c48 commit 9fc7317

File tree

7 files changed

+205
-26
lines changed

7 files changed

+205
-26
lines changed

doc/source/whatsnew/v1.4.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ Groupby/resample/rolling
355355
- Bug in :meth:`pandas.DataFrame.ewm`, where non-float64 dtypes were silently failing (:issue:`42452`)
356356
- Bug in :meth:`pandas.DataFrame.rolling` operation along rows (``axis=1``) incorrectly omits columns containing ``float16`` and ``float32`` (:issue:`41779`)
357357
- Bug in :meth:`Resampler.aggregate` did not allow the use of Named Aggregation (:issue:`32803`)
358-
-
358+
- Bug in :meth:`pandas.DataFrame.groupby.rolling` and :class:`pandas.api.indexers.FixedForwardWindowIndexer` leading to segfaults and window endpoints being mixed across groups (:issue:`43267`)
359359

360360
Reshaping
361361
^^^^^^^^^

pandas/core/indexers/objects.py

+22-19
Original file line numberDiff line numberDiff line change
@@ -266,9 +266,8 @@ def get_window_bounds(
266266
)
267267

268268
start = np.arange(num_values, dtype="int64")
269-
end_s = start[: -self.window_size] + self.window_size
270-
end_e = np.full(self.window_size, num_values, dtype="int64")
271-
end = np.concatenate([end_s, end_e])
269+
end = start + self.window_size
270+
end[end > num_values] = num_values
272271

273272
return start, end
274273

@@ -280,7 +279,7 @@ def __init__(
280279
self,
281280
index_array: np.ndarray | None = None,
282281
window_size: int = 0,
283-
groupby_indicies: dict | None = None,
282+
groupby_indices: dict | None = None,
284283
window_indexer: type[BaseIndexer] = BaseIndexer,
285284
indexer_kwargs: dict | None = None,
286285
**kwargs,
@@ -294,7 +293,7 @@ def __init__(
294293
the groups
295294
window_size : int
296295
window size during the windowing operation
297-
groupby_indicies : dict or None
296+
groupby_indices : dict or None
298297
dict of {group label: [positional index of rows belonging to the group]}
299298
window_indexer : BaseIndexer
300299
BaseIndexer class determining the start and end bounds of each group
@@ -303,11 +302,13 @@ def __init__(
303302
**kwargs :
304303
keyword arguments that will be available when get_window_bounds is called
305304
"""
306-
self.groupby_indicies = groupby_indicies or {}
305+
self.groupby_indices = groupby_indices or {}
307306
self.window_indexer = window_indexer
308-
self.indexer_kwargs = indexer_kwargs or {}
307+
self.indexer_kwargs = indexer_kwargs.copy() if indexer_kwargs else {}
309308
super().__init__(
310-
index_array, self.indexer_kwargs.pop("window_size", window_size), **kwargs
309+
index_array=index_array,
310+
window_size=self.indexer_kwargs.pop("window_size", window_size),
311+
**kwargs,
311312
)
312313

313314
@Appender(get_window_bounds_doc)
@@ -323,8 +324,8 @@ def get_window_bounds(
323324
# 3) Append the window bounds in group order
324325
start_arrays = []
325326
end_arrays = []
326-
window_indicies_start = 0
327-
for key, indices in self.groupby_indicies.items():
327+
window_indices_start = 0
328+
for key, indices in self.groupby_indices.items():
328329
index_array: np.ndarray | None
329330

330331
if self.index_array is not None:
@@ -341,18 +342,20 @@ def get_window_bounds(
341342
)
342343
start = start.astype(np.int64)
343344
end = end.astype(np.int64)
344-
# Cannot use groupby_indicies as they might not be monotonic with the object
345+
# From get_window_bounds, those two should be equal in length of array
346+
assert len(start) == len(end)
347+
# Cannot use groupby_indices as they might not be monotonic with the object
345348
# we're rolling over
346-
window_indicies = np.arange(
347-
window_indicies_start, window_indicies_start + len(indices)
349+
window_indices = np.arange(
350+
window_indices_start, window_indices_start + len(indices)
348351
)
349-
window_indicies_start += len(indices)
352+
window_indices_start += len(indices)
350353
# Extend as we'll be slicing window like [start, end)
351-
window_indicies = np.append(
352-
window_indicies, [window_indicies[-1] + 1]
353-
).astype(np.int64)
354-
start_arrays.append(window_indicies.take(ensure_platform_int(start)))
355-
end_arrays.append(window_indicies.take(ensure_platform_int(end)))
354+
window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype(
355+
np.int64
356+
)
357+
start_arrays.append(window_indices.take(ensure_platform_int(start)))
358+
end_arrays.append(window_indices.take(ensure_platform_int(end)))
356359
start = np.concatenate(start_arrays)
357360
end = np.concatenate(end_arrays)
358361
return start, end

pandas/core/window/ewm.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -725,7 +725,7 @@ def _get_window_indexer(self) -> GroupbyIndexer:
725725
GroupbyIndexer
726726
"""
727727
window_indexer = GroupbyIndexer(
728-
groupby_indicies=self._grouper.indices,
728+
groupby_indices=self._grouper.indices,
729729
window_indexer=ExponentialMovingWindowIndexer,
730730
)
731731
return window_indexer

pandas/core/window/expanding.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,7 @@ def _get_window_indexer(self) -> GroupbyIndexer:
684684
GroupbyIndexer
685685
"""
686686
window_indexer = GroupbyIndexer(
687-
groupby_indicies=self._grouper.indices,
687+
groupby_indices=self._grouper.indices,
688688
window_indexer=ExpandingIndexer,
689689
)
690690
return window_indexer

pandas/core/window/rolling.py

+20-3
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ def __iter__(self):
299299
center=self.center,
300300
closed=self.closed,
301301
)
302+
302303
# From get_window_bounds, those two should be equal in length of array
303304
assert len(start) == len(end)
304305

@@ -529,6 +530,7 @@ def _apply(
529530
y : type of input
530531
"""
531532
window_indexer = self._get_window_indexer()
533+
532534
min_periods = (
533535
self.min_periods
534536
if self.min_periods is not None
@@ -542,12 +544,16 @@ def homogeneous_func(values: np.ndarray):
542544
return values.copy()
543545

544546
def calc(x):
547+
545548
start, end = window_indexer.get_window_bounds(
546549
num_values=len(x),
547550
min_periods=min_periods,
548551
center=self.center,
549552
closed=self.closed,
550553
)
554+
555+
# From get_window_bounds, those two should be equal in length of array
556+
assert len(start) == len(end)
551557
return func(x, start, end, min_periods, *numba_args)
552558

553559
with np.errstate(all="ignore"):
@@ -1429,6 +1435,10 @@ def cov_func(x, y):
14291435
center=self.center,
14301436
closed=self.closed,
14311437
)
1438+
1439+
# From get_window_bounds, those two should be equal in length of array
1440+
assert len(start) == len(end)
1441+
14321442
with np.errstate(all="ignore"):
14331443
mean_x_y = window_aggregations.roll_mean(
14341444
x_array * y_array, start, end, min_periods
@@ -1468,6 +1478,10 @@ def corr_func(x, y):
14681478
center=self.center,
14691479
closed=self.closed,
14701480
)
1481+
1482+
# From get_window_bounds, those two should be equal in length of array
1483+
assert len(start) == len(end)
1484+
14711485
with np.errstate(all="ignore"):
14721486
mean_x_y = window_aggregations.roll_mean(
14731487
x_array * y_array, start, end, min_periods
@@ -2341,26 +2355,29 @@ def _get_window_indexer(self) -> GroupbyIndexer:
23412355
rolling_indexer: type[BaseIndexer]
23422356
indexer_kwargs: dict[str, Any] | None = None
23432357
index_array = self._index_array
2358+
23442359
if isinstance(self.window, BaseIndexer):
23452360
rolling_indexer = type(self.window)
2346-
indexer_kwargs = self.window.__dict__
2361+
indexer_kwargs = self.window.__dict__.copy()
23472362
assert isinstance(indexer_kwargs, dict) # for mypy
23482363
# We'll be using the index of each group later
23492364
indexer_kwargs.pop("index_array", None)
2350-
window = 0
2365+
window = self.window
23512366
elif self._win_freq_i8 is not None:
23522367
rolling_indexer = VariableWindowIndexer
23532368
window = self._win_freq_i8
23542369
else:
23552370
rolling_indexer = FixedWindowIndexer
23562371
window = self.window
2372+
23572373
window_indexer = GroupbyIndexer(
23582374
index_array=index_array,
23592375
window_size=window,
2360-
groupby_indicies=self._grouper.indices,
2376+
groupby_indices=self._grouper.indices,
23612377
window_indexer=rolling_indexer,
23622378
indexer_kwargs=indexer_kwargs,
23632379
)
2380+
23642381
return window_indexer
23652382

23662383
def _validate_monotonic(self):

pandas/tests/groupby/test_missing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def test_min_count(func, min_count, value):
144144
tm.assert_frame_equal(result, expected)
145145

146146

147-
def test_indicies_with_missing():
147+
def test_indices_with_missing():
148148
# GH 9304
149149
df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
150150
g = df.groupby(["a", "b"])

pandas/tests/window/test_base_indexer.py

+159
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@
44
from pandas import (
55
DataFrame,
66
Series,
7+
MultiIndex,
8+
concat,
79
date_range,
810
)
911
import pandas._testing as tm
@@ -13,6 +15,7 @@
1315
)
1416
from pandas.core.indexers.objects import (
1517
ExpandingIndexer,
18+
FixedWindowIndexer,
1619
VariableOffsetWindowIndexer,
1720
)
1821

@@ -293,3 +296,159 @@ def get_window_bounds(self, num_values, min_periods, center, closed):
293296
result = getattr(df.rolling(indexer), func)(*args)
294297
expected = DataFrame({"values": values})
295298
tm.assert_frame_equal(result, expected)
299+
300+
301+
@pytest.mark.parametrize(
302+
"indexer_class", [FixedWindowIndexer, FixedForwardWindowIndexer, ExpandingIndexer]
303+
)
304+
@pytest.mark.parametrize("window_size", [1, 2, 12])
305+
@pytest.mark.parametrize(
306+
"df_data",
307+
[
308+
{"a": [1, 1], "b": [0, 1]},
309+
{"a": [1, 2], "b": [0, 1]},
310+
{"a": [1] * 16, "b": [np.nan, 1, 2, np.nan] + list(range(4, 16))},
311+
],
312+
)
313+
def test_indexers_are_reusable_after_groupby_rolling(
314+
indexer_class, window_size, df_data
315+
):
316+
# GH 43267
317+
df = DataFrame(df_data)
318+
num_trials = 3
319+
indexer = indexer_class(window_size=window_size)
320+
original_window_size = indexer.window_size
321+
for i in range(num_trials):
322+
df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
323+
assert indexer.window_size == original_window_size
324+
325+
326+
@pytest.mark.parametrize(
327+
"window_size, num_values, expected_start, expected_end",
328+
[
329+
(1, 1, [0], [1]),
330+
(1, 2, [0, 1], [1, 2]),
331+
(2, 1, [0], [1]),
332+
(2, 2, [0, 1], [2, 2]),
333+
(5, 12, range(12), list(range(5, 12)) + [12] * 5),
334+
(12, 5, range(5), [5] * 5),
335+
(0, 0, np.array([]), np.array([])),
336+
(1, 0, np.array([]), np.array([])),
337+
(0, 1, [0], [0]),
338+
],
339+
)
340+
def test_fixed_forward_indexer_bounds(
341+
window_size, num_values, expected_start, expected_end
342+
):
343+
# GH 43267
344+
indexer = FixedForwardWindowIndexer(window_size=window_size)
345+
start, end = indexer.get_window_bounds(num_values=num_values)
346+
347+
tm.assert_numpy_array_equal(start, np.array(expected_start), check_dtype=False)
348+
tm.assert_numpy_array_equal(end, np.array(expected_end), check_dtype=False)
349+
assert len(start) == len(end)
350+
351+
352+
@pytest.mark.parametrize(
353+
"df, window_size, expected",
354+
[
355+
(
356+
DataFrame({"b": [0, 1, 2], "a": [1, 2, 2]}),
357+
2,
358+
Series(
359+
[0, 1.5, 2.0],
360+
index=MultiIndex.from_arrays([[1, 2, 2], range(3)], names=["a", None]),
361+
name="b",
362+
dtype=np.float64,
363+
),
364+
),
365+
(
366+
DataFrame(
367+
{
368+
"b": [np.nan, 1, 2, np.nan] + list(range(4, 18)),
369+
"a": [1] * 7 + [2] * 11,
370+
"c": range(18),
371+
}
372+
),
373+
12,
374+
Series(
375+
[
376+
3.6,
377+
3.6,
378+
4.25,
379+
5.0,
380+
5.0,
381+
5.5,
382+
6.0,
383+
12.0,
384+
12.5,
385+
13.0,
386+
13.5,
387+
14.0,
388+
14.5,
389+
15.0,
390+
15.5,
391+
16.0,
392+
16.5,
393+
17.0,
394+
],
395+
index=MultiIndex.from_arrays(
396+
[[1] * 7 + [2] * 11, range(18)], names=["a", None]
397+
),
398+
name="b",
399+
dtype=np.float64,
400+
),
401+
),
402+
],
403+
)
404+
def test_rolling_groupby_with_fixed_forward_specific(df, window_size, expected):
405+
# GH 43267
406+
indexer = FixedForwardWindowIndexer(window_size=window_size)
407+
result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
408+
tm.assert_series_equal(result, expected)
409+
410+
411+
@pytest.mark.parametrize(
412+
"group_keys",
413+
[
414+
(1,),
415+
(1, 2),
416+
(2, 1),
417+
(1, 1, 2),
418+
(1, 2, 1),
419+
(1, 1, 2, 2),
420+
(1, 2, 3, 2, 3),
421+
(1, 1, 2) * 4,
422+
(1, 2, 3) * 5,
423+
],
424+
)
425+
@pytest.mark.parametrize("window_size", [1, 2, 3, 4, 5, 8, 20])
426+
def test_rolling_groupby_with_fixed_forward_many(group_keys, window_size):
427+
# GH 43267
428+
df = DataFrame(
429+
dict(
430+
a=np.array(list(group_keys)),
431+
b=np.arange(len(group_keys), dtype=np.float64) + 17,
432+
c=np.arange(len(group_keys), dtype=np.int64),
433+
)
434+
)
435+
436+
indexer = FixedForwardWindowIndexer(window_size=window_size)
437+
result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).sum()
438+
result.index.names = ["a", "c"]
439+
440+
groups = df.groupby("a")[["a", "b"]]
441+
manual = concat(
442+
[
443+
g.assign(
444+
b=[
445+
g["b"].iloc[i : i + window_size].sum(min_count=1)
446+
for i in range(len(g))
447+
]
448+
)
449+
for _, g in groups
450+
]
451+
)
452+
manual = manual.set_index(["a", "c"])["b"]
453+
454+
tm.assert_series_equal(result, manual)

0 commit comments

Comments
 (0)