Skip to content

Commit 776329f

Browse files
dsm054jrebackmroeschke
authored
BUG: Fixes to FixedForwardWindowIndexer and GroupbyIndexer (#43267) (#43291)
* BUG: Fixes to FixedForwardWindowIndexer and GroupbyIndexer (#43267) * Undo whitespace changes * Fix type annotation of GroupbyIndexer * Ignore typing check for window_size since BaseIndexer is public * Update objects.py * Fix typing * Update per comments Co-authored-by: Jeff Reback <[email protected]> Co-authored-by: Matthew Roeschke <[email protected]>
1 parent 55f0654 commit 776329f

File tree

7 files changed

+210
-29
lines changed

7 files changed

+210
-29
lines changed

Diff for: doc/source/whatsnew/v1.3.4.rst

+1
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Fixed regressions
3333

3434
Bug fixes
3535
~~~~~~~~~
36+
- Fixed bug in :meth:`pandas.DataFrame.groupby.rolling` and :class:`pandas.api.indexers.FixedForwardWindowIndexer` leading to segfaults and window endpoints being mixed across groups (:issue:`43267`)
3637
- Fixed bug in :meth:`.GroupBy.mean` with datetimelike values including ``NaT`` values returning incorrect results (:issue:`43132`)
3738
- Fixed bug in :meth:`Series.aggregate` not passing the first ``args`` to the user supplied ``func`` in certain cases (:issue:`43357`)
3839
- Fixed memory leaks in :meth:`Series.rolling.quantile` and :meth:`Series.rolling.median` (:issue:`43339`)

Diff for: pandas/core/indexers/objects.py

+26-21
Original file line numberDiff line numberDiff line change
@@ -266,9 +266,9 @@ def get_window_bounds(
266266
)
267267

268268
start = np.arange(num_values, dtype="int64")
269-
end_s = start[: -self.window_size] + self.window_size
270-
end_e = np.full(self.window_size, num_values, dtype="int64")
271-
end = np.concatenate([end_s, end_e])
269+
end = start + self.window_size
270+
if self.window_size:
271+
end[-self.window_size :] = num_values
272272

273273
return start, end
274274

@@ -279,8 +279,8 @@ class GroupbyIndexer(BaseIndexer):
279279
def __init__(
280280
self,
281281
index_array: np.ndarray | None = None,
282-
window_size: int = 0,
283-
groupby_indicies: dict | None = None,
282+
window_size: int | BaseIndexer = 0,
283+
groupby_indices: dict | None = None,
284284
window_indexer: type[BaseIndexer] = BaseIndexer,
285285
indexer_kwargs: dict | None = None,
286286
**kwargs,
@@ -292,9 +292,9 @@ def __init__(
292292
np.ndarray of the index of the original object that we are performing
293293
a chained groupby operation over. This index has been pre-sorted relative to
294294
the groups
295-
window_size : int
295+
window_size : int or BaseIndexer
296296
window size during the windowing operation
297-
groupby_indicies : dict or None
297+
groupby_indices : dict or None
298298
dict of {group label: [positional index of rows belonging to the group]}
299299
window_indexer : BaseIndexer
300300
BaseIndexer class determining the start and end bounds of each group
@@ -303,11 +303,13 @@ def __init__(
303303
**kwargs :
304304
keyword arguments that will be available when get_window_bounds is called
305305
"""
306-
self.groupby_indicies = groupby_indicies or {}
306+
self.groupby_indices = groupby_indices or {}
307307
self.window_indexer = window_indexer
308-
self.indexer_kwargs = indexer_kwargs or {}
308+
self.indexer_kwargs = indexer_kwargs.copy() if indexer_kwargs else {}
309309
super().__init__(
310-
index_array, self.indexer_kwargs.pop("window_size", window_size), **kwargs
310+
index_array=index_array,
311+
window_size=self.indexer_kwargs.pop("window_size", window_size),
312+
**kwargs,
311313
)
312314

313315
@Appender(get_window_bounds_doc)
@@ -323,8 +325,8 @@ def get_window_bounds(
323325
# 3) Append the window bounds in group order
324326
start_arrays = []
325327
end_arrays = []
326-
window_indicies_start = 0
327-
for key, indices in self.groupby_indicies.items():
328+
window_indices_start = 0
329+
for key, indices in self.groupby_indices.items():
328330
index_array: np.ndarray | None
329331

330332
if self.index_array is not None:
@@ -341,18 +343,21 @@ def get_window_bounds(
341343
)
342344
start = start.astype(np.int64)
343345
end = end.astype(np.int64)
344-
# Cannot use groupby_indicies as they might not be monotonic with the object
346+
assert len(start) == len(
347+
end
348+
), "these should be equal in length from get_window_bounds"
349+
# Cannot use groupby_indices as they might not be monotonic with the object
345350
# we're rolling over
346-
window_indicies = np.arange(
347-
window_indicies_start, window_indicies_start + len(indices)
351+
window_indices = np.arange(
352+
window_indices_start, window_indices_start + len(indices)
348353
)
349-
window_indicies_start += len(indices)
354+
window_indices_start += len(indices)
350355
# Extend as we'll be slicing window like [start, end)
351-
window_indicies = np.append(
352-
window_indicies, [window_indicies[-1] + 1]
353-
).astype(np.int64)
354-
start_arrays.append(window_indicies.take(ensure_platform_int(start)))
355-
end_arrays.append(window_indicies.take(ensure_platform_int(end)))
356+
window_indices = np.append(window_indices, [window_indices[-1] + 1]).astype(
357+
np.int64, copy=False
358+
)
359+
start_arrays.append(window_indices.take(ensure_platform_int(start)))
360+
end_arrays.append(window_indices.take(ensure_platform_int(end)))
356361
start = np.concatenate(start_arrays)
357362
end = np.concatenate(end_arrays)
358363
return start, end

Diff for: pandas/core/window/ewm.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -798,7 +798,7 @@ def _get_window_indexer(self) -> GroupbyIndexer:
798798
GroupbyIndexer
799799
"""
800800
window_indexer = GroupbyIndexer(
801-
groupby_indicies=self._grouper.indices,
801+
groupby_indices=self._grouper.indices,
802802
window_indexer=ExponentialMovingWindowIndexer,
803803
)
804804
return window_indexer

Diff for: pandas/core/window/expanding.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -760,7 +760,7 @@ def _get_window_indexer(self) -> GroupbyIndexer:
760760
GroupbyIndexer
761761
"""
762762
window_indexer = GroupbyIndexer(
763-
groupby_indicies=self._grouper.indices,
763+
groupby_indices=self._grouper.indices,
764764
window_indexer=ExpandingIndexer,
765765
)
766766
return window_indexer

Diff for: pandas/core/window/rolling.py

+21-5
Original file line numberDiff line numberDiff line change
@@ -311,8 +311,10 @@ def __iter__(self):
311311
center=self.center,
312312
closed=self.closed,
313313
)
314-
# From get_window_bounds, those two should be equal in length of array
315-
assert len(start) == len(end)
314+
315+
assert len(start) == len(
316+
end
317+
), "these should be equal in length from get_window_bounds"
316318

317319
for s, e in zip(start, end):
318320
result = obj.iloc[slice(s, e)]
@@ -563,6 +565,10 @@ def calc(x):
563565
center=self.center,
564566
closed=self.closed,
565567
)
568+
assert len(start) == len(
569+
end
570+
), "these should be equal in length from get_window_bounds"
571+
566572
return func(x, start, end, min_periods, *numba_args)
567573

568574
with np.errstate(all="ignore"):
@@ -1501,6 +1507,11 @@ def cov_func(x, y):
15011507
center=self.center,
15021508
closed=self.closed,
15031509
)
1510+
1511+
assert len(start) == len(
1512+
end
1513+
), "these should be equal in length from get_window_bounds"
1514+
15041515
with np.errstate(all="ignore"):
15051516
mean_x_y = window_aggregations.roll_mean(
15061517
x_array * y_array, start, end, min_periods
@@ -1540,6 +1551,11 @@ def corr_func(x, y):
15401551
center=self.center,
15411552
closed=self.closed,
15421553
)
1554+
1555+
assert len(start) == len(
1556+
end
1557+
), "these should be equal in length from get_window_bounds"
1558+
15431559
with np.errstate(all="ignore"):
15441560
mean_x_y = window_aggregations.roll_mean(
15451561
x_array * y_array, start, end, min_periods
@@ -2490,11 +2506,11 @@ def _get_window_indexer(self) -> GroupbyIndexer:
24902506
index_array = self._index_array
24912507
if isinstance(self.window, BaseIndexer):
24922508
rolling_indexer = type(self.window)
2493-
indexer_kwargs = self.window.__dict__
2509+
indexer_kwargs = self.window.__dict__.copy()
24942510
assert isinstance(indexer_kwargs, dict) # for mypy
24952511
# We'll be using the index of each group later
24962512
indexer_kwargs.pop("index_array", None)
2497-
window = 0
2513+
window = self.window
24982514
elif self._win_freq_i8 is not None:
24992515
rolling_indexer = VariableWindowIndexer
25002516
window = self._win_freq_i8
@@ -2504,7 +2520,7 @@ def _get_window_indexer(self) -> GroupbyIndexer:
25042520
window_indexer = GroupbyIndexer(
25052521
index_array=index_array,
25062522
window_size=window,
2507-
groupby_indicies=self._grouper.indices,
2523+
groupby_indices=self._grouper.indices,
25082524
window_indexer=rolling_indexer,
25092525
indexer_kwargs=indexer_kwargs,
25102526
)

Diff for: pandas/tests/groupby/test_missing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -146,7 +146,7 @@ def test_min_count(func, min_count, value):
146146
tm.assert_frame_equal(result, expected)
147147

148148

149-
def test_indicies_with_missing():
149+
def test_indices_with_missing():
150150
# GH 9304
151151
df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
152152
g = df.groupby(["a", "b"])

Diff for: pandas/tests/window/test_base_indexer.py

+159
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,9 @@
33

44
from pandas import (
55
DataFrame,
6+
MultiIndex,
67
Series,
8+
concat,
79
date_range,
810
)
911
import pandas._testing as tm
@@ -13,6 +15,7 @@
1315
)
1416
from pandas.core.indexers.objects import (
1517
ExpandingIndexer,
18+
FixedWindowIndexer,
1619
VariableOffsetWindowIndexer,
1720
)
1821

@@ -293,3 +296,159 @@ def get_window_bounds(self, num_values, min_periods, center, closed):
293296
result = getattr(df.rolling(indexer), func)(*args)
294297
expected = DataFrame({"values": values})
295298
tm.assert_frame_equal(result, expected)
299+
300+
301+
@pytest.mark.parametrize(
302+
"indexer_class", [FixedWindowIndexer, FixedForwardWindowIndexer, ExpandingIndexer]
303+
)
304+
@pytest.mark.parametrize("window_size", [1, 2, 12])
305+
@pytest.mark.parametrize(
306+
"df_data",
307+
[
308+
{"a": [1, 1], "b": [0, 1]},
309+
{"a": [1, 2], "b": [0, 1]},
310+
{"a": [1] * 16, "b": [np.nan, 1, 2, np.nan] + list(range(4, 16))},
311+
],
312+
)
313+
def test_indexers_are_reusable_after_groupby_rolling(
314+
indexer_class, window_size, df_data
315+
):
316+
# GH 43267
317+
df = DataFrame(df_data)
318+
num_trials = 3
319+
indexer = indexer_class(window_size=window_size)
320+
original_window_size = indexer.window_size
321+
for i in range(num_trials):
322+
df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
323+
assert indexer.window_size == original_window_size
324+
325+
326+
@pytest.mark.parametrize(
327+
"window_size, num_values, expected_start, expected_end",
328+
[
329+
(1, 1, [0], [1]),
330+
(1, 2, [0, 1], [1, 2]),
331+
(2, 1, [0], [1]),
332+
(2, 2, [0, 1], [2, 2]),
333+
(5, 12, range(12), list(range(5, 12)) + [12] * 5),
334+
(12, 5, range(5), [5] * 5),
335+
(0, 0, np.array([]), np.array([])),
336+
(1, 0, np.array([]), np.array([])),
337+
(0, 1, [0], [0]),
338+
],
339+
)
340+
def test_fixed_forward_indexer_bounds(
341+
window_size, num_values, expected_start, expected_end
342+
):
343+
# GH 43267
344+
indexer = FixedForwardWindowIndexer(window_size=window_size)
345+
start, end = indexer.get_window_bounds(num_values=num_values)
346+
347+
tm.assert_numpy_array_equal(start, np.array(expected_start), check_dtype=False)
348+
tm.assert_numpy_array_equal(end, np.array(expected_end), check_dtype=False)
349+
assert len(start) == len(end)
350+
351+
352+
@pytest.mark.parametrize(
353+
"df, window_size, expected",
354+
[
355+
(
356+
DataFrame({"b": [0, 1, 2], "a": [1, 2, 2]}),
357+
2,
358+
Series(
359+
[0, 1.5, 2.0],
360+
index=MultiIndex.from_arrays([[1, 2, 2], range(3)], names=["a", None]),
361+
name="b",
362+
dtype=np.float64,
363+
),
364+
),
365+
(
366+
DataFrame(
367+
{
368+
"b": [np.nan, 1, 2, np.nan] + list(range(4, 18)),
369+
"a": [1] * 7 + [2] * 11,
370+
"c": range(18),
371+
}
372+
),
373+
12,
374+
Series(
375+
[
376+
3.6,
377+
3.6,
378+
4.25,
379+
5.0,
380+
5.0,
381+
5.5,
382+
6.0,
383+
12.0,
384+
12.5,
385+
13.0,
386+
13.5,
387+
14.0,
388+
14.5,
389+
15.0,
390+
15.5,
391+
16.0,
392+
16.5,
393+
17.0,
394+
],
395+
index=MultiIndex.from_arrays(
396+
[[1] * 7 + [2] * 11, range(18)], names=["a", None]
397+
),
398+
name="b",
399+
dtype=np.float64,
400+
),
401+
),
402+
],
403+
)
404+
def test_rolling_groupby_with_fixed_forward_specific(df, window_size, expected):
405+
# GH 43267
406+
indexer = FixedForwardWindowIndexer(window_size=window_size)
407+
result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
408+
tm.assert_series_equal(result, expected)
409+
410+
411+
@pytest.mark.parametrize(
412+
"group_keys",
413+
[
414+
(1,),
415+
(1, 2),
416+
(2, 1),
417+
(1, 1, 2),
418+
(1, 2, 1),
419+
(1, 1, 2, 2),
420+
(1, 2, 3, 2, 3),
421+
(1, 1, 2) * 4,
422+
(1, 2, 3) * 5,
423+
],
424+
)
425+
@pytest.mark.parametrize("window_size", [1, 2, 3, 4, 5, 8, 20])
426+
def test_rolling_groupby_with_fixed_forward_many(group_keys, window_size):
427+
# GH 43267
428+
df = DataFrame(
429+
{
430+
"a": np.array(list(group_keys)),
431+
"b": np.arange(len(group_keys), dtype=np.float64) + 17,
432+
"c": np.arange(len(group_keys), dtype=np.int64),
433+
}
434+
)
435+
436+
indexer = FixedForwardWindowIndexer(window_size=window_size)
437+
result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).sum()
438+
result.index.names = ["a", "c"]
439+
440+
groups = df.groupby("a")[["a", "b"]]
441+
manual = concat(
442+
[
443+
g.assign(
444+
b=[
445+
g["b"].iloc[i : i + window_size].sum(min_count=1)
446+
for i in range(len(g))
447+
]
448+
)
449+
for _, g in groups
450+
]
451+
)
452+
manual = manual.set_index(["a", "c"])["b"]
453+
454+
tm.assert_series_equal(result, manual)

0 commit comments

Comments
 (0)