Skip to content

Commit 5c5a1f2

Browse files
committed
BUG: Fixes to FixedForwardWindowIndexer and GroupbyIndexer (pandas-dev#43267)
1 parent beb7c48 commit 5c5a1f2

File tree

8 files changed

+134
-27
lines changed

8 files changed

+134
-27
lines changed

Diff for: doc/source/whatsnew/v1.4.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -355,7 +355,7 @@ Groupby/resample/rolling
355355
- Bug in :meth:`pandas.DataFrame.ewm`, where non-float64 dtypes were silently failing (:issue:`42452`)
356356
- Bug in :meth:`pandas.DataFrame.rolling` operation along rows (``axis=1``) incorrectly omits columns containing ``float16`` and ``float32`` (:issue:`41779`)
357357
- Bug in :meth:`Resampler.aggregate` did not allow the use of Named Aggregation (:issue:`32803`)
358-
-
358+
- Bug in :class:`GroupbyIndexer` and :class:`FixedForwardWindowIndexer` leading to segfaults and incorrect windows (:issue:`43267`)
359359

360360
Reshaping
361361
^^^^^^^^^

Diff for: pandas/core/indexers/objects.py

+18-19
Original file line numberDiff line numberDiff line change
@@ -93,7 +93,6 @@ def get_window_bounds(
9393

9494
end = np.clip(end, 0, num_values)
9595
start = np.clip(start, 0, num_values)
96-
9796
return start, end
9897

9998

@@ -266,9 +265,8 @@ def get_window_bounds(
266265
)
267266

268267
start = np.arange(num_values, dtype="int64")
269-
end_s = start[: -self.window_size] + self.window_size
270-
end_e = np.full(self.window_size, num_values, dtype="int64")
271-
end = np.concatenate([end_s, end_e])
268+
end = start + self.window_size
269+
end[end > num_values] = num_values
272270

273271
return start, end
274272

@@ -280,7 +278,7 @@ def __init__(
280278
self,
281279
index_array: np.ndarray | None = None,
282280
window_size: int = 0,
283-
groupby_indicies: dict | None = None,
281+
groupby_indices: dict | None = None,
284282
window_indexer: type[BaseIndexer] = BaseIndexer,
285283
indexer_kwargs: dict | None = None,
286284
**kwargs,
@@ -294,7 +292,7 @@ def __init__(
294292
the groups
295293
window_size : int
296294
window size during the windowing operation
297-
groupby_indicies : dict or None
295+
groupby_indices : dict or None
298296
dict of {group label: [positional index of rows belonging to the group]}
299297
window_indexer : BaseIndexer
300298
BaseIndexer class determining the start and end bounds of each group
@@ -303,11 +301,12 @@ def __init__(
303301
**kwargs :
304302
keyword arguments that will be available when get_window_bounds is called
305303
"""
306-
self.groupby_indicies = groupby_indicies or {}
304+
self.groupby_indices = groupby_indices or {}
307305
self.window_indexer = window_indexer
308-
self.indexer_kwargs = indexer_kwargs or {}
306+
self.indexer_kwargs = indexer_kwargs.copy() if indexer_kwargs else {}
309307
super().__init__(
310-
index_array, self.indexer_kwargs.pop("window_size", window_size), **kwargs
308+
index_array=index_array,
309+
window_size=self.indexer_kwargs.pop("window_size", window_size), **kwargs
311310
)
312311

313312
@Appender(get_window_bounds_doc)
@@ -323,8 +322,8 @@ def get_window_bounds(
323322
# 3) Append the window bounds in group order
324323
start_arrays = []
325324
end_arrays = []
326-
window_indicies_start = 0
327-
for key, indices in self.groupby_indicies.items():
325+
window_indices_start = 0
326+
for key, indices in self.groupby_indices.items():
328327
index_array: np.ndarray | None
329328

330329
if self.index_array is not None:
@@ -341,18 +340,18 @@ def get_window_bounds(
341340
)
342341
start = start.astype(np.int64)
343342
end = end.astype(np.int64)
344-
# Cannot use groupby_indicies as they might not be monotonic with the object
343+
# Cannot use groupby_indices as they might not be monotonic with the object
345344
# we're rolling over
346-
window_indicies = np.arange(
347-
window_indicies_start, window_indicies_start + len(indices)
345+
window_indices = np.arange(
346+
window_indices_start, window_indices_start + len(indices)
348347
)
349-
window_indicies_start += len(indices)
348+
window_indices_start += len(indices)
350349
# Extend as we'll be slicing window like [start, end)
351-
window_indicies = np.append(
352-
window_indicies, [window_indicies[-1] + 1]
350+
window_indices = np.append(
351+
window_indices, [window_indices[-1] + 1]
353352
).astype(np.int64)
354-
start_arrays.append(window_indicies.take(ensure_platform_int(start)))
355-
end_arrays.append(window_indicies.take(ensure_platform_int(end)))
353+
start_arrays.append(window_indices.take(ensure_platform_int(start)))
354+
end_arrays.append(window_indices.take(ensure_platform_int(end)))
356355
start = np.concatenate(start_arrays)
357356
end = np.concatenate(end_arrays)
358357
return start, end

Diff for: pandas/core/window/ewm.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -725,7 +725,7 @@ def _get_window_indexer(self) -> GroupbyIndexer:
725725
GroupbyIndexer
726726
"""
727727
window_indexer = GroupbyIndexer(
728-
groupby_indicies=self._grouper.indices,
728+
groupby_indices=self._grouper.indices,
729729
window_indexer=ExponentialMovingWindowIndexer,
730730
)
731731
return window_indexer

Diff for: pandas/core/window/expanding.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -684,7 +684,7 @@ def _get_window_indexer(self) -> GroupbyIndexer:
684684
GroupbyIndexer
685685
"""
686686
window_indexer = GroupbyIndexer(
687-
groupby_indicies=self._grouper.indices,
687+
groupby_indices=self._grouper.indices,
688688
window_indexer=ExpandingIndexer,
689689
)
690690
return window_indexer

Diff for: pandas/core/window/rolling.py

+20-3
Original file line numberDiff line numberDiff line change
@@ -299,6 +299,7 @@ def __iter__(self):
299299
center=self.center,
300300
closed=self.closed,
301301
)
302+
302303
# From get_window_bounds, those two should be equal in length of array
303304
assert len(start) == len(end)
304305

@@ -529,6 +530,7 @@ def _apply(
529530
y : type of input
530531
"""
531532
window_indexer = self._get_window_indexer()
533+
532534
min_periods = (
533535
self.min_periods
534536
if self.min_periods is not None
@@ -542,12 +544,16 @@ def homogeneous_func(values: np.ndarray):
542544
return values.copy()
543545

544546
def calc(x):
547+
545548
start, end = window_indexer.get_window_bounds(
546549
num_values=len(x),
547550
min_periods=min_periods,
548551
center=self.center,
549552
closed=self.closed,
550553
)
554+
555+
# From get_window_bounds, those two should be equal in length of array
556+
assert len(start) == len(end)
551557
return func(x, start, end, min_periods, *numba_args)
552558

553559
with np.errstate(all="ignore"):
@@ -1429,6 +1435,10 @@ def cov_func(x, y):
14291435
center=self.center,
14301436
closed=self.closed,
14311437
)
1438+
1439+
# From get_window_bounds, those two should be equal in length of array
1440+
assert len(start) == len(end)
1441+
14321442
with np.errstate(all="ignore"):
14331443
mean_x_y = window_aggregations.roll_mean(
14341444
x_array * y_array, start, end, min_periods
@@ -1468,6 +1478,10 @@ def corr_func(x, y):
14681478
center=self.center,
14691479
closed=self.closed,
14701480
)
1481+
1482+
# From get_window_bounds, those two should be equal in length of array
1483+
assert len(start) == len(end)
1484+
14711485
with np.errstate(all="ignore"):
14721486
mean_x_y = window_aggregations.roll_mean(
14731487
x_array * y_array, start, end, min_periods
@@ -2341,26 +2355,29 @@ def _get_window_indexer(self) -> GroupbyIndexer:
23412355
rolling_indexer: type[BaseIndexer]
23422356
indexer_kwargs: dict[str, Any] | None = None
23432357
index_array = self._index_array
2358+
23442359
if isinstance(self.window, BaseIndexer):
23452360
rolling_indexer = type(self.window)
2346-
indexer_kwargs = self.window.__dict__
2361+
indexer_kwargs = self.window.__dict__.copy()
23472362
assert isinstance(indexer_kwargs, dict) # for mypy
23482363
# We'll be using the index of each group later
23492364
indexer_kwargs.pop("index_array", None)
2350-
window = 0
2365+
window = self.window
23512366
elif self._win_freq_i8 is not None:
23522367
rolling_indexer = VariableWindowIndexer
23532368
window = self._win_freq_i8
23542369
else:
23552370
rolling_indexer = FixedWindowIndexer
23562371
window = self.window
2372+
23572373
window_indexer = GroupbyIndexer(
23582374
index_array=index_array,
23592375
window_size=window,
2360-
groupby_indicies=self._grouper.indices,
2376+
groupby_indices=self._grouper.indices,
23612377
window_indexer=rolling_indexer,
23622378
indexer_kwargs=indexer_kwargs,
23632379
)
2380+
23642381
return window_indexer
23652382

23662383
def _validate_monotonic(self):

Diff for: pandas/tests/groupby/test_missing.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,7 @@ def test_min_count(func, min_count, value):
144144
tm.assert_frame_equal(result, expected)
145145

146146

147-
def test_indicies_with_missing():
147+
def test_indices_with_missing():
148148
# GH 9304
149149
df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
150150
g = df.groupby(["a", "b"])

Diff for: pandas/tests/window/test_base_indexer.py

+42
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
)
1414
from pandas.core.indexers.objects import (
1515
ExpandingIndexer,
16+
FixedWindowIndexer,
1617
VariableOffsetWindowIndexer,
1718
)
1819

@@ -293,3 +294,44 @@ def get_window_bounds(self, num_values, min_periods, center, closed):
293294
result = getattr(df.rolling(indexer), func)(*args)
294295
expected = DataFrame({"values": values})
295296
tm.assert_frame_equal(result, expected)
297+
298+
299+
@pytest.mark.parametrize('indexer_class', [FixedWindowIndexer,
300+
FixedForwardWindowIndexer,
301+
ExpandingIndexer])
302+
@pytest.mark.parametrize('window_size', [1, 2, 12])
303+
@pytest.mark.parametrize("df", [
304+
DataFrame({"a": [1, 1], "b": [0, 1]}),
305+
DataFrame({"a": [1, 2], "b": [0, 1]}),
306+
DataFrame({'b': [np.nan, 1, 2, np.nan] + list(range(4, 16))}).assign(a=1)
307+
])
308+
def test_indexers_are_reusable_after_groupby_rolling(indexer_class, window_size, df):
309+
# GH 43267
310+
num_trials = 3
311+
indexer = indexer_class(window_size=window_size)
312+
original_window_size = indexer.window_size
313+
for i in range(num_trials):
314+
df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
315+
assert indexer.window_size == original_window_size
316+
317+
318+
@pytest.mark.parametrize('window_size, num_values, expected_start, expected_end', [
319+
(1, 1, [0], [1]),
320+
(1, 2, [0, 1], [1, 2]),
321+
(2, 1, [0], [1]),
322+
(2, 2, [0, 1], [2, 2]),
323+
(5, 12, range(12), list(range(5, 12)) + [12] * 5),
324+
(12, 5, range(5), [5] * 5),
325+
(0, 0, np.array([], dtype='int64'), np.array([], dtype='int64')),
326+
(1, 0, np.array([], dtype='int64'), np.array([], dtype='int64')),
327+
(0, 1, [0], [0]),
328+
])
329+
def test_fixed_forward_indexer_bounds(window_size, num_values,
330+
expected_start, expected_end):
331+
# GH 43267
332+
indexer = FixedForwardWindowIndexer(window_size=window_size)
333+
start, end = indexer.get_window_bounds(num_values=num_values)
334+
335+
tm.assert_equal(start, np.array(expected_start))
336+
tm.assert_equal(end, np.array(expected_end))
337+
assert len(start) == len(end)

Diff for: pandas/tests/window/test_rolling.py

+50-1
Original file line numberDiff line numberDiff line change
@@ -19,13 +19,14 @@
1919
Series,
2020
Timedelta,
2121
Timestamp,
22+
concat,
2223
date_range,
2324
period_range,
2425
to_datetime,
2526
to_timedelta,
2627
)
2728
import pandas._testing as tm
28-
from pandas.api.indexers import BaseIndexer
29+
from pandas.api.indexers import BaseIndexer, FixedForwardWindowIndexer
2930
from pandas.core.window import Rolling
3031

3132

@@ -1500,3 +1501,51 @@ def test_rolling_numeric_dtypes():
15001501
dtype="float64",
15011502
)
15021503
tm.assert_frame_equal(result, expected)
1504+
1505+
1506+
@pytest.mark.parametrize('df, window_size, expected', [
1507+
(DataFrame({"b": [0, 1, 2], "a": [1, 2, 2]}),
1508+
2,
1509+
Series([0, 1.5, 2.0],
1510+
index=MultiIndex.from_arrays([[1, 2, 2], range(3)], names=['a', None]),
1511+
name="b", dtype=np.float64)),
1512+
(DataFrame({"b": [np.nan, 1, 2, np.nan] + list(range(4, 18)),
1513+
"a": [1] * 7 + [2] * 11,
1514+
"c": range(18)}),
1515+
12,
1516+
Series([3.6, 3.6, 4.25, 5., 5., 5.5, 6., 12., 12.5,
1517+
13., 13.5, 14., 14.5, 15., 15.5, 16., 16.5, 17.],
1518+
index=MultiIndex.from_arrays([[1] * 7 + [2] * 11, range(18)],
1519+
names=["a", None]),
1520+
name="b",
1521+
dtype=np.float64))])
1522+
def test_rolling_groupby_with_fixed_forward_specific(df, window_size, expected):
1523+
# GH 43267
1524+
indexer = FixedForwardWindowIndexer(window_size=window_size)
1525+
result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).mean()
1526+
tm.assert_series_equal(result, expected)
1527+
1528+
1529+
@pytest.mark.parametrize('group_keys', [
1530+
(1,), (1, 2), (2, 1),
1531+
(1, 1, 2), (1, 2, 1), (1, 1, 2, 2), (1, 2, 3, 2, 3),
1532+
(1, 1, 2) * 4,
1533+
(1, 2, 3) * 5,
1534+
])
1535+
@pytest.mark.parametrize('window_size', [1, 2, 3, 4, 5, 8, 20])
1536+
def test_rolling_groupby_with_fixed_forward_many(group_keys, window_size):
1537+
# GH 43267
1538+
df = DataFrame(dict(a=np.array(list(group_keys)),
1539+
b=np.arange(len(group_keys), dtype=np.float64) + 17,
1540+
c=np.arange(len(group_keys), dtype=np.int64)))
1541+
1542+
indexer = FixedForwardWindowIndexer(window_size=window_size)
1543+
result = df.groupby("a")["b"].rolling(window=indexer, min_periods=1).sum()
1544+
result.index.names = ['a', 'c']
1545+
1546+
groups = df.groupby("a")[["a", "b"]]
1547+
manual = concat([g.assign(b=[g['b'].iloc[i:i + window_size].sum(min_count=1)
1548+
for i in range(len(g))]) for _, g in groups])
1549+
manual = manual.set_index(["a", "c"])["b"]
1550+
1551+
tm.assert_series_equal(result, manual)

0 commit comments

Comments
 (0)