diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 03f8dbc20b52e..be8ba92496d49 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -735,6 +735,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) diff --git a/pandas/src/algos_groupby_helper.pxi b/pandas/src/algos_groupby_helper.pxi index fb86c4efb7314..013a03f719bbd 100644 --- a/pandas/src/algos_groupby_helper.pxi +++ b/pandas/src/algos_groupby_helper.pxi @@ -1356,6 +1356,12 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels, ## reverse iterator if shifting backwards ii = offset + sign * i lab = labels[ii] + + # Skip null keys + if lab == -1: + out[ii] = -1 + continue + label_seen[lab] += 1 idxer_slot = label_seen[lab] % periods diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in index 6b9d8f07587bc..5c704436ce3a0 100644 --- a/pandas/src/algos_groupby_helper.pxi.in +++ b/pandas/src/algos_groupby_helper.pxi.in @@ -700,6 +700,12 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels, ## reverse iterator if shifting backwards ii = offset + sign * i lab = labels[ii] + + # Skip null keys + if lab == -1: + out[ii] = -1 + continue + label_seen[lab] += 1 idxer_slot = label_seen[lab] % periods diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 3f5b4152afe31..448d0c875b5c8 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -6560,6 
+6560,31 @@ def test_grouping_string_repr(self): expected = "Grouping(('A', 'a'))" tm.assert_equal(result, expected) + def test_group_shift_with_null_key(self): + # This test is designed to replicate the segfault in issue #13813. + n_rows = 1200 + + # Generate a moderately large dataframe with occasional missing + # values in column `B`, and then group by [`A`, `B`]. This should + # force `-1` in `labels` array of `gr_.grouper.group_info` exactly + # at those places where the group-by key is partially missing. + df = pd.DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) + for i in range(n_rows)], dtype=float, + columns=["A", "B", "Z"], index=None) + gr_ = df.groupby(["A", "B"]) + + # Generate the expected dataframe + expected = pd.DataFrame([(i % 12, i % 3 if i % 3 else np.nan, + i + 12 if i % 3 and i < n_rows - 12 + else np.nan) + for i in range(n_rows)], dtype=float, + columns=["A", "B", "Z"], index=None) + result = gr_.shift(-1) + + # Check for data grabbed from beyond the acceptable array bounds + # in case there was no segfault. + tm.assert_frame_equal(result, expected[["Z"]]) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all()