From 966d5c63fcebcc117dfb05b6086d0c3a025b9e88 Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Wed, 27 Jul 2016 16:07:07 +0300 Subject: [PATCH 1/6] BUG: group_shift_indexer checks for null group keys --- pandas/src/algos_groupby_helper.pxi | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/src/algos_groupby_helper.pxi b/pandas/src/algos_groupby_helper.pxi index fb86c4efb7314..8e289544a072e 100644 --- a/pandas/src/algos_groupby_helper.pxi +++ b/pandas/src/algos_groupby_helper.pxi @@ -1356,6 +1356,11 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels, ## reverse iterator if shifting backwards ii = offset + sign * i lab = labels[ii] + + # Skip null keys + if lab == -1: + continue + label_seen[lab] += 1 idxer_slot = label_seen[lab] % periods From fe2f0ecf7f510a8ee10b62cde9644687cb97fecc Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Wed, 27 Jul 2016 21:49:25 +0300 Subject: [PATCH 2/6] Treat incomplete group keys as distinct when shifting --- pandas/src/algos_groupby_helper.pxi | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/src/algos_groupby_helper.pxi b/pandas/src/algos_groupby_helper.pxi index 8e289544a072e..013a03f719bbd 100644 --- a/pandas/src/algos_groupby_helper.pxi +++ b/pandas/src/algos_groupby_helper.pxi @@ -1359,6 +1359,7 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels, # Skip null keys if lab == -1: + out[ii] = -1 continue label_seen[lab] += 1 From 94bae0bdf069971d40030a4c60a369eb4783a33a Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Thu, 28 Jul 2016 08:37:27 +0300 Subject: [PATCH 3/6] Patched the template, and added a test for '.shift()' --- pandas/src/algos_groupby_helper.pxi.in | 6 ++++++ pandas/tests/test_groupby.py | 25 +++++++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in index 6b9d8f07587bc..5c704436ce3a0 100644 --- a/pandas/src/algos_groupby_helper.pxi.in +++ b/pandas/src/algos_groupby_helper.pxi.in @@ -700,6 +700,12 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels, ## reverse iterator if shifting backwards ii = offset + sign * i lab = labels[ii] + + # Skip null keys + if lab == -1: + out[ii] = -1 + continue + label_seen[lab] += 1 idxer_slot = label_seen[lab] % periods diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 3f5b4152afe31..8ac5d88822259 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -6560,6 +6560,31 @@ def test_grouping_string_repr(self): expected = "Grouping(('A', 'a'))" tm.assert_equal(result, expected) + def test_group_shift_with_null_key(self): + # This test is designed to replicate the segfault in issue #13813. + n_rows = 1200 + + # Generate a moderately large dataframe with occasional missing + # values in column `B`, and then group by [`A`, `B`]. This should + # force `-1` in `labels` array of `gr_.grouper.group_info` exactly + # at those places, where the group-by key is partilly missing. + df = pd.DataFrame([(i%12, i%3 if i%3 else float("nan"), i) + for i in range(n_rows)], dtype=float, + columns=["A", "B", "Z"], index=None) + gr_ = df.groupby(["A", "B"]) + + # Generate teh expected dataframe + expected = pd.DataFrame([(i%12, i%3 if i%3 else float("nan"), + i + 12 if i%3 and i < n_rows - 12 \ + else float("nan")) + for i in range(n_rows)], dtype=float, + columns=["A", "B", "Z"], index=None) + result = gr_.shift(-1) + + # Check for data grabbed from beyond the acceptable array bounds + # in case there was no segfault. + tm.assert_frame_equal(result, expected[["Z"]]) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all() From d92cf3cfbf23c4befd3bdd1b75e2d6bc1ba87e1e Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Thu, 28 Jul 2016 08:51:30 +0300 Subject: [PATCH 4/6] minor flake8 style corrections --- pandas/tests/test_groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 8ac5d88822259..c3e882766a48a 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -6568,14 +6568,14 @@ def test_group_shift_with_null_key(self): # values in column `B`, and then group by [`A`, `B`]. This should # force `-1` in `labels` array of `gr_.grouper.group_info` exactly # at those places, where the group-by key is partilly missing. - df = pd.DataFrame([(i%12, i%3 if i%3 else float("nan"), i) + df = pd.DataFrame([(i % 12, i % 3 if i % 3 else float("nan"), i) for i in range(n_rows)], dtype=float, columns=["A", "B", "Z"], index=None) gr_ = df.groupby(["A", "B"]) # Generate teh expected dataframe - expected = pd.DataFrame([(i%12, i%3 if i%3 else float("nan"), - i + 12 if i%3 and i < n_rows - 12 \ + expected = pd.DataFrame([(i % 12, i % 3 if i % 3 else float("nan"), + i + 12 if i % 3 and i < n_rows - 12 else float("nan")) for i in range(n_rows)], dtype=float, columns=["A", "B", "Z"], index=None) From eab80388ef4031db84c5640fdb7486fe3c48d875 Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Thu, 28 Jul 2016 09:46:39 +0300 Subject: [PATCH 5/6] Added bugfix description [ci skip] --- doc/source/whatsnew/v0.19.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt index 03f8dbc20b52e..be8ba92496d49 100644 --- a/doc/source/whatsnew/v0.19.0.txt +++ b/doc/source/whatsnew/v0.19.0.txt @@ -735,6 +735,7 @@ Performance Improvements Bug Fixes ~~~~~~~~~ +- Bug in ``groupby().shift()``, which could cause a segfault or corruption in rare circumstances when grouping by columns with missing values (:issue:`13813`) - Bug in ``pd.read_csv()``, which may cause a segfault or corruption when iterating in large chunks over a stream/file under rare circumstances (:issue:`13703`) - Bug in ``io.json.json_normalize()``, where non-ascii keys raised an exception (:issue:`13213`) - Bug in ``SparseSeries`` with ``MultiIndex`` ``[]`` indexing may raise ``IndexError`` (:issue:`13144`) From bddf799812202357ca94009206654f44f50cd1a3 Mon Sep 17 00:00:00 2001 From: Ivan Nazarov Date: Fri, 29 Jul 2016 10:26:26 +0300 Subject: [PATCH 6/6] Switched from float('nan') to np.nan --- pandas/tests/test_groupby.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index c3e882766a48a..448d0c875b5c8 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -6568,15 +6568,15 @@ def test_group_shift_with_null_key(self): # values in column `B`, and then group by [`A`, `B`]. This should # force `-1` in `labels` array of `gr_.grouper.group_info` exactly # at those places, where the group-by key is partilly missing. - df = pd.DataFrame([(i % 12, i % 3 if i % 3 else float("nan"), i) + df = pd.DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i) for i in range(n_rows)], dtype=float, columns=["A", "B", "Z"], index=None) gr_ = df.groupby(["A", "B"]) # Generate teh expected dataframe - expected = pd.DataFrame([(i % 12, i % 3 if i % 3 else float("nan"), + expected = pd.DataFrame([(i % 12, i % 3 if i % 3 else np.nan, i + 12 if i % 3 and i < n_rows - 12 - else float("nan")) + else np.nan) for i in range(n_rows)], dtype=float, columns=["A", "B", "Z"], index=None) result = gr_.shift(-1)