From 8938a5f3e229edf13ab071c713f794a5a0063414 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 31 Dec 2020 13:01:21 -0500 Subject: [PATCH 1/5] BUG: additional keys in groupby indices when NAs are present --- doc/source/whatsnew/v1.3.0.rst | 2 +- pandas/_libs/lib.pyx | 13 ++++++++++--- pandas/tests/groupby/test_missing.py | 9 +++++++++ 3 files changed, 20 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b6d5493aefaa9..f06647dac3145 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -283,7 +283,7 @@ Plotting Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`) - Reshaping diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4e451fc33b055..dd53c6a26d366 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -891,9 +891,16 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, if n == 0: return result - start = 0 - cur = labels[0] - for i in range(1, n): + # Start at the first non-null entry + j = 0 + for j in range(0, n): + if labels[j] != -1: + break + assert j != n + cur = labels[j] + start = j + + for i in range(j+1, n): lab = labels[i] if lab != cur: diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py index 56cf400258f0f..374cfefd8ad26 100644 --- a/pandas/tests/groupby/test_missing.py +++ b/pandas/tests/groupby/test_missing.py @@ -126,3 +126,12 @@ def test_min_count(func, min_count, value): result = getattr(df.groupby("a"), func)(min_count=min_count) expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a")) tm.assert_frame_equal(result, expected) + + +def test_indicies_with_missing(): + # GH 9304 + df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]}) + g = df.groupby(["a", "b"]) + result = g.indices + expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])} + assert result == expected From b51f9e7c2f9c8116387d241553e24fe781a63ef1 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 31 Dec 2020 13:11:23 -0500 Subject: [PATCH 2/5] Fix assert --- pandas/_libs/lib.pyx | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index dd53c6a26d366..de1997249b631 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -896,7 +896,8 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, for j in range(0, n): if labels[j] != -1: break - assert j != n + else: + raise ValueError("cannot handle all null values") cur = labels[j] start = j From 493b7d9b4389fdad63ec02b834d19dc7ec78364a Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 31 Dec 2020 13:18:34 -0500 Subject: [PATCH 3/5] Fix namespace in test --- pandas/tests/groupby/test_missing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py index 374cfefd8ad26..e2ca63d9ab922 100644 --- a/pandas/tests/groupby/test_missing.py +++ b/pandas/tests/groupby/test_missing.py @@ -130,7 +130,7 @@ def test_min_count(func, min_count, value): def test_indicies_with_missing(): # GH 9304 - df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]}) + df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]}) g = df.groupby(["a", "b"]) result = g.indices expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])} From 39396e37ad4366b098bd8aee93c3617f6548cea0 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Thu, 31 Dec 2020 16:21:15 -0500 Subject: [PATCH 4/5] Don't raise in lib.indices_fast --- pandas/_libs/lib.pyx | 2 +- pandas/core/sorting.py | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index de1997249b631..24567169d1f3e 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -897,7 +897,7 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, if labels[j] != -1: break else: - raise ValueError("cannot handle all null values") + return result cur = labels[j] start = j diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 90396f1be0755..9417b626386fc 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -542,8 +542,7 @@ def get_indexer_dict( group_index = get_group_index(label_list, shape, sort=True, xnull=True) if np.all(group_index == -1): - # When all keys are nan and dropna=True, indices_fast can't handle this - # and the return is empty anyway + # Short-circuit, lib.indices_fast will return the same return {} ngroups = ( ((group_index.size and group_index.max()) + 1) From b769afc16867118bc3ccd98fbdbe1f26a6cf1719 Mon Sep 17 00:00:00 2001 From: Richard Shadrach Date: Fri, 1 Jan 2021 08:43:34 -0500 Subject: [PATCH 5/5] Remove n == 0 check --- pandas/_libs/lib.pyx | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 24567169d1f3e..0f796b2b65dff 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -888,9 +888,6 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, k = len(keys) - if n == 0: - return result - # Start at the first non-null entry j = 0 for j in range(0, n):