diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index f66098633b45e..5e6d5c8c7466d 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -285,6 +285,7 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`) +- Bug in :meth:`.GroupBy.indices` where the result would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`) - Reshaping diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 4e451fc33b055..0f796b2b65dff 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -888,12 +888,17 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys, k = len(keys) - if n == 0: + # Start at the first non-null entry + j = 0 + for j in range(0, n): + if labels[j] != -1: + break + else: return result + cur = labels[j] + start = j - start = 0 - cur = labels[0] - for i in range(1, n): + for i in range(j+1, n): lab = labels[i] if lab != cur: diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index 90396f1be0755..9417b626386fc 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -542,8 +542,7 @@ def get_indexer_dict( group_index = get_group_index(label_list, shape, sort=True, xnull=True) if np.all(group_index == -1): - # When all keys are nan and dropna=True, indices_fast can't handle this - # and the return is empty anyway + # Short-circuit, lib.indices_fast will return the same return {} ngroups = ( ((group_index.size and group_index.max()) + 1) diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py index 56cf400258f0f..e2ca63d9ab922 100644 --- a/pandas/tests/groupby/test_missing.py +++ b/pandas/tests/groupby/test_missing.py @@ -126,3 +126,12 @@ def test_min_count(func, min_count, value): result = getattr(df.groupby("a"), func)(min_count=min_count) expected = DataFrame({"b": 
[value], "c": [np.nan]}, index=Index([1], name="a")) tm.assert_frame_equal(result, expected) + + +def test_indicies_with_missing(): + # GH 9304 + df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]}) + g = df.groupby(["a", "b"]) + result = g.indices + expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])} + assert result == expected