From 8938a5f3e229edf13ab071c713f794a5a0063414 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Thu, 31 Dec 2020 13:01:21 -0500
Subject: [PATCH 1/5] BUG: additional keys in groupby indices when NAs are
 present

---
 doc/source/whatsnew/v1.3.0.rst       |  2 +-
 pandas/_libs/lib.pyx                 | 13 ++++++++++---
 pandas/tests/groupby/test_missing.py |  9 +++++++++
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst
index b6d5493aefaa9..f06647dac3145 100644
--- a/doc/source/whatsnew/v1.3.0.rst
+++ b/doc/source/whatsnew/v1.3.0.rst
@@ -283,7 +283,7 @@ Plotting
 Groupby/resample/rolling
 ^^^^^^^^^^^^^^^^^^^^^^^^
 
--
+- Bug in :meth:`.GroupBy.indices` would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`)
 -
 
 Reshaping
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 4e451fc33b055..dd53c6a26d366 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -891,9 +891,16 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys,
     if n == 0:
         return result
 
-    start = 0
-    cur = labels[0]
-    for i in range(1, n):
+    # Start at the first non-null entry
+    j = 0
+    for j in range(0, n):
+        if labels[j] != -1:
+            break
+    assert j != n
+    cur = labels[j]
+    start = j
+
+    for i in range(j+1, n):
         lab = labels[i]
 
         if lab != cur:
diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py
index 56cf400258f0f..374cfefd8ad26 100644
--- a/pandas/tests/groupby/test_missing.py
+++ b/pandas/tests/groupby/test_missing.py
@@ -126,3 +126,12 @@ def test_min_count(func, min_count, value):
     result = getattr(df.groupby("a"), func)(min_count=min_count)
     expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
     tm.assert_frame_equal(result, expected)
+
+
+def test_indicies_with_missing():
+    # GH 9304
+    df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
+    g = df.groupby(["a", "b"])
+    result = g.indices
+    expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
+    assert result == expected

From b51f9e7c2f9c8116387d241553e24fe781a63ef1 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Thu, 31 Dec 2020 13:11:23 -0500
Subject: [PATCH 2/5] Fix assert

---
 pandas/_libs/lib.pyx | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index dd53c6a26d366..de1997249b631 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -896,7 +896,8 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys,
     for j in range(0, n):
         if labels[j] != -1:
             break
-    assert j != n
+    else:
+        raise ValueError("cannot handle all null values")
     cur = labels[j]
     start = j
 

From 493b7d9b4389fdad63ec02b834d19dc7ec78364a Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Thu, 31 Dec 2020 13:18:34 -0500
Subject: [PATCH 3/5] Fix namespace in test

---
 pandas/tests/groupby/test_missing.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/test_missing.py b/pandas/tests/groupby/test_missing.py
index 374cfefd8ad26..e2ca63d9ab922 100644
--- a/pandas/tests/groupby/test_missing.py
+++ b/pandas/tests/groupby/test_missing.py
@@ -130,7 +130,7 @@ def test_min_count(func, min_count, value):
 
 def test_indicies_with_missing():
     # GH 9304
-    df = pd.DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
+    df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
     g = df.groupby(["a", "b"])
     result = g.indices
     expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}

From 39396e37ad4366b098bd8aee93c3617f6548cea0 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Thu, 31 Dec 2020 16:21:15 -0500
Subject: [PATCH 4/5] Don't raise in lib.indices_fast

---
 pandas/_libs/lib.pyx   | 2 +-
 pandas/core/sorting.py | 3 +--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index de1997249b631..24567169d1f3e 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -897,7 +897,7 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys,
         if labels[j] != -1:
             break
     else:
-        raise ValueError("cannot handle all null values")
+        return result
     cur = labels[j]
     start = j
 
diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py
index 90396f1be0755..9417b626386fc 100644
--- a/pandas/core/sorting.py
+++ b/pandas/core/sorting.py
@@ -542,8 +542,7 @@ def get_indexer_dict(
 
     group_index = get_group_index(label_list, shape, sort=True, xnull=True)
     if np.all(group_index == -1):
-        # When all keys are nan and dropna=True, indices_fast can't handle this
-        # and the return is empty anyway
+        # Short-circuit, lib.indices_fast will return the same
         return {}
     ngroups = (
         ((group_index.size and group_index.max()) + 1)

From b769afc16867118bc3ccd98fbdbe1f26a6cf1719 Mon Sep 17 00:00:00 2001
From: Richard Shadrach <rhshadrach@gmail.com>
Date: Fri, 1 Jan 2021 08:43:34 -0500
Subject: [PATCH 5/5] Remove n == 0 check

---
 pandas/_libs/lib.pyx | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 24567169d1f3e..0f796b2b65dff 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -888,9 +888,6 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys,
 
     k = len(keys)
 
-    if n == 0:
-        return result
-
     # Start at the first non-null entry
     j = 0
     for j in range(0, n):