Skip to content

Commit 160e3f3

Browse files
authored
BUG: additional keys in groupby indices when NAs are present (#38861)
1 parent 6fd5cc3 commit 160e3f3

File tree

4 files changed

+20
-6
lines changed

4 files changed

+20
-6
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -288,6 +288,7 @@ Groupby/resample/rolling
288288
^^^^^^^^^^^^^^^^^^^^^^^^
289289

290290
- Bug in :meth:`SeriesGroupBy.value_counts` where unobserved categories in a grouped categorical series were not tallied (:issue:`38672`)
291+
- Bug in :meth:`.GroupBy.indices` where the result would contain non-existent indices when null values were present in the groupby keys (:issue:`9304`)
291292
-
292293

293294
Reshaping

pandas/_libs/lib.pyx

+9-4
Original file line numberDiff line numberDiff line change
@@ -888,12 +888,17 @@ def indices_fast(ndarray index, const int64_t[:] labels, list keys,
888888

889889
k = len(keys)
890890

891-
if n == 0:
891+
# Start at the first non-null entry
892+
j = 0
893+
for j in range(0, n):
894+
if labels[j] != -1:
895+
break
896+
else:
892897
return result
898+
cur = labels[j]
899+
start = j
893900

894-
start = 0
895-
cur = labels[0]
896-
for i in range(1, n):
901+
for i in range(j+1, n):
897902
lab = labels[i]
898903

899904
if lab != cur:

pandas/core/sorting.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -542,8 +542,7 @@ def get_indexer_dict(
542542

543543
group_index = get_group_index(label_list, shape, sort=True, xnull=True)
544544
if np.all(group_index == -1):
545-
# When all keys are nan and dropna=True, indices_fast can't handle this
546-
# and the return is empty anyway
545+
# Short-circuit: every key is null, so lib.indices_fast would return an empty result as well
547546
return {}
548547
ngroups = (
549548
((group_index.size and group_index.max()) + 1)

pandas/tests/groupby/test_missing.py

+9
Original file line numberDiff line numberDiff line change
@@ -126,3 +126,12 @@ def test_min_count(func, min_count, value):
126126
result = getattr(df.groupby("a"), func)(min_count=min_count)
127127
expected = DataFrame({"b": [value], "c": [np.nan]}, index=Index([1], name="a"))
128128
tm.assert_frame_equal(result, expected)
129+
130+
131+
def test_indicies_with_missing():
132+
# GH 9304
133+
df = DataFrame({"a": [1, 1, np.nan], "b": [2, 3, 4], "c": [5, 6, 7]})
134+
g = df.groupby(["a", "b"])
135+
result = g.indices
136+
expected = {(1.0, 2): np.array([0]), (1.0, 3): np.array([1])}
137+
assert result == expected

0 commit comments

Comments
 (0)