Skip to content

Commit d0de0c6

Browse files
authored
BUG: Fix DataFrame.groupby().apply() for NaN groups with dropna=False (#35951)
1 parent b73489f commit d0de0c6

File tree

3 files changed

+59
-3
lines changed

3 files changed

+59
-3
lines changed

doc/source/whatsnew/v1.2.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -214,7 +214,8 @@ Performance improvements
214214

215215
Bug fixes
216216
~~~~~~~~~
217-
217+
- Bug in :meth:`DataFrameGroupBy.apply` raising an error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`)
218+
-
218219

219220
Categorical
220221
^^^^^^^^^^^

pandas/core/reshape/concat.py

+4-2
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
from pandas.core.dtypes.concat import concat_compat
1313
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
14+
from pandas.core.dtypes.missing import isna
1415

1516
from pandas.core.arrays.categorical import (
1617
factorize_from_iterable,
@@ -624,10 +625,11 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde
624625
for hlevel, level in zip(zipped, levels):
625626
to_concat = []
626627
for key, index in zip(hlevel, indexes):
627-
mask = level == key
628+
# Find matching codes, treating a NaN key and a NaN level entry as equal.
629+
mask = (isna(level) & isna(key)) | (level == key)
628630
if not mask.any():
629631
raise ValueError(f"Key {key} not in level {level}")
630-
i = np.nonzero(level == key)[0][0]
632+
i = np.nonzero(mask)[0][0]
631633

632634
to_concat.append(np.repeat(i, len(index)))
633635
codes_list.append(np.concatenate(to_concat))

pandas/tests/groupby/test_groupby_dropna.py

+53
Original file line numberDiff line numberDiff line change
@@ -274,3 +274,56 @@ def test_groupby_dropna_datetime_like_data(
274274
expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))
275275

276276
tm.assert_frame_equal(grouped, expected)
277+
278+
279+
@pytest.mark.parametrize(
    "dropna, data, selected_data, levels",
    [
        # NaN group present and kept: the NaN key must appear in the levels.
        pytest.param(
            False,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            ["a", "b", np.nan],
            id="dropna_false_has_nan",
        ),
        # NaN group present but dropped: only three rows are expected.
        pytest.param(
            True,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0]},
            None,
            id="dropna_true_has_nan",
        ),
        # No NaN in "groups": the outcome must not depend on dropna.
        pytest.param(
            False,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_false_no_nan",
        ),
        # No NaN in "groups": the outcome must not depend on dropna.
        pytest.param(
            True,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_true_no_nan",
        ),
    ],
)
def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels):
    """Check ``groupby(..., dropna=...).apply`` when the applied function returns a frame.

    Each group is mapped to a new DataFrame holding ``range(len(group))``, so
    the result is indexed by a two-level MultiIndex of (group key, position
    within the group). ``selected_data`` holds the expected positions and
    ``levels``, when given, the expected values of the "groups" level.
    """
    # GH 35889

    def _positions_within(group):
        # The group's content is ignored; only its length matters.
        return pd.DataFrame({"values": range(len(group))})

    frame = pd.DataFrame(data)
    result = frame.groupby("groups", dropna=dropna).apply(_positions_within)

    # zip() stops at the shorter input, which trims the trailing NaN key in
    # the case where that group is expected to be dropped.
    index_pairs = tuple(zip(data["groups"], selected_data["values"]))
    expected_index = pd.MultiIndex.from_tuples(index_pairs, names=["groups", None])

    # The `from_*` constructors currently leave NA out of the levels of the
    # MultiIndex they build, so the NA level is added by hand afterwards.
    if levels and not dropna:
        expected_index = expected_index.set_levels(levels, level="groups")

    expected = pd.DataFrame(selected_data, index=expected_index)
    tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)