Skip to content

Commit 607229a

Browse files
rhshadrach authored and yehoshuadimarsky committed
BUG: groupby does not respect dropna=False when input has MultiIndex (pandas-dev#47186)
1 parent 2958ff7 commit 607229a

File tree

6 files changed

+54
-29
lines changed

6 files changed

+54
-29
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -873,6 +873,7 @@ Groupby/resample/rolling
873873
- Bug in :meth:`DataFrame.rolling` gives ValueError when center=True, axis=1 and win_type is specified (:issue:`46135`)
874874
- Bug in :meth:`.DataFrameGroupBy.describe` and :meth:`.SeriesGroupBy.describe` produces inconsistent results for empty datasets (:issue:`41575`)
875875
- Bug in :meth:`DataFrame.resample` reduction methods when used with ``on`` would attempt to aggregate the provided column (:issue:`47079`)
876+
- Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would not respect ``dropna=False`` when the input DataFrame/Series had NaN values in a :class:`MultiIndex` (:issue:`46783`)
876877

877878
Reshaping
878879
^^^^^^^^^

pandas/core/groupby/grouper.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -501,7 +501,7 @@ def __init__(
501501
self.grouping_vector, # Index
502502
self._codes,
503503
self._group_index,
504-
) = index._get_grouper_for_level(mapper, level=ilevel)
504+
) = index._get_grouper_for_level(mapper, level=ilevel, dropna=dropna)
505505

506506
# a passed Grouper like, directly get the grouper in the same way
507507
# as single grouper groupby, use the group_info to get codes

pandas/core/indexes/base.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -2227,7 +2227,11 @@ def _drop_level_numbers(self, levnums: list[int]):
22272227
)
22282228

22292229
def _get_grouper_for_level(
2230-
self, mapper, *, level=None
2230+
self,
2231+
mapper,
2232+
*,
2233+
level=None,
2234+
dropna: bool = True,
22312235
) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]:
22322236
"""
22332237
Get index grouper corresponding to an index level
@@ -2238,6 +2242,8 @@ def _get_grouper_for_level(
22382242
Function mapping index values to groups
22392243
level : int or None
22402244
Index level, positional
2245+
dropna : bool
2246+
dropna from groupby
22412247
22422248
Returns
22432249
-------

pandas/core/indexes/multi.py

+14-26
Original file line numberDiff line numberDiff line change
@@ -1503,42 +1503,30 @@ def _set_names(self, names, *, level=None, validate: bool = True):
15031503

15041504
@doc(Index._get_grouper_for_level)
15051505
def _get_grouper_for_level(
1506-
self, mapper, *, level=None
1506+
self,
1507+
mapper,
1508+
*,
1509+
level=None,
1510+
dropna: bool = True,
15071511
) -> tuple[Index, npt.NDArray[np.signedinteger] | None, Index | None]:
1508-
indexer = self.codes[level]
1509-
level_index = self.levels[level]
1510-
15111512
if mapper is not None:
1513+
indexer = self.codes[level]
15121514
# Handle group mapping function and return
15131515
level_values = self.levels[level].take(indexer)
15141516
grouper = level_values.map(mapper)
15151517
return grouper, None, None
15161518

1517-
codes, uniques = algos.factorize(indexer, sort=True)
1518-
1519-
if len(uniques) > 0 and uniques[0] == -1:
1520-
# Handle NAs
1521-
mask = indexer != -1
1522-
ok_codes, uniques = algos.factorize(indexer[mask], sort=True)
1523-
1524-
codes = np.empty(len(indexer), dtype=indexer.dtype)
1525-
codes[mask] = ok_codes
1526-
codes[~mask] = -1
1527-
1528-
if len(uniques) < len(level_index):
1529-
# Remove unobserved levels from level_index
1530-
level_index = level_index.take(uniques)
1531-
else:
1532-
# break references back to us so that setting the name
1533-
# on the output of a groupby doesn't reflect back here.
1534-
level_index = level_index.copy()
1519+
values = self.get_level_values(level)
1520+
na_sentinel = -1 if dropna else None
1521+
codes, uniques = algos.factorize(values, sort=True, na_sentinel=na_sentinel)
1522+
assert isinstance(uniques, Index)
15351523

1536-
if level_index._can_hold_na:
1537-
grouper = level_index.take(codes, fill_value=True)
1524+
if self.levels[level]._can_hold_na:
1525+
grouper = uniques.take(codes, fill_value=True)
15381526
else:
1539-
grouper = level_index.take(codes)
1527+
grouper = uniques.take(codes)
15401528

1541-
return grouper, codes, level_index
1529+
return grouper, codes, uniques
15421530

15431531
@cache_readonly
15441532
def inferred_type(self) -> str:

pandas/tests/groupby/test_groupby_dropna.py

+30
Original file line numberDiff line numberDiff line change
@@ -332,6 +332,36 @@ def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data,
332332
tm.assert_frame_equal(result, expected)
333333

334334

335+
@pytest.mark.parametrize("input_index", [None, ["a"], ["a", "b"]])
336+
@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
337+
@pytest.mark.parametrize("series", [True, False])
338+
def test_groupby_dropna_with_multiindex_input(input_index, keys, series):
339+
# GH#46783
340+
obj = pd.DataFrame(
341+
{
342+
"a": [1, np.nan],
343+
"b": [1, 1],
344+
"c": [2, 3],
345+
}
346+
)
347+
348+
expected = obj.set_index(keys)
349+
if series:
350+
expected = expected["c"]
351+
elif input_index == ["a", "b"] and keys == ["a"]:
352+
# Column b should not be aggregated
353+
expected = expected[["c"]]
354+
355+
if input_index is not None:
356+
obj = obj.set_index(input_index)
357+
gb = obj.groupby(keys, dropna=False)
358+
if series:
359+
gb = gb["c"]
360+
result = gb.sum()
361+
362+
tm.assert_equal(result, expected)
363+
364+
335365
def test_groupby_nan_included():
336366
# GH 35646
337367
data = {"group": ["g1", np.nan, "g1", "g2", np.nan], "B": [0, 1, 2, 3, 4]}

pandas/tests/groupby/test_grouping.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -471,7 +471,7 @@ def test_multiindex_negative_level(self, mframe):
471471
tm.assert_frame_equal(result, expected)
472472

473473
result = mframe.groupby(level=[-2, -1]).sum()
474-
expected = mframe
474+
expected = mframe.sort_index()
475475
tm.assert_frame_equal(result, expected)
476476

477477
result = mframe.groupby(level=[-1, "first"]).sum()

0 commit comments

Comments
 (0)