Skip to content

BUG: Fix DataFrame.groupby().apply() for NaN groups with dropna=False #35951

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits into from
Sep 5, 2020
Merged
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.2.rst
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ Bug fixes
- Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`)
- Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`)
- Bug in :meth:`DatetimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should be ``""`` (:issue:`35712`)
- Bug in :meth:`DataFrame.groupby` with ``dropna=False`` where ``apply`` raised an error when the grouping column contained ``np.nan`` group(s) (:issue:`35889`)
-

.. ---------------------------------------------------------------------------
Expand Down
10 changes: 5 additions & 5 deletions pandas/core/reshape/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

from pandas.core.dtypes.concat import concat_compat
from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries
from pandas.core.dtypes.missing import isna

from pandas.core.arrays.categorical import (
factorize_from_iterable,
Expand Down Expand Up @@ -619,17 +620,16 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde
codes_list = []

# things are potentially different sizes, so compute the exact codes
# for each level and pass those to MultiIndex.from_arrays

# for each level and pass those to MultiIndex.from_arrays.
for hlevel, level in zip(zipped, levels):
to_concat = []
for key, index in zip(hlevel, indexes):
mask = level == key
mask = (isna(level) & isna(key)) | (level == key)
if not mask.any():
raise ValueError(f"Key {key} not in level {level}")
i = np.nonzero(level == key)[0][0]

i = np.nonzero(mask)[0][0]
to_concat.append(np.repeat(i, len(index)))

codes_list.append(np.concatenate(to_concat))

concat_index = _concat_indexes(indexes)
Expand Down
57 changes: 57 additions & 0 deletions pandas/tests/groupby/test_groupby_dropna.py
Original file line number Diff line number Diff line change
Expand Up @@ -276,3 +276,60 @@ def test_groupby_dropna_datetime_like_data(
expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt"))

tm.assert_frame_equal(grouped, expected)


@pytest.mark.parametrize(
    "dropna, df_cols_in, df_cols_out, levels",
    [
        # NaN present in "groups" and kept as its own group.
        pytest.param(
            False,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            ["a", "b", np.nan],
            id="dropna_false_has_nan",
        ),
        # NaN present in "groups" but dropped, so one fewer output row.
        pytest.param(
            True,
            {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0]},
            None,
            id="dropna_true_has_nan",
        ),
        # Without NaN in "groups", the ``dropna`` flag must not change anything.
        pytest.param(
            False,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_false_no_nan",
        ),
        pytest.param(
            True,
            {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]},
            {"values": [0, 1, 0, 0]},
            None,
            id="dropna_true_no_nan",
        ),
    ],
)
def test_groupby_dropna_multi_index_dataframe_apply(
    dropna, df_cols_in, df_cols_out, levels
):
    """
    Check ``groupby(..., dropna=dropna).apply`` when ``apply`` returns frames.

    Regression test for GH 35889: with ``dropna=False`` and NaN values in the
    grouped column, ``apply`` returning DataFrames of different sizes used to
    raise an error.

    Parameters
    ----------
    dropna : bool
        Forwarded to ``DataFrame.groupby``.
    df_cols_in : dict
        Column data for the input frame; must contain a "groups" column.
    df_cols_out : dict
        Column data of the expected result ("values" only).
    levels : list or None
        Explicit levels for the "groups" level of the expected index, or
        None to keep the levels inferred by ``MultiIndex.from_tuples``.
    """

    def _row_numbers(group):
        # One row per member of the group, numbered from 0, so that groups of
        # different sizes yield frames of different lengths.
        return pd.DataFrame({"values": list(range(len(group)))})

    frame = pd.DataFrame(df_cols_in)
    result = frame.groupby("groups", dropna=dropna).apply(_row_numbers)

    # ``zip`` stops at the shorter input, so when a group is dropped (fewer
    # expected rows) the trailing group label is left out of the index.
    index_tuples = tuple(zip(df_cols_in["groups"], df_cols_out["values"]))
    expected_index = pd.MultiIndex.from_tuples(index_tuples, names=["groups", None])

    # At the moment the ``MultiIndex.from_*`` constructors drop NA from the
    # levels by default, so NA has to be added to the level by hand.
    if levels and not dropna:
        expected_index = expected_index.set_levels(levels, level="groups")

    expected = pd.DataFrame(df_cols_out, index=expected_index)
    tm.assert_frame_equal(result, expected)