From ac7c99afe25185b7fc3a269446282171de51e754 Mon Sep 17 00:00:00 2001 From: David Kwong Date: Fri, 28 Aug 2020 14:57:00 +1000 Subject: [PATCH 01/11] BUG: add unit test, should fail (#35889) --- pandas/tests/groupby/test_groupby_dropna.py | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index adf62c4723526..760daa0700f2b 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -276,3 +276,31 @@ def test_groupby_dropna_datetime_like_data( expected = pd.DataFrame({"values": values}, index=pd.Index(indexes, name="dt")) tm.assert_frame_equal(grouped, expected) + + +@pytest.mark.parametrize( + "dropna, inputs, outputs", + [ + ( + False, + {'groups': ['a', 'a', 'b', np.nan], 'values': [10, 10, 20, 30]}, + {'groups': ['a', 'b', np.nan], 'values': [0, 1, 0, 0]} + ), + ], +) +def test_groupby_dropna_multi_index_dataframe_nan_apply( + dropna, inputs, outputs +): + # GH 35889 + # `groupby` with `dropna=False` and `apply` returning DataFrame of different + # sizes raises error if grouped column has nan values. + + df = pd.DataFrame(inputs) + dfg = df.groupby('groups', dropna=dropna) + rv = dfg.apply(lambda grp: pd.DataFrame({'values': list(range(len(grp)))})) + + tuples = tuple(zip(inputs['groups'], outputs['values'])) + mi = pd.MultiIndex.from_tuples(tuples, names=['groups', None]) + + expected = pd.DataFrame(outputs, index=mi) + tm.assert_frame_equal(rv, expected) From 6bae42af2932b3f6fef893651028323781ceaf8f Mon Sep 17 00:00:00 2001 From: David Kwong Date: Fri, 28 Aug 2020 16:08:37 +1000 Subject: [PATCH 02/11] expand tests: group with no np.nan, fix expected output (#35889) * tests should still fail. * test dropna=True|False with no np.nan in groupings. * fix expected outputs, declare expected MultiIndex in resulting dataframe after df.group().apply() --- pandas/tests/groupby/test_groupby_dropna.py | 33 ++++++++++++++++++--- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 760daa0700f2b..ab82d51061654 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -281,14 +281,35 @@ def test_groupby_dropna_datetime_like_data( @pytest.mark.parametrize( "dropna, inputs, outputs", [ - ( + pytest.param( False, {'groups': ['a', 'a', 'b', np.nan], 'values': [10, 10, 20, 30]}, - {'groups': ['a', 'b', np.nan], 'values': [0, 1, 0, 0]} + {'values': [0, 1, 0, 0]}, + id='dropna_false_has_nan' + ), + pytest.param( + True, + {'groups': ['a', 'a', 'b', np.nan], 'values': [10, 10, 20, 30]}, + {'values': [0, 1, 0]}, + id='dropna_true_has_nan' + ), + pytest.param( + # no nan in 'groups'; dropna=True|False should be same. + False, + {'groups': ['a', 'a', 'b', 'c'], 'values': [10, 10, 20, 30]}, + {'values': [0, 1, 0, 0]}, + id='dropna_false_no_nan' + ), + pytest.param( + # no nan in 'groups'; dropna=True|False should be same. + True, + {'groups': ['a', 'a', 'b', 'c'], 'values': [10, 10, 20, 30]}, + {'values': [0, 1, 0, 0]}, + id='dropna_true_no_nan' ), ], ) -def test_groupby_dropna_multi_index_dataframe_nan_apply( +def test_groupby_dropna_multi_index_dataframe_apply( dropna, inputs, outputs ): # GH 35889 @@ -299,7 +320,11 @@ def test_groupby_dropna_multi_index_dataframe_nan_apply( dfg = df.groupby('groups', dropna=dropna) rv = dfg.apply(lambda grp: pd.DataFrame({'values': list(range(len(grp)))})) - tuples = tuple(zip(inputs['groups'], outputs['values'])) + if dropna: + groups = [g for g in inputs['groups'] if g is not None] + else: + groups = inputs['groups'] + tuples = tuple(zip(groups, outputs['values'])) mi = pd.MultiIndex.from_tuples(tuples, names=['groups', None]) expected = pd.DataFrame(outputs, index=mi) From 26453dc5a97ce9288e61faad69e7a3de0ee118b7 Mon Sep 17 00:00:00 2001 From: David Kwong Date: Fri, 28 Aug 2020 16:18:09 +1000 Subject: [PATCH 03/11] double quotes instead of single quote (#35889) --- pandas/tests/groupby/test_groupby_dropna.py | 40 ++++++++++----------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index ab82d51061654..76b281ac98467 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -283,29 +283,29 @@ def test_groupby_dropna_datetime_like_data( [ pytest.param( False, - {'groups': ['a', 'a', 'b', np.nan], 'values': [10, 10, 20, 30]}, - {'values': [0, 1, 0, 0]}, - id='dropna_false_has_nan' + {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, + {"values": [0, 1, 0, 0]}, + id="dropna_false_has_nan" ), pytest.param( True, - {'groups': ['a', 'a', 'b', np.nan], 'values': [10, 10, 20, 30]}, - {'values': [0, 1, 0]}, - id='dropna_true_has_nan' + {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, + {"values": [0, 1, 0]}, + id="dropna_true_has_nan" ), pytest.param( - # no nan in 'groups'; dropna=True|False should be same. + # no nan in "groups"; dropna=True|False should be same. False, - {'groups': ['a', 'a', 'b', 'c'], 'values': [10, 10, 20, 30]}, - {'values': [0, 1, 0, 0]}, - id='dropna_false_no_nan' + {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, + {"values": [0, 1, 0, 0]}, + id="dropna_false_no_nan" ), pytest.param( - # no nan in 'groups'; dropna=True|False should be same. + # no nan in "groups"; dropna=True|False should be same. True, - {'groups': ['a', 'a', 'b', 'c'], 'values': [10, 10, 20, 30]}, - {'values': [0, 1, 0, 0]}, - id='dropna_true_no_nan' + {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, + {"values": [0, 1, 0, 0]}, + id="dropna_true_no_nan" ), ], ) @@ -317,15 +317,15 @@ def test_groupby_dropna_multi_index_dataframe_apply( # sizes raises error if grouped column has nan values. df = pd.DataFrame(inputs) - dfg = df.groupby('groups', dropna=dropna) - rv = dfg.apply(lambda grp: pd.DataFrame({'values': list(range(len(grp)))})) + dfg = df.groupby("groups", dropna=dropna) + rv = dfg.apply(lambda grp: pd.DataFrame({"values": list(range(len(grp)))})) if dropna: - groups = [g for g in inputs['groups'] if g is not None] + groups = [g for g in inputs["groups"] if g is not None] else: - groups = inputs['groups'] - tuples = tuple(zip(groups, outputs['values'])) - mi = pd.MultiIndex.from_tuples(tuples, names=['groups', None]) + groups = inputs["groups"] + tuples = tuple(zip(groups, outputs["values"])) + mi = pd.MultiIndex.from_tuples(tuples, names=["groups", None]) expected = pd.DataFrame(outputs, index=mi) tm.assert_frame_equal(rv, expected) From 87dbfd921699176d18afde2b39cfb34e0184293c Mon Sep 17 00:00:00 2001 From: David Kwong Date: Fri, 28 Aug 2020 16:31:18 +1000 Subject: [PATCH 04/11] adjust comparison: handle np.nan compare (#35889) * nans at same positions in `level` and `key` compares as equal. --- pandas/core/reshape/concat.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 9e8fb643791f2..baec62b6f030f 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -9,6 +9,7 @@ from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Label +from pandas.core.dtypes.missing import isna from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries @@ -619,17 +620,16 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde codes_list = [] # things are potentially different sizes, so compute the exact codes - # for each level and pass those to MultiIndex.from_arrays - + # for each level and pass those to MultiIndex.from_arrays. for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): - mask = level == key + mask = ((isna(level) & isna(key)) | (level == key)) if not mask.any(): raise ValueError(f"Key {key} not in level {level}") - i = np.nonzero(level == key)[0][0] - + i = np.nonzero(mask)[0][0] to_concat.append(np.repeat(i, len(index))) + codes_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes) From 2a1e804511b26b601bf34e91d5995646e2f7fd93 Mon Sep 17 00:00:00 2001 From: David Kwong Date: Fri, 28 Aug 2020 18:10:27 +1000 Subject: [PATCH 05/11] refactor test: handle MultiIndex dropping nan (#35889) * this makes test pass. * follow existing style where we create MultiIndex, then `set_levels` to reinsert nan for case when `dropna=False`, and groups has nan grouping. --- pandas/tests/groupby/test_groupby_dropna.py | 22 ++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 76b281ac98467..1ee9ea73ad360 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -279,18 +279,20 @@ def test_groupby_dropna_datetime_like_data( @pytest.mark.parametrize( - "dropna, inputs, outputs", + "dropna, df_cols_in, df_cols_out, levels", [ pytest.param( False, {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0, 0]}, + ["a", "b", np.nan], id="dropna_false_has_nan" ), pytest.param( True, {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0]}, + None, id="dropna_true_has_nan" ), pytest.param( @@ -298,6 +300,7 @@ def test_groupby_dropna_datetime_like_data( False, {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0, 0]}, + None, id="dropna_false_no_nan" ), pytest.param( @@ -305,27 +308,28 @@ def test_groupby_dropna_datetime_like_data( True, {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0, 0]}, + None, id="dropna_true_no_nan" ), ], ) def test_groupby_dropna_multi_index_dataframe_apply( - dropna, inputs, outputs + dropna, df_cols_in, df_cols_out, levels ): # GH 35889 # `groupby` with `dropna=False` and `apply` returning DataFrame of different # sizes raises error if grouped column has nan values. - df = pd.DataFrame(inputs) + df = pd.DataFrame(df_cols_in) dfg = df.groupby("groups", dropna=dropna) rv = dfg.apply(lambda grp: pd.DataFrame({"values": list(range(len(grp)))})) - if dropna: - groups = [g for g in inputs["groups"] if g is not None] - else: - groups = inputs["groups"] - tuples = tuple(zip(groups, outputs["values"])) + tuples = tuple(zip(df_cols_in["groups"], df_cols_out["values"])) mi = pd.MultiIndex.from_tuples(tuples, names=["groups", None]) + # Since right now, by default MI will drop NA from levels when we create MI + # via `from_*`, so we need to add NA for level manually afterwards. + if not dropna and levels: + mi = mi.set_levels(levels, level="groups") - expected = pd.DataFrame(outputs, index=mi) + expected = pd.DataFrame(df_cols_out, index=mi) tm.assert_frame_equal(rv, expected) From 3c754d33b2f0c311f38bd5d3920e9b8354a3bc3a Mon Sep 17 00:00:00 2001 From: David Kwong Date: Fri, 28 Aug 2020 18:42:37 +1000 Subject: [PATCH 06/11] BUG: update rst (#35889) --- doc/source/whatsnew/v1.1.2.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 9747a8ef3e71f..6d3ab55381c03 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -29,6 +29,7 @@ Bug fixes - Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) +- Bug in :meth:`DataFrame.apply` on :meth:`DataFrame.groupby`, ``dropna=False`` and ``np.nan`` group(s) (:issue:`35889`) - .. --------------------------------------------------------------------------- From 2cc4f39687e4456fe719ded52f4ce09916e9c5d9 Mon Sep 17 00:00:00 2001 From: David Kwong Date: Fri, 28 Aug 2020 18:57:31 +1000 Subject: [PATCH 07/11] BUG: run code formatters (#35889) * black pandas * git diff upstream/master -u -- "*.py" | flake8 --diff --- pandas/core/reshape/concat.py | 2 +- pandas/tests/groupby/test_groupby_dropna.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index baec62b6f030f..ceddd5ca186ac 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -624,7 +624,7 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): - mask = ((isna(level) & isna(key)) | (level == key)) + mask = (isna(level) & isna(key)) | (level == key) if not mask.any(): raise ValueError(f"Key {key} not in level {level}") i = np.nonzero(mask)[0][0] diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 1ee9ea73ad360..5c17a1955086d 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -286,14 +286,14 @@ def test_groupby_dropna_datetime_like_data( {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0, 0]}, ["a", "b", np.nan], - id="dropna_false_has_nan" + id="dropna_false_has_nan", ), pytest.param( True, {"groups": ["a", "a", "b", np.nan], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0]}, None, - id="dropna_true_has_nan" + id="dropna_true_has_nan", ), pytest.param( # no nan in "groups"; dropna=True|False should be same. @@ -301,7 +301,7 @@ def test_groupby_dropna_datetime_like_data( {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0, 0]}, None, - id="dropna_false_no_nan" + id="dropna_false_no_nan", ), pytest.param( # no nan in "groups"; dropna=True|False should be same. @@ -309,7 +309,7 @@ def test_groupby_dropna_datetime_like_data( {"groups": ["a", "a", "b", "c"], "values": [10, 10, 20, 30]}, {"values": [0, 1, 0, 0]}, None, - id="dropna_true_no_nan" + id="dropna_true_no_nan", ), ], ) From 8a2eab89a6d6f8835613cd4cfb7b084a9fba4b98 Mon Sep 17 00:00:00 2001 From: David Kwong Date: Fri, 28 Aug 2020 20:53:33 +1000 Subject: [PATCH 08/11] fix isort error during CI (#35889) --- pandas/core/reshape/concat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index ceddd5ca186ac..cf20626fd3ac1 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -9,9 +9,9 @@ from pandas._typing import FrameOrSeries, FrameOrSeriesUnion, Label -from pandas.core.dtypes.missing import isna from pandas.core.dtypes.concat import concat_compat from pandas.core.dtypes.generic import ABCDataFrame, ABCSeries +from pandas.core.dtypes.missing import isna from pandas.core.arrays.categorical import ( factorize_from_iterable, From aadbd750125a43651a7cee852099d84113cc5e1f Mon Sep 17 00:00:00 2001 From: David Kwong Date: Sat, 29 Aug 2020 22:42:54 +1000 Subject: [PATCH 09/11] BUG: address PR review comments (#35889) --- doc/source/whatsnew/v1.1.2.rst | 2 +- pandas/tests/groupby/test_groupby_dropna.py | 22 ++++++++++----------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 6d3ab55381c03..3990cd07d7166 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -29,7 +29,7 @@ Bug fixes - Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) -- Bug in :meth:`DataFrame.apply` on :meth:`DataFrame.groupby`, ``dropna=False`` and ``np.nan`` group(s) (:issue:`35889`) +- Bug in :meth:`DataFrame.groupby(...).apply(...)` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) - .. --------------------------------------------------------------------------- diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 5c17a1955086d..6ea41a45d4e7f 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -279,7 +279,7 @@ def test_groupby_dropna_datetime_like_data( @pytest.mark.parametrize( - "dropna, df_cols_in, df_cols_out, levels", + "dropna, data, selected_data, levels", [ pytest.param( False, @@ -313,23 +313,21 @@ def test_groupby_dropna_datetime_like_data( ), ], ) -def test_groupby_dropna_multi_index_dataframe_apply( - dropna, df_cols_in, df_cols_out, levels +def test_groupby_apply_with_dropna_for_multi_index( + dropna, data, selected_data, levels ): # GH 35889 - # `groupby` with `dropna=False` and `apply` returning DataFrame of different - # sizes raises error if grouped column has nan values. - df = pd.DataFrame(df_cols_in) - dfg = df.groupby("groups", dropna=dropna) - rv = dfg.apply(lambda grp: pd.DataFrame({"values": list(range(len(grp)))})) + df = pd.DataFrame(data) + gb = df.groupby("groups", dropna=dropna) + result = gb.apply(lambda grp: pd.DataFrame({"values": range(len(grp))})) - tuples = tuple(zip(df_cols_in["groups"], df_cols_out["values"])) - mi = pd.MultiIndex.from_tuples(tuples, names=["groups", None]) + mi_tuples = tuple(zip(data["groups"], selected_data["values"])) + mi = pd.MultiIndex.from_tuples(mi_tuples, names=["groups", None]) # Since right now, by default MI will drop NA from levels when we create MI # via `from_*`, so we need to add NA for level manually afterwards. if not dropna and levels: mi = mi.set_levels(levels, level="groups") - expected = pd.DataFrame(df_cols_out, index=mi) - tm.assert_frame_equal(rv, expected) + expected = pd.DataFrame(selected_data, index=mi) + tm.assert_frame_equal(result, expected) From 311059c36ae19c9c438ff712ce5063149aee31e1 Mon Sep 17 00:00:00 2001 From: David Kwong Date: Sat, 29 Aug 2020 23:21:03 +1000 Subject: [PATCH 10/11] forgot to run black, flake8 (#35889) --- pandas/tests/groupby/test_groupby_dropna.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/groupby/test_groupby_dropna.py b/pandas/tests/groupby/test_groupby_dropna.py index 6ea41a45d4e7f..2ef5dbcf41758 100644 --- a/pandas/tests/groupby/test_groupby_dropna.py +++ b/pandas/tests/groupby/test_groupby_dropna.py @@ -313,9 +313,7 @@ def test_groupby_dropna_datetime_like_data( ), ], ) -def test_groupby_apply_with_dropna_for_multi_index( - dropna, data, selected_data, levels -): +def test_groupby_apply_with_dropna_for_multi_index(dropna, data, selected_data, levels): # GH 35889 df = pd.DataFrame(data) From 714c81ab0e08da5f2b96783b7850b7fcee0a1008 Mon Sep 17 00:00:00 2001 From: David Kwong Date: Wed, 2 Sep 2020 11:09:34 +1000 Subject: [PATCH 11/11] BUG: address review comments (#35889) --- doc/source/whatsnew/v1.1.2.rst | 1 - doc/source/whatsnew/v1.2.0.rst | 3 ++- pandas/core/reshape/concat.py | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/doc/source/whatsnew/v1.1.2.rst b/doc/source/whatsnew/v1.1.2.rst index 3990cd07d7166..9747a8ef3e71f 100644 --- a/doc/source/whatsnew/v1.1.2.rst +++ b/doc/source/whatsnew/v1.1.2.rst @@ -29,7 +29,6 @@ Bug fixes - Bug in :class:`Series` constructor raising a ``TypeError`` when constructing sparse datetime64 dtypes (:issue:`35762`) - Bug in :meth:`DataFrame.apply` with ``result_type="reduce"`` returning with incorrect index (:issue:`35683`) - Bug in :meth:`DateTimeIndex.format` and :meth:`PeriodIndex.format` with ``name=True`` setting the first item to ``"None"`` where it should bw ``""`` (:issue:`35712`) -- Bug in :meth:`DataFrame.groupby(...).apply(...)` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) - .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 55570341cf4e8..0e376511ddd08 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -163,7 +163,8 @@ Performance improvements Bug fixes ~~~~~~~~~ - +- Bug in :meth:`DataFrameGroupBy.apply` raising error with ``np.nan`` group(s) when ``dropna=False`` (:issue:`35889`) +- Categorical ^^^^^^^^^^^ diff --git a/pandas/core/reshape/concat.py b/pandas/core/reshape/concat.py index 30486b6e2bff0..9b94dae8556f6 100644 --- a/pandas/core/reshape/concat.py +++ b/pandas/core/reshape/concat.py @@ -620,16 +620,18 @@ def _make_concat_multiindex(indexes, keys, levels=None, names=None) -> MultiInde codes_list = [] # things are potentially different sizes, so compute the exact codes - # for each level and pass those to MultiIndex.from_arrays. + # for each level and pass those to MultiIndex.from_arrays + for hlevel, level in zip(zipped, levels): to_concat = [] for key, index in zip(hlevel, indexes): + # Find matching codes, include matching nan values as equal. mask = (isna(level) & isna(key)) | (level == key) if not mask.any(): raise ValueError(f"Key {key} not in level {level}") i = np.nonzero(mask)[0][0] - to_concat.append(np.repeat(i, len(index))) + to_concat.append(np.repeat(i, len(index))) codes_list.append(np.concatenate(to_concat)) concat_index = _concat_indexes(indexes)