From debed82067c88f218cc0af7347c2b2a2bdec6916 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sun, 12 Jul 2020 23:24:57 +0100 Subject: [PATCH 1/5] #34951 bug fixed and test added --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/groupby/generic.py | 2 +- pandas/tests/groupby/test_categorical.py | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 5f93e08d51baa..b56d7c2599033 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1091,6 +1091,7 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) +- Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise ``ValueError`` grouping on multiple ``Categoricals`` (:issue:`34951`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1f49ee2b0b665..9acea86b14103 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1058,7 +1058,7 @@ def _cython_agg_blocks( # reductions; see GH#28949 obj = obj.iloc[:, 0] - s = get_groupby(obj, self.grouper) + s = get_groupby(obj, self.grouper, observed=True) try: result = s.aggregate(lambda x: alt(x, axis=self.axis)) except TypeError: diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 118d928ac02f4..b51f84618e977 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1669,3 +1669,26 @@ def test_categorical_transform(): expected["status"] = expected["status"].astype(delivery_status_type) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("func", ["first", "last"]) +def test_groupby_first_on_categorical_col_grouped_on_2_categoricals(func: str): + + cat = pd.Categorical([0, 0, 1, 1]) + val = [0, 1, 1, 0] + df = pd.DataFrame({"a": cat, "b": cat, "c": val}) + + idx = pd.Categorical([0, 1]) + idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"]) + expected = { + "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"), + "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"), + } + + df_grp = df.groupby(["a", "b"]) + df_res = getattr(df_grp, func)() + tm.assert_frame_equal(df_res, expected[func].to_frame()) + + srs_grp = df_grp["c"] + srs_res = getattr(srs_grp, func)() + tm.assert_series_equal(srs_res, expected[func]) From b6903c6df39b36d857ffbae56646485b60cd5a7f Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 13 Jul 2020 00:06:06 +0100 Subject: [PATCH 2/5] issue number in test --- pandas/tests/groupby/test_categorical.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b51f84618e977..b5d77a04da63d 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1673,6 +1673,7 @@ def test_categorical_transform(): @pytest.mark.parametrize("func", ["first", "last"]) def test_groupby_first_on_categorical_col_grouped_on_2_categoricals(func: str): + # GH 34951 cat = pd.Categorical([0, 0, 1, 1]) val = [0, 1, 1, 0] @@ -1684,6 +1685,7 @@ def test_groupby_first_on_categorical_col_grouped_on_2_categoricals(func: str): "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"), "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"), } + import pdb; pdb.set_trace() df_grp = df.groupby(["a", "b"]) df_res = getattr(df_grp, func)() From fe2a8ba70c0d0889586809ae47cedd58ccfab639 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 13 Jul 2020 00:14:20 +0100 Subject: [PATCH 3/5] removed errant pdb.set_trace() --- pandas/tests/groupby/test_categorical.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index b5d77a04da63d..a87de65800008 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1685,7 +1685,6 @@ def test_groupby_first_on_categorical_col_grouped_on_2_categoricals(func: str): "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"), "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"), } - import pdb; pdb.set_trace() df_grp = df.groupby(["a", "b"]) df_res = getattr(df_grp, func)() From 7bbb3be8a27608cf4f530e10a7a7310b74015fc1 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 13 Jul 2020 20:43:30 +0100 Subject: [PATCH 4/5] addressing comments --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/groupby/generic.py | 4 ++++ pandas/tests/groupby/test_categorical.py | 25 ++++++++++++++++-------- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index b56d7c2599033..85b29a58a1f15 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1091,7 +1091,7 @@ Groupby/resample/rolling - Bug in :meth:`Rolling.apply` where ``center=True`` was ignored when ``engine='numba'`` was specified (:issue:`34784`) - Bug in :meth:`DataFrame.ewm.cov` was throwing ``AssertionError`` for :class:`MultiIndex` inputs (:issue:`34440`) - Bug in :meth:`core.groupby.DataFrameGroupBy.transform` when ``func='nunique'`` and columns are of type ``datetime64``, the result would also be of type ``datetime64`` instead of ``int64`` (:issue:`35109`) -- Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise ``ValueError`` grouping on multiple ``Categoricals`` (:issue:`34951`) +- Bug in :meth:'DataFrameGroupBy.first' and :meth:'DataFrameGroupBy.last' that would raise an unnecessary ``ValueError`` when grouping on multiple ``Categoricals`` (:issue:`34951`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 9acea86b14103..093e1d4ab3942 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1058,6 +1058,10 @@ def _cython_agg_blocks( # reductions; see GH#28949 obj = obj.iloc[:, 0] + # Create SeriesGroupBy with observed=True so that it does + # not try to add missing categories if grouping over multiple + # Categoricals. This will done by later self._reindex_output() + # Doing it here creates an error. See GH#34951 s = get_groupby(obj, self.grouper, observed=True) try: result = s.aggregate(lambda x: alt(x, axis=self.axis)) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index a87de65800008..f2be125632ad7 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1672,7 +1672,9 @@ def test_categorical_transform(): @pytest.mark.parametrize("func", ["first", "last"]) -def test_groupby_first_on_categorical_col_grouped_on_2_categoricals(func: str): +def test_groupby_first_on_categorical_col_grouped_on_2_categoricals( + func: str, observed: bool +): # GH 34951 cat = pd.Categorical([0, 0, 1, 1]) @@ -1681,15 +1683,22 @@ def test_groupby_first_on_categorical_col_grouped_on_2_categoricals(func: str): idx = pd.Categorical([0, 1]) idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"]) - expected = { + expected_dict = { "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"), "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"), } - df_grp = df.groupby(["a", "b"]) - df_res = getattr(df_grp, func)() - tm.assert_frame_equal(df_res, expected[func].to_frame()) + expected = expected_dict[func] + if observed: + expected = expected.dropna().astype(np.int64) + + # Check SeriesGroupBy + srs_grp = df.groupby(["a", "b"], observed=observed)["c"] + result = getattr(srs_grp, func)() + tm.assert_series_equal(result, expected) - srs_grp = df_grp["c"] - srs_res = getattr(srs_grp, func)() - tm.assert_series_equal(srs_res, expected[func]) + # Check DataFrameGroupBy + df_grp = df.groupby(["a", "b"], observed=observed) + result = getattr(df_grp, func)() + expected = expected.to_frame() + tm.assert_frame_equal(result, expected) From d098586b1ca08cd628ab39f0c3ccfd2fc1420fe6 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 13 Jul 2020 21:18:20 +0100 Subject: [PATCH 5/5] Typing Validation failed. Split new test into separate tests for SeriesGroupBy and DataFrameGroupBy --- pandas/tests/groupby/test_categorical.py | 27 +++++++++++++++++++----- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index f2be125632ad7..7e4513da37dc9 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1672,11 +1672,10 @@ def test_categorical_transform(): @pytest.mark.parametrize("func", ["first", "last"]) -def test_groupby_first_on_categorical_col_grouped_on_2_categoricals( +def test_series_groupby_first_on_categorical_col_grouped_on_2_categoricals( func: str, observed: bool ): # GH 34951 - cat = pd.Categorical([0, 0, 1, 1]) val = [0, 1, 1, 0] df = pd.DataFrame({"a": cat, "b": cat, "c": val}) @@ -1692,13 +1691,31 @@ def test_groupby_first_on_categorical_col_grouped_on_2_categoricals( if observed: expected = expected.dropna().astype(np.int64) - # Check SeriesGroupBy srs_grp = df.groupby(["a", "b"], observed=observed)["c"] result = getattr(srs_grp, func)() tm.assert_series_equal(result, expected) - # Check DataFrameGroupBy + +@pytest.mark.parametrize("func", ["first", "last"]) +def test_df_groupby_first_on_categorical_col_grouped_on_2_categoricals( + func: str, observed: bool +): + # GH 34951 + cat = pd.Categorical([0, 0, 1, 1]) + val = [0, 1, 1, 0] + df = pd.DataFrame({"a": cat, "b": cat, "c": val}) + + idx = pd.Categorical([0, 1]) + idx = pd.MultiIndex.from_product([idx, idx], names=["a", "b"]) + expected_dict = { + "first": pd.Series([0, np.NaN, np.NaN, 1], idx, name="c"), + "last": pd.Series([1, np.NaN, np.NaN, 0], idx, name="c"), + } + + expected = expected_dict[func].to_frame() + if observed: + expected = expected.dropna().astype(np.int64) + df_grp = df.groupby(["a", "b"], observed=observed) result = getattr(df_grp, func)() - expected = expected.to_frame() tm.assert_frame_equal(result, expected)