From a2d5f957d50398dfb7b40b1b3bdccce5cf3762b3 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Tue, 5 Jan 2021 14:55:53 -0800 Subject: [PATCH 1/5] Add error message including expected string to properly fallback --- pandas/core/groupby/ops.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 45897666b6ccf..2c0ba5b05c19b 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -540,7 +540,9 @@ def _ea_wrap_cython_operation( result = type(orig_values)._from_sequence(res_values) return result - raise NotImplementedError(values.dtype) + raise NotImplementedError( + f"function is not implemented for this dtype: {values.dtype}" + ) @final def _cython_operation( From b826e7d3ac2e002bfd64cc23d08c8ecc40f86be1 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Wed, 6 Jan 2021 11:42:40 -0800 Subject: [PATCH 2/5] Added base test for extension array groupby agg --- pandas/tests/extension/base/groupby.py | 9 +++++++++ pandas/tests/extension/test_boolean.py | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index 94d0ef7bbea84..b7ecff4126250 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -33,6 +33,15 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): expected = expected.reset_index() self.assert_frame_equal(result, expected) + def test_groupby_agg_extension(self, data_for_grouping): + # GH#38980 groupby agg on extension type fails for non-numeric types + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) + result = df.groupby("A").agg({"B": "first"}).B.array + + expected = df["B"].iloc[[0, 2, 4, 7]].array + + self.assert_extension_array_equal(result, expected) + def test_groupby_extension_no_sort(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) 
result = df.groupby("B", sort=False).A.mean() diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index ced7ea9261310..edb30de38ed49 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -291,6 +291,15 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): expected = expected.reset_index() self.assert_frame_equal(result, expected) + def test_groupby_agg_extension(self, data_for_grouping): + # GH#38980 groupby agg on extension type fails for non-numeric types + df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) + result = df.groupby("A").agg({"B": "first"}).B.array + + expected = df["B"].iloc[[0, 2, 4]].array + + self.assert_extension_array_equal(result, expected) + def test_groupby_extension_no_sort(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) result = df.groupby("B", sort=False).A.mean() From 579d8e25aab3b645445b902bbba604ba26da9ad7 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Sun, 10 Jan 2021 21:37:52 -0800 Subject: [PATCH 3/5] Marked tests for decimal,json as xfail --- pandas/tests/extension/decimal/test_decimal.py | 4 ++++ pandas/tests/extension/json/test_json.py | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/pandas/tests/extension/decimal/test_decimal.py b/pandas/tests/extension/decimal/test_decimal.py index 233b658d29782..08768bda312ba 100644 --- a/pandas/tests/extension/decimal/test_decimal.py +++ b/pandas/tests/extension/decimal/test_decimal.py @@ -197,6 +197,10 @@ class TestGroupby(BaseDecimal, base.BaseGroupbyTests): def test_groupby_apply_identity(self, data_for_grouping): super().test_groupby_apply_identity(data_for_grouping) + @pytest.mark.xfail(reason="GH#39098: Converts agg result to object") + def test_groupby_agg_extension(self, data_for_grouping): + super().test_groupby_agg_extension(data_for_grouping) + class TestSetitem(BaseDecimal, base.BaseSetitemTests): 
pass diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index 3a5e49796c53b..164a39498ec73 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -313,6 +313,10 @@ def test_groupby_extension_apply(self): def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) + @pytest.mark.xfail(reason="GH#39098: Converts agg result to object") + def test_groupby_agg_extension(self, data_for_grouping): + super().test_groupby_agg_extension(data_for_grouping) + class TestArithmeticOps(BaseJSON, base.BaseArithmeticOpsTests): def test_error(self, data, all_arithmetic_operators): From 00181c91cd86924cc2185b3b509ec1336f16a7b3 Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Mon, 11 Jan 2021 21:59:42 -0800 Subject: [PATCH 4/5] Compare resulting DataFrame in tests --- pandas/tests/extension/base/groupby.py | 13 ++++++++++--- pandas/tests/extension/test_boolean.py | 13 ++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/pandas/tests/extension/base/groupby.py b/pandas/tests/extension/base/groupby.py index b7ecff4126250..c81304695f353 100644 --- a/pandas/tests/extension/base/groupby.py +++ b/pandas/tests/extension/base/groupby.py @@ -36,11 +36,18 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): def test_groupby_agg_extension(self, data_for_grouping): # GH#38980 groupby agg on extension type fails for non-numeric types df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) - result = df.groupby("A").agg({"B": "first"}).B.array - expected = df["B"].iloc[[0, 2, 4, 7]].array + expected = df.iloc[[0, 2, 4, 7]] + expected = expected.set_index("A") - self.assert_extension_array_equal(result, expected) + result = df.groupby("A").agg({"B": "first"}) + self.assert_frame_equal(result, expected) + + result = df.groupby("A").agg("first") + self.assert_frame_equal(result, 
expected) + + result = df.groupby("A").first() + self.assert_frame_equal(result, expected) def test_groupby_extension_no_sort(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1, 4], "B": data_for_grouping}) diff --git a/pandas/tests/extension/test_boolean.py b/pandas/tests/extension/test_boolean.py index edb30de38ed49..86a0bc9213256 100644 --- a/pandas/tests/extension/test_boolean.py +++ b/pandas/tests/extension/test_boolean.py @@ -294,11 +294,18 @@ def test_groupby_extension_agg(self, as_index, data_for_grouping): def test_groupby_agg_extension(self, data_for_grouping): # GH#38980 groupby agg on extension type fails for non-numeric types df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) - result = df.groupby("A").agg({"B": "first"}).B.array - expected = df["B"].iloc[[0, 2, 4]].array + expected = df.iloc[[0, 2, 4]] + expected = expected.set_index("A") - self.assert_extension_array_equal(result, expected) + result = df.groupby("A").agg({"B": "first"}) + self.assert_frame_equal(result, expected) + + result = df.groupby("A").agg("first") + self.assert_frame_equal(result, expected) + + result = df.groupby("A").first() + self.assert_frame_equal(result, expected) def test_groupby_extension_no_sort(self, data_for_grouping): df = pd.DataFrame({"A": [1, 1, 2, 2, 3, 3, 1], "B": data_for_grouping}) From 6ba43dc1f7fa5022398c60ec45eb6ad93dc0fdcf Mon Sep 17 00:00:00 2001 From: Bryan Cutler Date: Fri, 8 Jan 2021 17:32:57 -0800 Subject: [PATCH 5/5] Added whatsnew entry --- doc/source/whatsnew/v1.2.1.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.1.rst b/doc/source/whatsnew/v1.2.1.rst index 36b4b4fa77c4a..849b599141c2b 100644 --- a/doc/source/whatsnew/v1.2.1.rst +++ b/doc/source/whatsnew/v1.2.1.rst @@ -28,6 +28,7 @@ Fixed regressions - Fixed regression in :meth:`DataFrame.replace` raising ``ValueError`` when :class:`DataFrame` has dtype ``bytes`` (:issue:`38900`) - Fixed regression in 
:meth:`DataFrameGroupBy.diff` raising for ``int8`` and ``int16`` columns (:issue:`39050`) - Fixed regression that raised ``AttributeError`` with PyArrow versions [0.16.0, 1.0.0) (:issue:`38801`) +- Fixed regression in :meth:`DataFrame.groupby` when aggregating an :class:`ExtensionDtype` that could fail for non-numeric values (:issue:`38980`) - -