From e8c6e4cda5c49b78e800cc842256ed4668d8d79b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jul 2023 16:59:23 -0700 Subject: [PATCH 1/6] BUG: Series.groupby.size returning int64 for masked and arrow types --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/groupby/groupby.py | 14 ++++++++++++-- pandas/tests/extension/test_arrow.py | 7 +++++++ pandas/tests/groupby/test_size.py | 8 ++++++++ 4 files changed, 28 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index b7cc254d5c7e5..3c2c7f3147584 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -553,6 +553,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupBy.apply` raising a ``TypeError`` when selecting multiple columns and providing a function that returns ``np.ndarray`` results (:issue:`18930`) - Bug in :meth:`GroupBy.groups` with a datetime key in conjunction with another key produced incorrect number of group keys (:issue:`51158`) - Bug in :meth:`GroupBy.quantile` may implicitly sort the result index with ``sort=False`` (:issue:`53009`) +- Bug in :meth:`SeriesGroupBy.size` where the dtype would be ``np.int64`` for data with :class:`ArrowDtype` or masked dtypes (e.g. ``Int64``) (:issue:`53831`) - Bug in :meth:`GroupBy.var` failing to raise ``TypeError`` when called with datetime64, timedelta64 or :class:`PeriodDtype` values (:issue:`52128`, :issue:`53045`) - Bug in :meth:`DataFrameGroupby.resample` with ``kind="period"`` raising ``AttributeError`` (:issue:`24103`) - Bug in :meth:`Resampler.ohlc` with empty object returning a :class:`Series` instead of empty :class:`DataFrame` (:issue:`42902`) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 85ec8c1b86374..183e8cd402dda 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -99,6 +99,7 @@ class providing the base-class of operations. 
from pandas.core._numba import executor from pandas.core.apply import warn_alias_replacement from pandas.core.arrays import ( + ArrowExtensionArray, BaseMaskedArray, Categorical, ExtensionArray, @@ -2890,12 +2891,21 @@ def size(self) -> DataFrame | Series: Freq: MS, dtype: int64 """ result = self.grouper.size() + result_dtype = result.dtype + if isinstance(self.obj, Series): + if isinstance(self.obj.array, ArrowExtensionArray): + result_dtype = "int64[pyarrow]" + elif isinstance(self.obj.array, BaseMaskedArray): + result_dtype = "Int64" + # TODO: For DataFrames what if columns are mixed arrow/numpy/masked? # GH28330 preserve subclassed Series/DataFrames through calls if isinstance(self.obj, Series): - result = self._obj_1d_constructor(result, name=self.obj.name) + result = self._obj_1d_constructor( + result, name=self.obj.name, dtype=result_dtype + ) else: - result = self._obj_1d_constructor(result) + result = self._obj_1d_constructor(result, dtype=result_dtype) with com.temp_setattr(self, "as_index", True): # size already has the desired behavior in GH#49519, but this makes the diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7b2bedc531076..7c19343f15a04 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3121,6 +3121,13 @@ def test_iter_temporal(pa_type): assert result == expected +def test_groupby_series_size_returns_pa_int(data): + ser = pd.Series(data[:3], index=["a", "a", "b"]) + result = ser.groupby(level=0).size() + expected = pd.Series([2, 1], dtype="int64[pyarrow]", index=["a", "b"]) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize( "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES ) diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index b96fe41c26c3e..836eed990cb54 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -95,3 +95,11 @@ def 
test_size_on_categorical(as_index): expected = expected.set_index(["A", "B"])["size"].rename(None) tm.assert_equal(result, expected) + + +@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) +def test_size_series_masked_type_returns_Int64(dtype): + ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype) + result = ser.groupby(level=0).size() + expected = Series([2, 1], dtype="Int64", index=["a", "b"]) + tm.assert_series_equal(result, expected) From 1abafb9d579fff74c65a80e5293cd806c481ebd4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 14 Jul 2023 17:47:22 -0700 Subject: [PATCH 2/6] typing: --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 183e8cd402dda..d9561b9a3943d 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2891,7 +2891,7 @@ def size(self) -> DataFrame | Series: Freq: MS, dtype: int64 """ result = self.grouper.size() - result_dtype = result.dtype + result_dtype: str | np.dtype = result.dtype if isinstance(self.obj, Series): if isinstance(self.obj.array, ArrowExtensionArray): result_dtype = "int64[pyarrow]" From 07147e0354509b225daa39eb31949a8d77275b2e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 17 Jul 2023 11:30:00 -0700 Subject: [PATCH 3/6] Add GH issue ref --- pandas/tests/extension/test_arrow.py | 1 + pandas/tests/groupby/test_size.py | 1 + 2 files changed, 2 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7c19343f15a04..f9756810a8988 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -3122,6 +3122,7 @@ def test_iter_temporal(pa_type): def test_groupby_series_size_returns_pa_int(data): + # GH 54132 ser = pd.Series(data[:3], index=["a", "a", "b"]) result = 
ser.groupby(level=0).size() expected = pd.Series([2, 1], dtype="int64[pyarrow]", index=["a", "b"]) diff --git a/pandas/tests/groupby/test_size.py b/pandas/tests/groupby/test_size.py index 836eed990cb54..e7598ec34fa15 100644 --- a/pandas/tests/groupby/test_size.py +++ b/pandas/tests/groupby/test_size.py @@ -99,6 +99,7 @@ def test_size_on_categorical(as_index): @pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"]) def test_size_series_masked_type_returns_Int64(dtype): + # GH 54132 ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype) result = ser.groupby(level=0).size() expected = Series([2, 1], dtype="Int64", index=["a", "b"]) From 9af9958e8332f966093021469f0bc922ec627eea Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 18 Jul 2023 11:01:04 -0700 Subject: [PATCH 4/6] Use convert_dtypes --- pandas/core/groupby/groupby.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 49d5567efeede..d6c6a273dc385 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2931,21 +2931,28 @@ def size(self) -> DataFrame | Series: Freq: MS, dtype: int64 """ result = self.grouper.size() - result_dtype: str | np.dtype = result.dtype + dtype_backend: str | None = None if isinstance(self.obj, Series): if isinstance(self.obj.array, ArrowExtensionArray): - result_dtype = "int64[pyarrow]" + dtype_backend = "pyarrow" elif isinstance(self.obj.array, BaseMaskedArray): - result_dtype = "Int64" + dtype_backend = "numpy_nullable" # TODO: For DataFrames what if columns are mixed arrow/numpy/masked? 
# GH28330 preserve subclassed Series/DataFrames through calls if isinstance(self.obj, Series): - result = self._obj_1d_constructor( - result, name=self.obj.name, dtype=result_dtype - ) + result = self._obj_1d_constructor(result, name=self.obj.name) else: - result = self._obj_1d_constructor(result, dtype=result_dtype) + result = self._obj_1d_constructor(result) + + if dtype_backend is not None: + result = result.convert_dtypes( + infer_objects=False, + convert_string=False, + convert_boolean=False, + convert_floating=False, + dtype_backend=dtype_backend, + ) with com.temp_setattr(self, "as_index", True): # size already has the desired behavior in GH#49519, but this makes the From 3783220c9dd2384428e7adc98cf97caff248568e Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 18 Jul 2023 12:08:40 -0700 Subject: [PATCH 5/6] Remove typing --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d6c6a273dc385..17b1e041deff3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2931,7 +2931,7 @@ def size(self) -> DataFrame | Series: Freq: MS, dtype: int64 """ result = self.grouper.size() - dtype_backend: str | None = None + dtype_backend = None if isinstance(self.obj, Series): if isinstance(self.obj.array, ArrowExtensionArray): dtype_backend = "pyarrow" From 12ccd573623c773694c8f01f50dad55c633cce4d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 18 Jul 2023 12:12:20 -0700 Subject: [PATCH 6/6] Type better --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 17b1e041deff3..64d874a31c428 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -2931,7 +2931,7 @@ def size(self) -> 
DataFrame | Series: Freq: MS, dtype: int64 """ result = self.grouper.size() - dtype_backend = None + dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None if isinstance(self.obj, Series): if isinstance(self.obj.array, ArrowExtensionArray): dtype_backend = "pyarrow"