Skip to content

Commit e0964c2

Browse files
authored
BUG: Series.groupby.size returning int64 for masked and arrow types (#54132)
* BUG: Series.groupby.size returning int64 for masked and arrow types * typing * Add GH issue ref * Use convert_dtypes * Remove typing * Type better
1 parent 289e081 commit e0964c2

File tree

4 files changed

+35
-0
lines changed

4 files changed

+35
-0
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -565,6 +565,7 @@ Groupby/resample/rolling
565565
- Bug in :meth:`DataFrameGroupBy.apply` raising a ``TypeError`` when selecting multiple columns and providing a function that returns ``np.ndarray`` results (:issue:`18930`)
566566
- Bug in :meth:`GroupBy.groups` with a datetime key in conjunction with another key produced incorrect number of group keys (:issue:`51158`)
567567
- Bug in :meth:`GroupBy.quantile` may implicitly sort the result index with ``sort=False`` (:issue:`53009`)
568+
- Bug in :meth:`SeriesGroupBy.size` where the dtype would be ``np.int64`` for data with :class:`ArrowDtype` or masked dtypes (e.g. ``Int64``) (:issue:`53831`)
568569
- Bug in :meth:`GroupBy.var` failing to raise ``TypeError`` when called with datetime64, timedelta64 or :class:`PeriodDtype` values (:issue:`52128`, :issue:`53045`)
569570
- Bug in :meth:`DataFrameGroupBy.resample` with ``kind="period"`` raising ``AttributeError`` (:issue:`24103`)
570571
- Bug in :meth:`Resampler.ohlc` with empty object returning a :class:`Series` instead of empty :class:`DataFrame` (:issue:`42902`)

pandas/core/groupby/groupby.py

+17
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ class providing the base-class of operations.
9999
from pandas.core._numba import executor
100100
from pandas.core.apply import warn_alias_replacement
101101
from pandas.core.arrays import (
102+
ArrowExtensionArray,
102103
BaseMaskedArray,
103104
Categorical,
104105
ExtensionArray,
@@ -2930,13 +2931,29 @@ def size(self) -> DataFrame | Series:
29302931
Freq: MS, dtype: int64
29312932
"""
29322933
result = self.grouper.size()
2934+
dtype_backend: None | Literal["pyarrow", "numpy_nullable"] = None
2935+
if isinstance(self.obj, Series):
2936+
if isinstance(self.obj.array, ArrowExtensionArray):
2937+
dtype_backend = "pyarrow"
2938+
elif isinstance(self.obj.array, BaseMaskedArray):
2939+
dtype_backend = "numpy_nullable"
2940+
# TODO: For DataFrames, decide which dtype backend to use when columns mix arrow, numpy, and masked dtypes.
29332941

29342942
# GH28330 preserve subclassed Series/DataFrames through calls
29352943
if isinstance(self.obj, Series):
29362944
result = self._obj_1d_constructor(result, name=self.obj.name)
29372945
else:
29382946
result = self._obj_1d_constructor(result)
29392947

2948+
if dtype_backend is not None:
2949+
result = result.convert_dtypes(
2950+
infer_objects=False,
2951+
convert_string=False,
2952+
convert_boolean=False,
2953+
convert_floating=False,
2954+
dtype_backend=dtype_backend,
2955+
)
2956+
29402957
with com.temp_setattr(self, "as_index", True):
29412958
# size already has the desired behavior in GH#49519, but this makes the
29422959
# as_index=False path of _reindex_output fail on categorical groupers.

pandas/tests/extension/test_arrow.py

+8
Original file line numberDiff line numberDiff line change
@@ -3122,6 +3122,14 @@ def test_iter_temporal(pa_type):
31223122
assert result == expected
31233123

31243124

3125+
def test_groupby_series_size_returns_pa_int(data):
3126+
# GH 54132
3127+
ser = pd.Series(data[:3], index=["a", "a", "b"])
3128+
result = ser.groupby(level=0).size()
3129+
expected = pd.Series([2, 1], dtype="int64[pyarrow]", index=["a", "b"])
3130+
tm.assert_series_equal(result, expected)
3131+
3132+
31253133
@pytest.mark.parametrize(
31263134
"pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
31273135
)

pandas/tests/groupby/test_size.py

+9
Original file line numberDiff line numberDiff line change
@@ -95,3 +95,12 @@ def test_size_on_categorical(as_index):
9595
expected = expected.set_index(["A", "B"])["size"].rename(None)
9696

9797
tm.assert_equal(result, expected)
98+
99+
100+
@pytest.mark.parametrize("dtype", ["Int64", "Float64", "boolean"])
101+
def test_size_series_masked_type_returns_Int64(dtype):
102+
# GH 54132
103+
ser = Series([1, 1, 1], index=["a", "a", "b"], dtype=dtype)
104+
result = ser.groupby(level=0).size()
105+
expected = Series([2, 1], dtype="Int64", index=["a", "b"])
106+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)