Skip to content

Commit 3ade717

Browse files
phofl, lithomas1
authored and committed
BUG: Groupby not keeping string dtype for empty objects (pandas-dev#55619)
* BUG: Groupby not keeping string dtype for empty objects
* Fix
---------
Co-authored-by: Thomas Li <[email protected]>
(cherry picked from commit 8afd868)
1 parent 8059d86 commit 3ade717

File tree

4 files changed

+30
-7
lines changed

4 files changed

+30
-7
lines changed

doc/source/whatsnew/v2.1.2.rst

+1
Original file line number | Diff line number | Diff line change
@@ -36,6 +36,7 @@ Fixed regressions
3636
Bug fixes
3737
~~~~~~~~~
3838
- Fixed bug in :class:`.DataFrameGroupBy` reductions not preserving object dtype when ``infer_string`` is set (:issue:`55620`)
39+
- Fixed bug in :meth:`.DataFrameGroupBy.min()` and :meth:`.DataFrameGroupBy.max()` not preserving extension dtype for empty object (:issue:`55619`)
3940
- Fixed bug in :meth:`.SeriesGroupBy.value_counts` returning incorrect dtype for string columns (:issue:`55627`)
4041
- Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`)
4142
- Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`)

pandas/core/arrays/base.py

+3
Original file line number | Diff line number | Diff line change
@@ -2235,6 +2235,9 @@ def _groupby_op(
22352235
# GH#43682
22362236
if isinstance(self.dtype, StringDtype):
22372237
# StringArray
2238+
if op.how not in ["any", "all"]:
2239+
# Fail early to avoid conversion to object
2240+
op._get_cython_function(op.kind, op.how, np.dtype(object), False)
22382241
npvalues = self.to_numpy(object, na_value=np.nan)
22392242
else:
22402243
raise NotImplementedError(

pandas/core/groupby/ops.py

+13-7
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,7 @@
3333
from pandas.errors import AbstractMethodError
3434
from pandas.util._decorators import cache_readonly
3535

36+
from pandas.core.dtypes.base import ExtensionDtype
3637
from pandas.core.dtypes.cast import (
3738
maybe_cast_pointwise_result,
3839
maybe_downcast_to_dtype,
@@ -837,10 +838,8 @@ def agg_series(
837838
-------
838839
np.ndarray or ExtensionArray
839840
"""
840-
# test_groupby_empty_with_category gets here with self.ngroups == 0
841-
# and len(obj) > 0
842841

843-
if len(obj) > 0 and not isinstance(obj._values, np.ndarray):
842+
if not isinstance(obj._values, np.ndarray):
844843
# we can preserve a little bit more aggressively with EA dtype
845844
# because maybe_cast_pointwise_result will do a try/except
846845
# with _from_sequence. NB we are assuming here that _from_sequence
@@ -849,11 +848,18 @@ def agg_series(
849848

850849
result = self._aggregate_series_pure_python(obj, func)
851850

852-
npvalues = lib.maybe_convert_objects(result, try_float=False)
853-
if preserve_dtype:
854-
out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
851+
if len(obj) == 0 and len(result) == 0 and isinstance(obj.dtype, ExtensionDtype):
852+
cls = obj.dtype.construct_array_type()
853+
out = cls._from_sequence(result)
854+
855855
else:
856-
out = npvalues
856+
npvalues = lib.maybe_convert_objects(result, try_float=False)
857+
if preserve_dtype:
858+
out = maybe_cast_pointwise_result(
859+
npvalues, obj.dtype, numeric_only=True
860+
)
861+
else:
862+
out = npvalues
857863
return out
858864

859865
@final

pandas/tests/groupby/test_function.py

+13
Original file line number | Diff line number | Diff line change
@@ -1670,6 +1670,19 @@ def test_groupby_empty_dataset(dtype, kwargs):
16701670
tm.assert_frame_equal(result, expected)
16711671

16721672

1673+
@pytest.mark.parametrize("func", ["min", "max"])
1674+
def test_min_empty_string_dtype(func):
1675+
# GH#55619
1676+
pytest.importorskip("pyarrow")
1677+
dtype = "string[pyarrow_numpy]"
1678+
df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0]
1679+
result = getattr(df.groupby("a"), func)()
1680+
expected = DataFrame(
1681+
columns=["b", "c"], dtype=dtype, index=Index([], dtype=dtype, name="a")
1682+
)
1683+
tm.assert_frame_equal(result, expected)
1684+
1685+
16731686
def test_corrwith_with_1_axis():
16741687
# GH 47723
16751688
df = DataFrame({"a": [1, 1, 2], "b": [3, 7, 4]})

0 commit comments

Comments
 (0)