From 0cb459c5763c794c669cc217aec6e07dc9cae97a Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 21 Oct 2023 20:07:18 +0200 Subject: [PATCH 1/2] BUG: Groupby not keeping string dtype for empty objects --- doc/source/whatsnew/v2.1.2.rst | 1 + pandas/core/arrays/base.py | 2 ++ pandas/core/groupby/ops.py | 20 +++++++++++++------- pandas/tests/groupby/test_reductions.py | 13 +++++++++++++ 4 files changed, 29 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v2.1.2.rst b/doc/source/whatsnew/v2.1.2.rst index 97a718dd496e9..0f8ba33160e72 100644 --- a/doc/source/whatsnew/v2.1.2.rst +++ b/doc/source/whatsnew/v2.1.2.rst @@ -23,6 +23,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ +- Fixed bug in :meth:`.DataFrameGroupBy.min` and :meth:`.DataFrameGroupBy.max` not preserving extension dtype for empty objects (:issue:`55619`) - Fixed bug in :meth:`Categorical.equals` if other has arrow backed string dtype (:issue:`55364`) - Fixed bug in :meth:`DataFrame.__setitem__` not inferring string dtype for zero-dimensional array with ``infer_string=True`` (:issue:`55366`) - Fixed bug in :meth:`DataFrame.idxmin` and :meth:`DataFrame.idxmax` raising for arrow dtypes (:issue:`55368`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 05e6fc09a5ef6..b48250ca95df6 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2352,6 +2352,8 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray + # Fail early to avoid conversion to object + op._get_cython_function(op.kind, op.how, np.dtype(object), False) npvalues = self.to_numpy(object, na_value=np.nan) else: raise NotImplementedError( diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 607059e5183ec..e4cba7ce8f1cd 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -33,6 +33,7 @@ from pandas.errors import AbstractMethodError from pandas.util._decorators import
cache_readonly +from pandas.core.dtypes.base import ExtensionDtype from pandas.core.dtypes.cast import ( maybe_cast_pointwise_result, maybe_downcast_to_dtype, @@ -837,10 +838,8 @@ def agg_series( ------- np.ndarray or ExtensionArray """ - # test_groupby_empty_with_category gets here with self.ngroups == 0 - # and len(obj) > 0 - if len(obj) > 0 and not isinstance(obj._values, np.ndarray): + if not isinstance(obj._values, np.ndarray): # we can preserve a little bit more aggressively with EA dtype # because maybe_cast_pointwise_result will do a try/except # with _from_sequence. NB we are assuming here that _from_sequence @@ -849,11 +848,18 @@ def agg_series( result = self._aggregate_series_pure_python(obj, func) - npvalues = lib.maybe_convert_objects(result, try_float=False) - if preserve_dtype: - out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) + if len(obj) == 0 and len(result) == 0 and isinstance(obj.dtype, ExtensionDtype): + cls = obj.dtype.construct_array_type() + out = cls._from_sequence(result) + else: - out = npvalues + npvalues = lib.maybe_convert_objects(result, try_float=False) + if preserve_dtype: + out = maybe_cast_pointwise_result( + npvalues, obj.dtype, numeric_only=True + ) + else: + out = npvalues return out @final diff --git a/pandas/tests/groupby/test_reductions.py b/pandas/tests/groupby/test_reductions.py index fdfb211ac2269..35ad8e3f5dc61 100644 --- a/pandas/tests/groupby/test_reductions.py +++ b/pandas/tests/groupby/test_reductions.py @@ -575,6 +575,19 @@ def test_groupby_min_max_categorical(func): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("func", ["min", "max"]) +def test_min_empty_string_dtype(func): + # GH#55619 + pytest.importorskip("pyarrow") + dtype = "string[pyarrow_numpy]" + df = DataFrame({"a": ["a"], "b": "a", "c": "a"}, dtype=dtype).iloc[:0] + result = getattr(df.groupby("a"), func)() + expected = DataFrame( + columns=["b", "c"], dtype=dtype, index=pd.Index([], dtype=dtype, name="a") + ) 
+ tm.assert_frame_equal(result, expected) + + def test_max_nan_bug(): raw = """,Date,app,File -04-23,2013-04-23 00:00:00,,log080001.log From 506c2b22a3f8b02bc5d3c3ad74b15a6fc971e156 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Sat, 21 Oct 2023 21:20:36 +0200 Subject: [PATCH 2/2] Fix: skip early cython function check for any/all on StringDtype --- pandas/core/arrays/base.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b48250ca95df6..3d97711d5f8c3 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -2352,8 +2352,9 @@ def _groupby_op( # GH#43682 if isinstance(self.dtype, StringDtype): # StringArray - # Fail early to avoid conversion to object - op._get_cython_function(op.kind, op.how, np.dtype(object), False) + if op.how not in ["any", "all"]: + # Fail early to avoid conversion to object + op._get_cython_function(op.kind, op.how, np.dtype(object), False) npvalues = self.to_numpy(object, na_value=np.nan) else: raise NotImplementedError(