diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 08b3175ad0ad6..a33c043915e27 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -461,6 +461,7 @@ Groupby/resample/rolling - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`) - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`) - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`) +- Bug in :meth:`.DataFrameGroupBy.agg` and :meth:`.SeriesGroupBy.agg` that was returning results with a NumPy dtype when the input values have a PyArrow dtype, instead of returning results with a PyArrow dtype. (:issue:`53030`) - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`) - Bug in :meth:`DataFrameGroupBy.apply` with ``as_index=False`` that was returning :class:`MultiIndex` instead of returning :class:`Index`. (:issue:`58291`) - Bug in :meth:`DataFrameGroupby.transform` and :meth:`SeriesGroupby.transform` with a reducer and ``observed=False`` that coerces dtype to float when there are unobserved categories.
(:issue:`55326`) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4f40c4f4283f0..c94b59e5b12ac 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -45,12 +45,14 @@ ensure_uint64, is_1d_only_ea_dtype, ) +from pandas.core.dtypes.dtypes import ArrowDtype from pandas.core.dtypes.missing import ( isna, maybe_fill, ) from pandas.core.arrays import Categorical +from pandas.core.arrays.arrow.array import ArrowExtensionArray from pandas.core.frame import DataFrame from pandas.core.groupby import grouper from pandas.core.indexes.api import ( @@ -927,20 +929,29 @@ def agg_series( np.ndarray or ExtensionArray """ - if not isinstance(obj._values, np.ndarray): + result = self._aggregate_series_pure_python(obj, func) + npvalues = lib.maybe_convert_objects(result, try_float=False) + + if isinstance(obj._values, ArrowExtensionArray): + out = maybe_cast_pointwise_result( + npvalues, obj.dtype, numeric_only=True, same_dtype=preserve_dtype + ) + import pyarrow as pa + + if isinstance(out.dtype, ArrowDtype) and pa.types.is_struct( + out.dtype.pyarrow_dtype + ): + out = npvalues + + elif not isinstance(obj._values, np.ndarray): # we can preserve a little bit more aggressively with EA dtype # because maybe_cast_pointwise_result will do a try/except # with _from_sequence. NB we are assuming here that _from_sequence # is sufficiently strict that it casts appropriately. 
- preserve_dtype = True - - result = self._aggregate_series_pure_python(obj, func) - - npvalues = lib.maybe_convert_objects(result, try_float=False) - if preserve_dtype: out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True) else: out = npvalues + return out @final diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py index 3362d6209af6d..8cf71f193d231 100644 --- a/pandas/tests/groupby/aggregate/test_aggregate.py +++ b/pandas/tests/groupby/aggregate/test_aggregate.py @@ -1662,3 +1662,89 @@ def func(x): msg = "length must not be 0" with pytest.raises(ValueError, match=msg): df.groupby("A", observed=False).agg(func) + + +@pytest.mark.parametrize( + "input_dtype, output_dtype", + [ + ("float[pyarrow]", "double[pyarrow]"), + ("int64[pyarrow]", "int64[pyarrow]"), + ("uint64[pyarrow]", "int64[pyarrow]"), + ("bool[pyarrow]", "bool[pyarrow]"), + ], +) +def test_agg_lambda_pyarrow_dtype_conversion(input_dtype, output_dtype): + # GH#53030 + # test numpy dtype conversion back to pyarrow dtype + # complexes, floats, ints, uints, object + df = DataFrame( + { + "A": ["c1", "c2", "c3", "c1", "c2", "c3"], + "B": pd.array([100, 200, 255, 0, 199, 40392], dtype=input_dtype), + } + ) + gb = df.groupby("A") + result = gb.agg(lambda x: x.min()) + + expected = DataFrame( + {"B": pd.array([0, 199, 255], dtype=output_dtype)}, + index=Index(["c1", "c2", "c3"], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_lambda_complex128_dtype_conversion(): + # GH#53030 + df = DataFrame( + {"A": ["c1", "c2", "c3"], "B": pd.array([100, 200, 255], "int64[pyarrow]")} + ) + gb = df.groupby("A") + result = gb.agg(lambda x: complex(x.sum(), x.count())) + + expected = DataFrame( + { + "B": pd.array( + [complex(100, 1), complex(200, 1), complex(255, 1)], dtype="complex128" + ), + }, + index=Index(["c1", "c2", "c3"], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +def 
test_agg_lambda_numpy_uint64_to_pyarrow_dtype_conversion(): + # GH#53030 + df = DataFrame( + { + "A": ["c1", "c2", "c3"], + "B": pd.array([100, 200, 255], dtype="uint64[pyarrow]"), + } + ) + gb = df.groupby("A") + result = gb.agg(lambda x: np.uint64(x.sum())) + + expected = DataFrame( + { + "B": pd.array([100, 200, 255], dtype="uint64[pyarrow]"), + }, + index=Index(["c1", "c2", "c3"], name="A"), + ) + tm.assert_frame_equal(result, expected) + + +def test_agg_lambda_pyarrow_struct_to_object_dtype_conversion(): + # GH#53030 + df = DataFrame( + { + "A": ["c1", "c2", "c3"], + "B": pd.array([100, 200, 255], dtype="int64[pyarrow]"), + } + ) + gb = df.groupby("A") + result = gb.agg(lambda x: {"number": 1}) + + expected = DataFrame( + {"B": pd.array([{"number": 1}, {"number": 1}, {"number": 1}], dtype="object")}, + index=Index(["c1", "c2", "c3"], name="A"), + ) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index b99ef2a0e840d..bc3a165352a48 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -28,6 +28,7 @@ ) import pandas._testing as tm from pandas.core.arrays import BooleanArray +from pandas.core.arrays.string_arrow import ArrowStringArrayNumpySemantics import pandas.core.common as com pytestmark = pytest.mark.filterwarnings("ignore:Mean of empty slice:RuntimeWarning") @@ -2475,9 +2476,14 @@ def test_by_column_values_with_same_starting_value(dtype): "Mood": [["happy", "sad"], "happy"], "Credit": [2500, 900], "Name": ["Thomas", "Thomas John"], - } + }, ).set_index("Name") + if dtype == "string[pyarrow_numpy]": + import pyarrow as pa + mood_values = ArrowStringArrayNumpySemantics(pa.array(["happy", "sad"])) + expected_result["Mood"] = [mood_values, "happy"] + expected_result["Mood"] = expected_result["Mood"].astype(dtype) tm.assert_frame_equal(result, expected_result)