diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst
index bad06329c4bfa..433a8e30b1fe1 100644
--- a/doc/source/whatsnew/v3.0.0.rst
+++ b/doc/source/whatsnew/v3.0.0.rst
@@ -772,6 +772,7 @@ Groupby/resample/rolling
 - Bug in :meth:`.DataFrameGroupBy.quantile` when ``interpolation="nearest"`` is inconsistent with :meth:`DataFrame.quantile` (:issue:`47942`)
 - Bug in :meth:`.Resampler.interpolate` on a :class:`DataFrame` with non-uniform sampling and/or indices not aligning with the resulting resampled index would result in wrong interpolation (:issue:`21351`)
 - Bug in :meth:`DataFrame.ewm` and :meth:`Series.ewm` when passed ``times`` and aggregation functions other than mean (:issue:`51695`)
+- Bug in :meth:`DataFrameGroupBy.agg` and :meth:`SeriesGroupBy.agg` that was returning NumPy dtype values when the input values have a pyarrow dtype, instead of preserving the pyarrow dtype (:issue:`53030`)
 - Bug in :meth:`DataFrameGroupBy.agg` that raises ``AttributeError`` when there is dictionary input and duplicated columns, instead of returning a DataFrame with the aggregation of all duplicate columns. (:issue:`55041`)
 - Bug in :meth:`DataFrameGroupBy.apply` and :meth:`SeriesGroupBy.apply` for empty data frame with ``group_keys=False`` still creating output index using group keys. (:issue:`60471`)
 - Bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
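For context, a minimal sketch of the user-visible behavior the whatsnew entry
above describes (illustrative only, assuming a pandas build with pyarrow
installed):

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "b"],
            "val": pd.array([1, 2, 3], dtype="int64[pyarrow]"),
        }
    )

    # Before this fix, a lambda aggregation came back with a numpy dtype;
    # with the fix the pyarrow dtype is preserved.
    result = df.groupby("key")["val"].agg(lambda x: x.min())
    print(result.dtype)  # int64[pyarrow]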
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index c4c7f73ee166c..d680e26d7386e 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -50,6 +50,7 @@
 )
 
 from pandas.core.arrays import Categorical
+from pandas.core.arrays.arrow.array import ArrowExtensionArray
 from pandas.core.frame import DataFrame
 from pandas.core.groupby import grouper
 from pandas.core.indexes.api import (
@@ -954,18 +955,31 @@ def agg_series(
         -------
         np.ndarray or ExtensionArray
         """
+        result = self._aggregate_series_pure_python(obj, func)
+        npvalues = lib.maybe_convert_objects(result, try_float=False)
+
+        if isinstance(obj._values, ArrowExtensionArray):
+            from pandas.core.dtypes.common import is_string_dtype
 
-        if not isinstance(obj._values, np.ndarray):
+            # When obj.dtype is a string, any object can be cast. Only do so if the
+            # UDF returned strings or NA values.
+            if not is_string_dtype(obj.dtype) or is_string_dtype(
+                npvalues[~isna(npvalues)]
+            ):
+                out = maybe_cast_pointwise_result(
+                    npvalues, obj.dtype, numeric_only=True, same_dtype=preserve_dtype
+                )
+            else:
+                out = npvalues
+
+        elif not isinstance(obj._values, np.ndarray):
             # we can preserve a little bit more aggressively with EA dtype
             # because maybe_cast_pointwise_result will do a try/except
             # with _from_sequence. NB we are assuming here that _from_sequence
             # is sufficiently strict that it casts appropriately.
-            preserve_dtype = True
-
-        result = self._aggregate_series_pure_python(obj, func)
-
-        npvalues = lib.maybe_convert_objects(result, try_float=False)
-        if preserve_dtype:
+            out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
+
+        elif preserve_dtype:
             out = maybe_cast_pointwise_result(npvalues, obj.dtype, numeric_only=True)
         else:
             out = npvalues
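The string-dtype guard above is the subtle part of the change. A rough
standalone sketch of the same check, using only the public pandas API (the
helper name is illustrative, not part of pandas):

    import numpy as np
    import pandas as pd
    from pandas.api.types import is_string_dtype

    def may_cast_back_to_input_dtype(npvalues: np.ndarray, input_dtype) -> bool:
        # Non-string pyarrow dtypes: let maybe_cast_pointwise_result attempt
        # the cast back to the input dtype.
        if not is_string_dtype(input_dtype):
            return True
        # String input: anything can be stringified, so only cast back when
        # the UDF actually returned strings or NA values. For an object-dtype
        # ndarray, is_string_dtype checks that all values are inferred as
        # strings.
        return is_string_dtype(npvalues[~pd.isna(npvalues)])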
diff --git a/pandas/tests/groupby/aggregate/test_aggregate.py b/pandas/tests/groupby/aggregate/test_aggregate.py
index b7e6e55739c17..ec1755cb98c25 100644
--- a/pandas/tests/groupby/aggregate/test_aggregate.py
+++ b/pandas/tests/groupby/aggregate/test_aggregate.py
@@ -10,6 +10,7 @@
 import pytest
 
 from pandas.errors import SpecificationError
+import pandas.util._test_decorators as td
 
 from pandas.core.dtypes.common import is_integer_dtype
 
@@ -23,6 +24,7 @@
     to_datetime,
 )
 import pandas._testing as tm
+from pandas.arrays import ArrowExtensionArray
 from pandas.core.groupby.grouper import Grouping
 
@@ -1807,3 +1809,99 @@ def test_groupby_aggregation_func_list_multi_index_duplicate_columns():
         index=Index(["level1.1", "level1.2"]),
     )
     tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+@pytest.mark.parametrize(
+    "input_dtype, output_dtype",
+    [
+        # With NumPy arrays, the results from the UDF would be e.g. np.float32 scalars
+        # which we can therefore preserve. However with PyArrow arrays, the results are
+        # Python scalars so we have no information about size or uint vs int.
+        ("float[pyarrow]", "double[pyarrow]"),
+        ("int64[pyarrow]", "int64[pyarrow]"),
+        ("uint64[pyarrow]", "int64[pyarrow]"),
+        ("bool[pyarrow]", "bool[pyarrow]"),
+    ],
+)
+def test_agg_lambda_pyarrow_dtype_conversion(input_dtype, output_dtype):
+    # GH#59601
+    # Test PyArrow dtype conversion back to PyArrow dtype
+    df = DataFrame(
+        {
+            "A": ["c1", "c2", "c3", "c1", "c2", "c3"],
+            "B": pd.array([100, 200, 255, 0, 199, 40392], dtype=input_dtype),
+        }
+    )
+    gb = df.groupby("A")
+    result = gb.agg(lambda x: x.min())
+
+    expected = DataFrame(
+        {"B": pd.array([0, 199, 255], dtype=output_dtype)},
+        index=Index(["c1", "c2", "c3"], name="A"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_agg_lambda_complex128_dtype_conversion():
+    # GH#59601
+    df = DataFrame(
+        {"A": ["c1", "c2", "c3"], "B": pd.array([100, 200, 255], "int64[pyarrow]")}
+    )
+    gb = df.groupby("A")
+    result = gb.agg(lambda x: complex(x.sum(), x.count()))
+
+    expected = DataFrame(
+        {
+            "B": pd.array(
+                [complex(100, 1), complex(200, 1), complex(255, 1)], dtype="complex128"
+            ),
+        },
+        index=Index(["c1", "c2", "c3"], name="A"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_agg_lambda_numpy_uint64_to_pyarrow_dtype_conversion():
+    # GH#59601
+    df = DataFrame(
+        {
+            "A": ["c1", "c2", "c3"],
+            "B": pd.array([100, 200, 255], dtype="uint64[pyarrow]"),
+        }
+    )
+    gb = df.groupby("A")
+    result = gb.agg(lambda x: np.uint64(x.sum()))
+
+    expected = DataFrame(
+        {
+            "B": pd.array([100, 200, 255], dtype="uint64[pyarrow]"),
+        },
+        index=Index(["c1", "c2", "c3"], name="A"),
+    )
+    tm.assert_frame_equal(result, expected)
+
+
+@td.skip_if_no("pyarrow")
+def test_agg_lambda_pyarrow_struct_to_object_dtype_conversion():
+    # GH#59601
+    import pyarrow as pa
+
+    df = DataFrame(
+        {
+            "A": ["c1", "c2", "c3"],
+            "B": pd.array([100, 200, 255], dtype="int64[pyarrow]"),
+        }
+    )
+    gb = df.groupby("A")
+    result = gb.agg(lambda x: {"number": 1})
+
+    arr = pa.array([{"number": 1}, {"number": 1}, {"number": 1}])
+    expected = DataFrame(
+        {"B": ArrowExtensionArray(arr)},
+        index=Index(["c1", "c2", "c3"], name="A"),
+    )
+
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 4b1f23c1f755e..4955b1fe0da54 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -2434,25 +2434,28 @@ def test_rolling_wrong_param_min_period():
 
 def test_by_column_values_with_same_starting_value(any_string_dtype):
     # GH29635
+    dtype = any_string_dtype
     df = DataFrame(
         {
             "Name": ["Thomas", "Thomas", "Thomas John"],
             "Credit": [1200, 1300, 900],
-            "Mood": Series(["sad", "happy", "happy"], dtype=any_string_dtype),
+            "Mood": Series(["sad", "happy", "happy"], dtype=dtype),
         }
     )
     aggregate_details = {"Mood": Series.mode, "Credit": "sum"}
     result = df.groupby(["Name"]).agg(aggregate_details)
-    expected_result = DataFrame(
+    expected = DataFrame(
         {
             "Mood": [["happy", "sad"], "happy"],
             "Credit": [2500, 900],
             "Name": ["Thomas", "Thomas John"],
-        }
+        },
     ).set_index("Name")
-
-    tm.assert_frame_equal(result, expected_result)
+    if getattr(dtype, "storage", None) == "pyarrow":
+        mood_values = pd.array(["happy", "sad"], dtype=dtype)
+        expected["Mood"] = [mood_values, "happy"]
+    tm.assert_frame_equal(result, expected)
 
 
 def test_groupby_none_in_first_mi_level():
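A short illustration of why the expected frame in the test above needs a
pyarrow branch (illustrative only, assuming pyarrow is installed): with a
pyarrow-backed string dtype, Series.mode keeps that dtype, so after the fix
the multi-mode "Mood" cell holds a pyarrow-backed array rather than a plain
Python list.

    import pandas as pd

    s = pd.Series(["sad", "happy"], dtype="string[pyarrow]")
    modes = s.mode()  # both values tie, so both are returned
    print(modes.dtype)  # string[pyarrow]
    # The grouped cell therefore compares equal to this array, not to the
    # plain list ["happy", "sad"].
    print(pd.array(["happy", "sad"], dtype="string[pyarrow]"))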