diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index 294cb723eee1a..1f766efd21855 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -31,7 +31,6 @@ class providing the base-class of operations.
 from pandas.core.dtypes.common import (
     ensure_float,
     is_datetime64_dtype,
-    is_datetime64tz_dtype,
     is_extension_array_dtype,
     is_integer_dtype,
     is_numeric_dtype,
@@ -45,7 +44,6 @@ class providing the base-class of operations.
 from pandas.core.arrays import Categorical, try_cast_to_ea
 from pandas.core.base import DataError, PandasObject, SelectionMixin
 import pandas.core.common as com
-from pandas.core.construction import extract_array
 from pandas.core.frame import DataFrame
 from pandas.core.generic import NDFrame
 from pandas.core.groupby import base
@@ -790,22 +788,11 @@ def _try_cast(self, result, obj, numeric_only: bool = False):
             dtype = obj.dtype
 
         if not is_scalar(result):
-            if is_datetime64tz_dtype(dtype):
-                # GH 23683
-                # Prior results _may_ have been generated in UTC.
-                # Ensure we localize to UTC first before converting
-                # to the target timezone
-                arr = extract_array(obj)
-                try:
-                    result = arr._from_sequence(result, dtype="datetime64[ns, UTC]")
-                    result = result.astype(dtype)
-                except TypeError:
-                    # _try_cast was called at a point where the result
-                    # was already tz-aware
-                    pass
-            elif is_extension_array_dtype(dtype):
+            if is_extension_array_dtype(dtype) and dtype.kind != "M":
                 # The function can return something of any type, so check
-                # if the type is compatible with the calling EA.
+                # if the type is compatible with the calling EA.
+                # datetime64tz is handled correctly in agg_series,
+                # so is excluded here.
 
                 # return the same type (Series) as our caller
                 cls = dtype.construct_array_type()
@@ -872,7 +859,9 @@ def _cython_agg_general(
             if numeric_only and not is_numeric:
                 continue
 
-            result, names = self.grouper.aggregate(obj.values, how, min_count=min_count)
+            result, names = self.grouper.aggregate(
+                obj._values, how, min_count=min_count
+            )
             output[name] = self._try_cast(result, obj)
 
         if len(output) == 0:
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 7ed79e4b00371..47ca2b2190ecf 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -604,11 +604,11 @@ def agg_series(self, obj: Series, func):
             # SeriesGrouper would raise if we were to call _aggregate_series_fast
             return self._aggregate_series_pure_python(obj, func)
 
-        elif is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M":
+        elif is_extension_array_dtype(obj.dtype):
             # _aggregate_series_fast would raise TypeError when
             #  calling libreduction.Slider
+            # In the datetime64tz case it would incorrectly cast to tz-naive
             # TODO: can we get a performant workaround for EAs backed by ndarray?
-            # TODO: is the datetime64tz case supposed to go through here?
            return self._aggregate_series_pure_python(obj, func)
 
         elif isinstance(obj.index, MultiIndex):
@@ -657,7 +657,15 @@ def _aggregate_series_pure_python(self, obj: Series, func):
             res = func(group)
             if result is None:
                 if isinstance(res, (Series, Index, np.ndarray)):
-                    raise ValueError("Function does not reduce")
+                    if len(res) == 1:
+                        # e.g. test_agg_lambda_with_timezone lambda e: e.head(1)
+                        # FIXME: are we potentially losing important res.index info?
+
+                        # TODO: use `.item()` if/when we un-deprecate it.
+                        # For non-Series we could just do `res[0]`
+                        res = next(iter(res))
+                    else:
+                        raise ValueError("Function does not reduce")
                 result = np.empty(ngroups, dtype="O")
 
             counts[label] = group.shape[0]
diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py
index 1c297f3e2ada3..721045f1097f8 100644
--- a/pandas/tests/groupby/aggregate/test_other.py
+++ b/pandas/tests/groupby/aggregate/test_other.py
@@ -454,6 +454,31 @@ def test_agg_over_numpy_arrays():
     tm.assert_frame_equal(result, expected)
 
 
+def test_agg_tzaware_non_datetime_result():
+    # discussed in GH#29589, fixed in GH#29641, operating on tzaware values
+    # with function that is not dtype-preserving
+    dti = pd.date_range("2012-01-01", periods=4, tz="UTC")
+    df = pd.DataFrame({"a": [0, 0, 1, 1], "b": dti})
+    gb = df.groupby("a")
+
+    # Case that _does_ preserve the dtype
+    result = gb["b"].agg(lambda x: x.iloc[0])
+    expected = pd.Series(dti[::2], name="b")
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+
+    # Cases that do _not_ preserve the dtype
+    result = gb["b"].agg(lambda x: x.iloc[0].year)
+    expected = pd.Series([2012, 2012], name="b")
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+
+    result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0])
+    expected = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b")
+    expected.index.name = "a"
+    tm.assert_series_equal(result, expected)
+
+
 def test_agg_timezone_round_trip():
     # GH 15426
     ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific")
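For reference, a minimal usage sketch (not part of the patch) of the behavior asserted by the new test_agg_tzaware_non_datetime_result: aggregating a tz-aware column with a dtype-preserving lambda should keep the datetime64[ns, UTC] dtype, while non-dtype-preserving lambdas should yield the dtype of whatever the lambda returns. Data, column, and group names below simply mirror that test.

    import pandas as pd

    # Two groups of tz-aware timestamps, one day apart within each group,
    # matching the setup in test_agg_tzaware_non_datetime_result.
    dti = pd.date_range("2012-01-01", periods=4, tz="UTC")
    df = pd.DataFrame({"a": [0, 0, 1, 1], "b": dti})
    gb = df.groupby("a")

    # dtype-preserving lambda: result keeps the datetime64[ns, UTC] dtype
    print(gb["b"].agg(lambda x: x.iloc[0]))

    # non-dtype-preserving lambdas: result takes the dtype the lambda returns
    print(gb["b"].agg(lambda x: x.iloc[0].year))          # int64: 2012, 2012
    print(gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0]))  # timedelta64[ns]: 1 day per group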