From daaddad7f8eab1394ef28259b5412d23fae88578 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 15 Nov 2019 09:50:08 -0800 Subject: [PATCH 01/14] REF: make _aggregate_series_pure_python extraction behave like the cython version --- pandas/core/groupby/groupby.py | 17 ++--------------- pandas/core/groupby/ops.py | 12 +++++++++--- 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 280f1e88b0ea8..74e9f50ac4d61 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -789,20 +789,7 @@ def _try_cast(self, result, obj, numeric_only: bool = False): dtype = obj.dtype if not is_scalar(result): - if is_datetime64tz_dtype(dtype): - # GH 23683 - # Prior results _may_ have been generated in UTC. - # Ensure we localize to UTC first before converting - # to the target timezone - arr = extract_array(obj) - try: - result = arr._from_sequence(result, dtype="datetime64[ns, UTC]") - result = result.astype(dtype) - except TypeError: - # _try_cast was called at a point where the result - # was already tz-aware - pass - elif is_extension_array_dtype(dtype): + if is_extension_array_dtype(dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA. @@ -869,7 +856,7 @@ def _cython_agg_general( if numeric_only and not is_numeric: continue - result, names = self.grouper.aggregate(obj.values, how, min_count=min_count) + result, names = self.grouper.aggregate(obj._values, how, min_count=min_count) output[name] = self._try_cast(result, obj) if len(output) == 0: diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 7ed79e4b00371..3e8befcfe9a0a 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -604,11 +604,11 @@ def agg_series(self, obj: Series, func): # SeriesGrouper would raise if we were to call _aggregate_series_fast return self._aggregate_series_pure_python(obj, func) - elif is_extension_array_dtype(obj.dtype) and obj.dtype.kind != "M": + elif is_extension_array_dtype(obj.dtype): # _aggregate_series_fast would raise TypeError when # calling libreduction.Slider + # In the datetime64tz case it would incorrectly cast to tz-naive # TODO: can we get a performant workaround for EAs backed by ndarray? - # TODO: is the datetime64tz case supposed to go through here? return self._aggregate_series_pure_python(obj, func) elif isinstance(obj.index, MultiIndex): @@ -657,7 +657,13 @@ def _aggregate_series_pure_python(self, obj: Series, func): res = func(group) if result is None: if isinstance(res, (Series, Index, np.ndarray)): - raise ValueError("Function does not reduce") + if len(res) == 1: + # e.g. test_agg_lambda_with_timezone lambda e: e.head(1) + # FIXME: are we potentially losing import res.index info? + res = getattr(res, "_values", res) + res = res[0] + else: + raise ValueError("Function does not reduce") result = np.empty(ngroups, dtype="O") counts[label] = group.shape[0] From 18bb6c979196230442868a797ee273aff0c9668a Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 15 Nov 2019 10:05:30 -0800 Subject: [PATCH 02/14] blackify --- pandas/core/groupby/groupby.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 74e9f50ac4d61..d2d8c699eda62 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -856,7 +856,9 @@ def _cython_agg_general( if numeric_only and not is_numeric: continue - result, names = self.grouper.aggregate(obj._values, how, min_count=min_count) + result, names = self.grouper.aggregate( + obj._values, how, min_count=min_count + ) output[name] = self._try_cast(result, obj) if len(output) == 0: From 201bc27fc58a73bac54882f16efc648d56fc1ebc Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Fri, 15 Nov 2019 15:32:49 -0800 Subject: [PATCH 03/14] add tests --- pandas/core/groupby/groupby.py | 8 ++++---- pandas/tests/groupby/aggregate/test_other.py | 20 ++++++++++++++++++++ 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index d2d8c699eda62..39d094494bad2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -31,7 +31,6 @@ class providing the base-class of operations. from pandas.core.dtypes.common import ( ensure_float, is_datetime64_dtype, - is_datetime64tz_dtype, is_extension_array_dtype, is_integer_dtype, is_numeric_dtype, @@ -45,7 +44,6 @@ class providing the base-class of operations. from pandas.core.arrays import Categorical, try_cast_to_ea from pandas.core.base import DataError, PandasObject, SelectionMixin import pandas.core.common as com -from pandas.core.construction import extract_array from pandas.core.frame import DataFrame from pandas.core.generic import NDFrame from pandas.core.groupby import base @@ -789,9 +787,11 @@ def _try_cast(self, result, obj, numeric_only: bool = False): dtype = obj.dtype if not is_scalar(result): - if is_extension_array_dtype(dtype): + if is_extension_array_dtype(dtype) and dtype.kind != "M": # The function can return something of any type, so check - # if the type is compatible with the calling EA. + # if the type is compatible with the calling EA. + # datetime64tz is handled correctly in agg_series, + # so is excluded here. # return the same type (Series) as our caller cls = dtype.construct_array_type() diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 1c297f3e2ada3..296e0e61d68f9 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -454,6 +454,26 @@ def test_agg_over_numpy_arrays(): tm.assert_frame_equal(result, expected) +def test_agg_tzaware_non_datetime_result(): + # discussed in GH#29589, fixed in GH#29641, operating on tzaware values + # with function that is not dtype-preserving + dti = pd.date_range("2012-01-01", periods=4, tz="UTC") + df = pd.DataFrame({"a": [0, 0, 1, 1], "b": dti}) + gb = df.groupby("a") + + # Case that _does_ preserve the dtype + result = gb["b"].agg(lambda x: x.iloc[0]) + expected = pd.Series(dti[::2], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + # Case that does _not_ preserve the dtype + result = gb["b"].agg(lambda x: x.iloc[0].year) + expected = pd.Series([2012, 2012], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + + def test_agg_timezone_round_trip(): # GH 15426 ts = pd.Timestamp("2016-01-01 12:00:00", tz="US/Pacific") From 61f32e5096dec1727733cbc2deab2cf1423530b7 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 16 Nov 2019 07:57:01 -0800 Subject: [PATCH 04/14] added test --- pandas/tests/groupby/aggregate/test_other.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index 296e0e61d68f9..721045f1097f8 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -467,12 +467,17 @@ def test_agg_tzaware_non_datetime_result(): expected.index.name = "a" tm.assert_series_equal(result, expected) - # Case that does _not_ preserve the dtype + # Cases that do _not_ preserve the dtype result = gb["b"].agg(lambda x: x.iloc[0].year) expected = pd.Series([2012, 2012], name="b") expected.index.name = "a" tm.assert_series_equal(result, expected) + result = gb["b"].agg(lambda x: x.iloc[-1] - x.iloc[0]) + expected = pd.Series([pd.Timedelta(days=1), pd.Timedelta(days=1)], name="b") + expected.index.name = "a" + tm.assert_series_equal(result, expected) + def test_agg_timezone_round_trip(): # GH 15426 From fb9f1714a27f65820bce013b470780467c9a6d6d Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 16 Nov 2019 14:32:44 -0800 Subject: [PATCH 05/14] avoid _values getattr pattern --- pandas/core/groupby/ops.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 3e8befcfe9a0a..4b52dd752deba 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -660,8 +660,9 @@ def _aggregate_series_pure_python(self, obj: Series, func): if len(res) == 1: # e.g. test_agg_lambda_with_timezone lambda e: e.head(1) # FIXME: are we potentially losing import res.index info? - res = getattr(res, "_values", res) - res = res[0] + + # For non-Series we could just do `res[0]` + res = next(iter(res)) else: raise ValueError("Function does not reduce") result = np.empty(ngroups, dtype="O") From d542515e3c1d77d1cf37c3008224c83406bb8edb Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 17 Nov 2019 08:02:27 -0800 Subject: [PATCH 06/14] todo comment --- pandas/core/groupby/ops.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 4b52dd752deba..47ca2b2190ecf 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -661,6 +661,7 @@ def _aggregate_series_pure_python(self, obj: Series, func): # e.g. test_agg_lambda_with_timezone lambda e: e.head(1) # FIXME: are we potentially losing import res.index info? + # TODO: use `.item()` if/when we un-deprecate it. # For non-Series we could just do `res[0]` res = next(iter(res)) else: From 71b21c75f9c161198c0cce58aa869fac7b845908 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 16 Nov 2019 19:32:05 -0800 Subject: [PATCH 07/14] simplify _transform_fast --- pandas/core/groupby/generic.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 6376dbefcf435..58a75557c50d8 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -406,7 +406,7 @@ def transform(self, func, *args, **kwargs): # result to the whole group. Compute func result # and deal with possible broadcasting below. return self._transform_fast( - lambda: getattr(self, func)(*args, **kwargs), func + lambda: getattr(self, func)(*args, **kwargs) ) # reg transform @@ -443,19 +443,13 @@ def transform(self, func, *args, **kwargs): result.index = self._selected_obj.index return result - def _transform_fast(self, func, func_nm) -> Series: + def _transform_fast(self, func) -> Series: """ fast version of transform, only applicable to builtin/cythonizable functions """ - if isinstance(func, str): - func = getattr(self, func) - ids, _, ngroup = self.grouper.group_info - cast = self._transform_should_cast(func_nm) out = algorithms.take_1d(func()._values, ids) - if cast: - out = self._try_cast(out, self.obj) return Series(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): @@ -1381,6 +1375,8 @@ def _transform_fast(self, result: DataFrame, obj: DataFrame, func_nm) -> DataFra output = [] for i, _ in enumerate(result.columns): res = algorithms.take_1d(result.iloc[:, i].values, ids) + # TODO: we have no test cases that get here with EA dtypes; + # try_cast is not needed if EAs never get here if cast: res = self._try_cast(res, obj.iloc[:, i]) output.append(res) From 640710a1adcc730b74844b9ac4a4bb6bf99226c9 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sat, 16 Nov 2019 19:59:20 -0800 Subject: [PATCH 08/14] remove _try_casts --- pandas/core/groupby/generic.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 58a75557c50d8..76ba48b23b2f4 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1051,12 +1051,12 @@ def _aggregate_frame(self, func, *args, **kwargs) -> DataFrame: if axis != obj._info_axis_number: for name, data in self: fres = func(data, *args, **kwargs) - result[name] = self._try_cast(fres, data) + result[name] = fres else: for name in self.indices: data = self.get_group(name, obj=obj) fres = func(data, *args, **kwargs) - result[name] = self._try_cast(fres, data) + result[name] = fres return self._wrap_frame_output(result, obj) From a0676fbb23f85917498048b35312f406e28ba321 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 17 Nov 2019 08:43:48 -0800 Subject: [PATCH 09/14] comments --- pandas/core/groupby/generic.py | 5 +++-- pandas/core/groupby/grouper.py | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 76ba48b23b2f4..e448a9900efdc 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -406,7 +406,7 @@ def transform(self, func, *args, **kwargs): # result to the whole group. Compute func result # and deal with possible broadcasting below. return self._transform_fast( - lambda: getattr(self, func)(*args, **kwargs) + lambda: getattr(self, func)(*args, **kwargs), func ) # reg transform @@ -443,7 +443,7 @@ def transform(self, func, *args, **kwargs): result.index = self._selected_obj.index return result - def _transform_fast(self, func) -> Series: + def _transform_fast(self, func, func_nm) -> Series: """ fast version of transform, only applicable to builtin/cythonizable functions @@ -1109,6 +1109,7 @@ def _decide_output_index(self, output, labels): return output_keys + # TODO: this function is way too big def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return DataFrame(index=keys) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index eb1442aeb8a4c..b739fb706be03 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -245,6 +245,7 @@ class Grouping: * group_index : unique groups * groups : dict of {group -> label_list} """ + # TODO: too much going on here def __init__( self, @@ -431,6 +432,7 @@ def groups(self) -> dict: return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) +# TODO: this function is way too big def get_grouper( obj: FrameOrSeries, key=None, From ab125ec4cbbcf3417d649e543ebad66ce8aeaf3f Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Sun, 17 Nov 2019 17:38:20 -0800 Subject: [PATCH 10/14] Whatsnew --- doc/source/whatsnew/v1.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index cb68bd0e762c4..ee954d95df69b 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -438,6 +438,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) +- Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrect casting results to the original dtype (:issue:`29641`) Reshaping ^^^^^^^^^ From 96df6d936609692aaeca2873d4e87b1e41fdf142 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 18 Nov 2019 09:23:23 -0800 Subject: [PATCH 11/14] remove comments --- pandas/core/groupby/generic.py | 1 - pandas/core/groupby/grouper.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index e448a9900efdc..dc6be99c07447 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1109,7 +1109,6 @@ def _decide_output_index(self, output, labels): return output_keys - # TODO: this function is way too big def _wrap_applied_output(self, keys, values, not_indexed_same=False): if len(keys) == 0: return DataFrame(index=keys) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 8d8dc08cce9e0..c37617b1f1f7f 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -245,7 +245,6 @@ class Grouping: * group_index : unique groups * groups : dict of {group -> label_list} """ - # TODO: too much going on here def __init__( self, @@ -432,7 +431,6 @@ def groups(self) -> dict: return self.index.groupby(Categorical.from_codes(self.codes, self.group_index)) -# TODO: this function is way too big def get_grouper( obj: FrameOrSeries, key=None, From dceda2f29ebb8b3c62fa5a3dbc0e4893d40bd627 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 18 Nov 2019 12:07:40 -0800 Subject: [PATCH 12/14] revert parts in other PRs --- pandas/core/groupby/generic.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 1712f75b459bb..8b3346ee860e1 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -448,8 +448,14 @@ def _transform_fast(self, func, func_nm) -> Series: fast version of transform, only applicable to builtin/cythonizable functions """ + if isinstance(func, str): + func = getattr(self, func) + ids, _, ngroup = self.grouper.group_info + cast = self._transform_should_cast(func_nm) out = algorithms.take_1d(func()._values, ids) + if cast: + out = self._try_cast(out, self.obj) return Series(out, index=self.obj.index, name=self.obj.name) def filter(self, func, dropna=True, *args, **kwargs): @@ -1369,7 +1375,7 @@ def _transform_fast(self, result: DataFrame, obj: DataFrame, func_nm) -> DataFra for i, _ in enumerate(result.columns): res = algorithms.take_1d(result.iloc[:, i].values, ids) # TODO: we have no test cases that get here with EA dtypes; - # try_cast is not needed if EAs never get here + # try_cast may not be needed if EAs never get here if cast: res = self._try_cast(res, obj.iloc[:, i]) output.append(res) From 1965e22c3770ecc367d1d5817f6e84dc18402686 Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Mon, 18 Nov 2019 15:05:01 -0800 Subject: [PATCH 13/14] comment --- pandas/core/apply.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index 071cd116ea982..34a8ed1fa7a83 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -226,6 +226,8 @@ def apply_raw(self): if "Function does not reduce" not in str(err): # catch only ValueError raised intentionally in libreduction raise + # We expect np.apply_along_axis to give a two-dimensional result, or + # also raise. result = np.apply_along_axis(self.f, self.axis, self.values) # TODO: mixed type case From 0cccfa51d44e5919d7f2b54299d001354cfc069c Mon Sep 17 00:00:00 2001 From: jbrockmendel Date: Wed, 20 Nov 2019 17:14:41 -0800 Subject: [PATCH 14/14] remove note --- doc/source/whatsnew/v1.0.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 900e5a5fc3d70..db24be628dd67 100644 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -510,7 +510,6 @@ Groupby/resample/rolling - Bug in :meth:`DataFrameGroupby.agg` not able to use lambda function with named aggregation (:issue:`27519`) - Bug in :meth:`DataFrame.groupby` losing column name information when grouping by a categorical column (:issue:`28787`) - Bug in :meth:`DataFrameGroupBy.rolling().quantile()` ignoring ``interpolation`` keyword argument (:issue:`28779`) -- Bug in :meth:`DataFrameGroupBy.agg` with timezone-aware datetime64 column incorrect casting results to the original dtype (:issue:`29641`) - Bug in :meth:`DataFrame.groupby` where ``any``, ``all``, ``nunique`` and transform functions would incorrectly handle duplicate column labels (:issue:`21668`) -