From 46943e0c43b836e559644a3e95e2ffde9fedb78c Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 11 Feb 2019 23:05:12 -0800 Subject: [PATCH 01/10] BUG: Groupby.agg cannot reduce with tz aware data --- pandas/_libs/reduction.pyx | 3 ++- pandas/tests/groupby/aggregate/test_other.py | 15 +++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 507567cf480d7..28a67fb79ae2b 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -342,7 +342,8 @@ cdef class SeriesGrouper: index = None else: values = dummy.values - if dummy.dtype != self.arr.dtype: + if (dummy.dtype != self.arr.dtype + and values.dtype != self.arr.dtype): raise ValueError('Dummy array must be same dtype') if not values.flags.contiguous: values = values.copy() diff --git a/pandas/tests/groupby/aggregate/test_other.py b/pandas/tests/groupby/aggregate/test_other.py index b5214b11bddcc..cacfdb7694de1 100644 --- a/pandas/tests/groupby/aggregate/test_other.py +++ b/pandas/tests/groupby/aggregate/test_other.py @@ -512,3 +512,18 @@ def test_agg_list_like_func(): expected = pd.DataFrame({'A': [str(x) for x in range(3)], 'B': [[str(x)] for x in range(3)]}) tm.assert_frame_equal(result, expected) + + +def test_agg_lambda_with_timezone(): + # GH 23683 + df = pd.DataFrame({ + 'tag': [1, 1], + 'date': [ + pd.Timestamp('2018-01-01', tz='UTC'), + pd.Timestamp('2018-01-02', tz='UTC')] + }) + result = df.groupby('tag').agg({'date': lambda e: e.head(1)}) + expected = pd.DataFrame([pd.Timestamp('2018-01-01', tz='UTC')], + index=pd.Index([1], name='tag'), + columns=['date']) + tm.assert_frame_equal(result, expected) From 724a69e9a2ac35fe0301d8b0127e79534dd99a48 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 12 Feb 2019 16:32:13 -0800 Subject: [PATCH 02/10] Handle output always as UTC --- pandas/_libs/reduction.pyx | 1 + pandas/core/groupby/groupby.py | 14 +++++++++++++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 28a67fb79ae2b..517d59c399179 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -342,6 +342,7 @@ cdef class SeriesGrouper: index = None else: values = dummy.values + # GH 23683: datetimetz types are equivalent to datetime types here if (dummy.dtype != self.arr.dtype and values.dtype != self.arr.dtype): raise ValueError('Dummy array must be same dtype') diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c63bc5164e25b..ea911b75784b3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -26,7 +26,9 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar) + ensure_float, is_extension_array_dtype, is_datetime64tz_dtype, + is_numeric_dtype, is_scalar) +from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algorithms @@ -768,7 +770,17 @@ def _try_cast(self, result, obj, numeric_only=False): # The function can return something of any type, so check # if the type is compatible with the calling EA. try: + tz_dtype = None + if is_datetime64tz_dtype(dtype): + # GH 23683 + # Prior results were generated in UTC. Ensure + # We localize to UTC first before converting to + # the target timezone + tz_dtype = dtype + dtype = DatetimeTZDtype(tz='UTC') result = obj._values._from_sequence(result, dtype=dtype) + if tz_dtype is not None: + result = result.astype(tz_dtype) except Exception: # https://github.com/pandas-dev/pandas/issues/22850 # pandas has no control over what 3rd-party ExtensionArrays From 0d1eb5541888ef0ce37f8eeb8f31500adfab234e Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 13 Feb 2019 11:54:35 -0800 Subject: [PATCH 03/10] Add whatsnew --- doc/source/whatsnew/v0.25.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 95362521f3b9f..86e4a47d3b3a9 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -172,7 +172,7 @@ Plotting Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ -- +- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) - - From b4913dc92fa9c095b304a2a5298b4d3769dd0fda Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Wed, 13 Feb 2019 13:05:03 -0800 Subject: [PATCH 04/10] isort and add another fixed groupby.first/last issue --- doc/source/whatsnew/v0.25.0.rst | 2 +- pandas/core/groupby/groupby.py | 2 +- pandas/tests/groupby/test_nth.py | 20 ++++++++++++++++++++ 3 files changed, 22 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 86e4a47d3b3a9..7d0b683a631ea 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -173,7 +173,7 @@ Groupby/Resample/Rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying a aggregation function to timezone aware data (:issue:`23683`) -- +- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`) - diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ea911b75784b3..77edd155125f2 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -26,7 +26,7 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import maybe_downcast_to_dtype from pandas.core.dtypes.common import ( - ensure_float, is_extension_array_dtype, is_datetime64tz_dtype, + ensure_float, is_datetime64tz_dtype, is_extension_array_dtype, is_numeric_dtype, is_scalar) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna, notna diff --git a/pandas/tests/groupby/test_nth.py b/pandas/tests/groupby/test_nth.py index 255d9a8acf2d0..7a3d189d3020e 100644 --- a/pandas/tests/groupby/test_nth.py +++ b/pandas/tests/groupby/test_nth.py @@ -278,6 +278,26 @@ def test_first_last_tz(data, expected_first, expected_last): assert_frame_equal(result, expected[['id', 'time']]) +@pytest.mark.parametrize('method, ts, alpha', [ + ['first', Timestamp('2013-01-01', tz='US/Eastern'), 'a'], + ['last', Timestamp('2013-01-02', tz='US/Eastern'), 'b'] +]) +def test_first_last_tz_multi_column(method, ts, alpha): + # GH 21603 + df = pd.DataFrame({'group': [1, 1, 2], + 'category_string': pd.Series(list('abc')).astype( + 'category'), + 'datetimetz': pd.date_range('20130101', periods=3, + tz='US/Eastern')}) + result = getattr(df.groupby('group'), method)() + expepcted = pd.DataFrame({'category_string': [alpha, 'c'], + 'datetimetz': [ts, + Timestamp('2013-01-03', + tz='US/Eastern')]}, + index=pd.Index([1, 2], name='group')) + assert_frame_equal(result, expepcted) + + def test_nth_multi_index_as_expected(): # PR 9090, related to issue 8979 # test nth on MultiIndex From da26b7b9ed85e03e369c48639c1bac73845bd766 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sat, 16 Feb 2019 15:36:18 -0800 Subject: [PATCH 05/10] bring condition at a higher level --- pandas/core/groupby/groupby.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 77edd155125f2..e8852e2f04b2c 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -766,21 +766,18 @@ def _try_cast(self, result, obj, numeric_only=False): dtype = obj.dtype if not is_scalar(result): - if is_extension_array_dtype(dtype): + if is_datetime64tz_dtype(dtype): + # GH 23683 + # Prior results were generated in UTC. Ensure we localize to + # UTC first before converting to the target timezone + utc_dtype = DatetimeTZDtype(tz='UTC') + result = obj._values._from_sequence(result, dtype=utc_dtype) + result = result.astype(dtype) + elif is_extension_array_dtype(dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA. try: - tz_dtype = None - if is_datetime64tz_dtype(dtype): - # GH 23683 - # Prior results were generated in UTC. Ensure - # We localize to UTC first before converting to - # the target timezone - tz_dtype = dtype - dtype = DatetimeTZDtype(tz='UTC') result = obj._values._from_sequence(result, dtype=dtype) - if tz_dtype is not None: - result = result.astype(tz_dtype) except Exception: # https://github.com/pandas-dev/pandas/issues/22850 # pandas has no control over what 3rd-party ExtensionArrays From 60adcf066b392eab91bb5af5c3ae14e0d6b46d9d Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 17 Feb 2019 23:17:26 -0800 Subject: [PATCH 06/10] Add try for _try_cast --- pandas/core/groupby/groupby.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e8852e2f04b2c..c2afc7485f072 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -768,11 +768,16 @@ def _try_cast(self, result, obj, numeric_only=False): if not is_scalar(result): if is_datetime64tz_dtype(dtype): # GH 23683 - # Prior results were generated in UTC. Ensure we localize to - # UTC first before converting to the target timezone - utc_dtype = DatetimeTZDtype(tz='UTC') - result = obj._values._from_sequence(result, dtype=utc_dtype) - result = result.astype(dtype) + # Prior results _may_ have been generated in UTC. + # Ensure we localize to UTC first before converting + # to the target timezone + try: + utc_dtype = DatetimeTZDtype(tz='UTC') + result = obj._values._from_sequence(result, + dtype=utc_dtype) + result = result.astype(dtype) + except TypeError: + pass elif is_extension_array_dtype(dtype): # The function can return something of any type, so check # if the type is compatible with the calling EA. From 206303ab07d8ffb4ee53db4892ed9a957481e967 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 19 Feb 2019 15:25:03 -0800 Subject: [PATCH 07/10] Add comments --- pandas/core/groupby/groupby.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index c2afc7485f072..ca5c3f3711fdb 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -777,6 +777,8 @@ def _try_cast(self, result, obj, numeric_only=False): dtype=utc_dtype) result = result.astype(dtype) except TypeError: + # _try_cast was called at a point where the result + # was already tz-aware pass elif is_extension_array_dtype(dtype): # The function can return something of any type, so check From 0c0b43a0417ff2629e6eb3c0d2fcad16f275b48a Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 24 Feb 2019 11:39:05 -0800 Subject: [PATCH 08/10] Don't pass the utc_dtype explicitly --- pandas/core/groupby/groupby.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index ca5c3f3711fdb..dc8b8d520d2d3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -772,10 +772,8 @@ def _try_cast(self, result, obj, numeric_only=False): # Ensure we localize to UTC first before converting # to the target timezone try: - utc_dtype = DatetimeTZDtype(tz='UTC') - result = obj._values._from_sequence(result, - dtype=utc_dtype) - result = result.astype(dtype) + result = obj._values._from_sequence(result) + result = result.tz_localize('UTC').tz_convert(dtype.tz) except TypeError: # _try_cast was called at a point where the result # was already tz-aware From 5bf07a99d74cdcb7aa994ce8adaf95c71ac4da63 Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Sun, 24 Feb 2019 13:52:30 -0800 Subject: [PATCH 09/10] Remove unused import --- pandas/core/groupby/groupby.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index dc8b8d520d2d3..cb2cf7b9e4e21 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -28,7 +28,6 @@ class providing the base-class of operations. from pandas.core.dtypes.common import ( ensure_float, is_datetime64tz_dtype, is_extension_array_dtype, is_numeric_dtype, is_scalar) -from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.missing import isna, notna import pandas.core.algorithms as algorithms From 95a48d67ad76993ad93ab7ba6236d2577a9e3a1c Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Mon, 25 Feb 2019 21:57:46 -0800 Subject: [PATCH 10/10] Use string dtype instead --- pandas/core/groupby/groupby.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index cb2cf7b9e4e21..057f0484a86c4 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -771,8 +771,10 @@ def _try_cast(self, result, obj, numeric_only=False): # Ensure we localize to UTC first before converting # to the target timezone try: - result = obj._values._from_sequence(result) - result = result.tz_localize('UTC').tz_convert(dtype.tz) + result = obj._values._from_sequence( + result, dtype='datetime64[ns, UTC]' + ) + result = result.astype(dtype) except TypeError: # _try_cast was called at a point where the result # was already tz-aware