Skip to content

Commit 169a56a

Browse files
authored
BUG: Groupby.agg with reduction function with tz aware data (pandas-dev#25308)
* BUG: Groupby.agg cannot reduce with tz aware data
* Handle output always as UTC
* Add whatsnew
* isort and add another fixed groupby.first/last issue
* bring condition at a higher level
* Add try for _try_cast
* Add comments
* Don't pass the utc_dtype explicitly
* Remove unused import
* Use string dtype instead
1 parent e9de5f3 commit 169a56a

File tree

5 files changed

+57
-5
lines changed

5 files changed

+57
-5
lines changed

doc/source/whatsnew/v0.25.0.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -229,8 +229,8 @@ Groupby/Resample/Rolling
229229

230230
- Bug in :meth:`pandas.core.resample.Resampler.agg` with a timezone aware index where ``OverflowError`` would raise when passing a list of functions (:issue:`22660`)
231231
- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.nunique` in which the names of column levels were lost (:issue:`23222`)
232-
-
233-
-
232+
- Bug in :func:`pandas.core.groupby.GroupBy.agg` when applying an aggregation function to timezone aware data (:issue:`23683`)
233+
- Bug in :func:`pandas.core.groupby.GroupBy.first` and :func:`pandas.core.groupby.GroupBy.last` where timezone information would be dropped (:issue:`21603`)
234234

235235

236236
Reshaping

pandas/_libs/reduction.pyx

+3-1
Original file line numberDiff line numberDiff line change
@@ -342,7 +342,9 @@ cdef class SeriesGrouper:
342342
index = None
343343
else:
344344
values = dummy.values
345-
if dummy.dtype != self.arr.dtype:
345+
# GH 23683: datetimetz types are equivalent to datetime types here
346+
if (dummy.dtype != self.arr.dtype
347+
and values.dtype != self.arr.dtype):
346348
raise ValueError('Dummy array must be same dtype')
347349
if not values.flags.contiguous:
348350
values = values.copy()

pandas/core/groupby/groupby.py

+17-2
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@ class providing the base-class of operations.
2626

2727
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
2828
from pandas.core.dtypes.common import (
29-
ensure_float, is_extension_array_dtype, is_numeric_dtype, is_scalar)
29+
ensure_float, is_datetime64tz_dtype, is_extension_array_dtype,
30+
is_numeric_dtype, is_scalar)
3031
from pandas.core.dtypes.missing import isna, notna
3132

3233
from pandas.api.types import (
@@ -766,7 +767,21 @@ def _try_cast(self, result, obj, numeric_only=False):
766767
dtype = obj.dtype
767768

768769
if not is_scalar(result):
769-
if is_extension_array_dtype(dtype):
770+
if is_datetime64tz_dtype(dtype):
771+
# GH 23683
772+
# Prior results _may_ have been generated in UTC.
773+
# Ensure we localize to UTC first before converting
774+
# to the target timezone
775+
try:
776+
result = obj._values._from_sequence(
777+
result, dtype='datetime64[ns, UTC]'
778+
)
779+
result = result.astype(dtype)
780+
except TypeError:
781+
# _try_cast was called at a point where the result
782+
# was already tz-aware
783+
pass
784+
elif is_extension_array_dtype(dtype):
770785
# The function can return something of any type, so check
771786
# if the type is compatible with the calling EA.
772787
try:

pandas/tests/groupby/aggregate/test_other.py

+15
Original file line numberDiff line numberDiff line change
@@ -512,3 +512,18 @@ def test_agg_list_like_func():
512512
expected = pd.DataFrame({'A': [str(x) for x in range(3)],
513513
'B': [[str(x)] for x in range(3)]})
514514
tm.assert_frame_equal(result, expected)
515+
516+
517+
def test_agg_lambda_with_timezone():
518+
# GH 23683
519+
df = pd.DataFrame({
520+
'tag': [1, 1],
521+
'date': [
522+
pd.Timestamp('2018-01-01', tz='UTC'),
523+
pd.Timestamp('2018-01-02', tz='UTC')]
524+
})
525+
result = df.groupby('tag').agg({'date': lambda e: e.head(1)})
526+
expected = pd.DataFrame([pd.Timestamp('2018-01-01', tz='UTC')],
527+
index=pd.Index([1], name='tag'),
528+
columns=['date'])
529+
tm.assert_frame_equal(result, expected)

pandas/tests/groupby/test_nth.py

+20
Original file line numberDiff line numberDiff line change
@@ -278,6 +278,26 @@ def test_first_last_tz(data, expected_first, expected_last):
278278
assert_frame_equal(result, expected[['id', 'time']])
279279

280280

281+
@pytest.mark.parametrize('method, ts, alpha', [
282+
['first', Timestamp('2013-01-01', tz='US/Eastern'), 'a'],
283+
['last', Timestamp('2013-01-02', tz='US/Eastern'), 'b']
284+
])
285+
def test_first_last_tz_multi_column(method, ts, alpha):
286+
# GH 21603
287+
df = pd.DataFrame({'group': [1, 1, 2],
288+
'category_string': pd.Series(list('abc')).astype(
289+
'category'),
290+
'datetimetz': pd.date_range('20130101', periods=3,
291+
tz='US/Eastern')})
292+
result = getattr(df.groupby('group'), method)()
293+
expepcted = pd.DataFrame({'category_string': [alpha, 'c'],
294+
'datetimetz': [ts,
295+
Timestamp('2013-01-03',
296+
tz='US/Eastern')]},
297+
index=pd.Index([1, 2], name='group'))
298+
assert_frame_equal(result, expepcted)
299+
300+
281301
def test_nth_multi_index_as_expected():
282302
# PR 9090, related to issue 8979
283303
# test nth on MultiIndex

0 commit comments

Comments
 (0)