Skip to content

Commit 4c30610

Browse files
committed
Merge pull request #7099 from sinhrks/grouptz
BUG: GroupBy doesn't preserve timezone
2 parents cd422c2 + 02ddd9e commit 4c30610

File tree

8 files changed

+51
-36
lines changed

8 files changed

+51
-36
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -497,6 +497,7 @@ Bug Fixes
497497
- Bug in ``quantile`` with datetime values (:issue:`6965`)
498498
- Bug where ``DataFrame.set_index``, ``reindex`` and ``pivot`` don't preserve ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`3950`, :issue:`5878`, :issue:`6631`)
499499
- Bug where ``MultiIndex.get_level_values`` doesn't preserve ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`7092`)
500+
- Bug where ``GroupBy`` doesn't preserve ``tz`` (:issue:`3950`)
500501

501502
pandas 0.13.1
502503
-------------

pandas/core/algorithms.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -112,15 +112,17 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
112112
Returns
113113
-------
114114
labels : the indexer to the original array
115-
uniques : the unique values
115+
uniques : ndarray (1-d) or Index
116+
the unique values. An Index is returned when the passed values are an Index or a Series
116117
117118
note: an array of Periods will ignore sort, as it always returns a sorted PeriodIndex
118119
"""
119120
if order is not None:
120121
warn("order is deprecated."
121122
"See https://github.com/pydata/pandas/issues/6926", FutureWarning)
122123

123-
from pandas.tseries.period import PeriodIndex
124+
from pandas.core.index import Index
125+
from pandas.core.series import Series
124126
vals = np.asarray(values)
125127
is_datetime = com.is_datetime64_dtype(vals)
126128
(hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
@@ -159,9 +161,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
159161

160162
if is_datetime:
161163
uniques = uniques.astype('M8[ns]')
162-
if isinstance(values, PeriodIndex):
163-
uniques = PeriodIndex(ordinal=uniques, freq=values.freq)
164-
164+
if isinstance(values, Index):
165+
uniques = values._simple_new(uniques, None, freq=getattr(values, 'freq', None),
166+
tz=getattr(values, 'tz', None))
167+
elif isinstance(values, Series):
168+
uniques = Index(uniques)
165169
return labels, uniques
166170

167171

pandas/core/base.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -336,10 +336,7 @@ def factorize(self, sort=False, na_sentinel=-1):
336336
uniques : the unique Index
337337
"""
338338
from pandas.core.algorithms import factorize
339-
from pandas.core.index import Index
340-
labels, uniques = factorize(self, sort=sort, na_sentinel=na_sentinel)
341-
uniques = Index(uniques)
342-
return labels, uniques
339+
return factorize(self, sort=sort, na_sentinel=na_sentinel)
343340

344341
date = _field_accessor('date','Returns numpy array of datetime.date. The date part of the Timestamps')
345342
time = _field_accessor('time','Returns numpy array of datetime.time. The time part of the Timestamps')

pandas/core/categorical.py

+4-10
Original file line numberDiff line numberDiff line change
@@ -80,16 +80,10 @@ def __init__(self, labels, levels=None, name=None):
8080
if levels is None:
8181
if name is None:
8282
name = getattr(labels, 'name', None)
83-
if hasattr(labels, 'factorize'):
84-
try:
85-
labels, levels = labels.factorize(sort=True)
86-
except TypeError:
87-
labels, levels = labels.factorize(sort=False)
88-
else:
89-
try:
90-
labels, levels = factorize(labels, sort=True)
91-
except TypeError:
92-
labels, levels = factorize(labels, sort=False)
83+
try:
84+
labels, levels = factorize(labels, sort=True)
85+
except TypeError:
86+
labels, levels = factorize(labels, sort=False)
9387

9488
self.labels = labels
9589
self.levels = levels

pandas/core/groupby.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1875,9 +1875,9 @@ def _make_labels(self):
18751875
if self._was_factor: # pragma: no cover
18761876
raise Exception('Should not call this method grouping by level')
18771877
else:
1878-
labs, uniques = algos.factorize(self.grouper, sort=self.sort)
1878+
labels, uniques = algos.factorize(self.grouper, sort=self.sort)
18791879
uniques = Index(uniques, name=self.name)
1880-
self._labels = labs
1880+
self._labels = labels
18811881
self._group_index = uniques
18821882

18831883
_groups = None

pandas/tests/test_algos.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -116,11 +116,11 @@ def test_datelike(self):
116116
# periods are not 'sorted' as they are converted back into an index
117117
labels, uniques = algos.factorize(x)
118118
self.assert_numpy_array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64))
119-
self.assert_numpy_array_equal(uniques, np.array([v1, v2],dtype=object))
119+
self.assert_numpy_array_equal(uniques, pd.PeriodIndex([v1, v2]))
120120

121121
labels, uniques = algos.factorize(x,sort=True)
122122
self.assert_numpy_array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64))
123-
self.assert_numpy_array_equal(uniques, np.array([v1, v2],dtype=object))
123+
self.assert_numpy_array_equal(uniques, pd.PeriodIndex([v1, v2]))
124124

125125
class TestUnique(tm.TestCase):
126126
_multiprocess_can_split_ = True

pandas/tests/test_groupby.py

+32
Original file line numberDiff line numberDiff line change
@@ -2900,6 +2900,38 @@ def test_groupby_groups_datetimeindex(self):
29002900
groups = grouped.groups
29012901
tm.assert_isinstance(list(groups.keys())[0], datetime)
29022902

2903+
def test_groupby_groups_datetimeindex_tz(self):
2904+
# GH 3950
2905+
dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00',
2906+
'2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00']
2907+
df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
2908+
'datetime': dates, 'value1': range(6), 'value2': [1, 2] * 3})
2909+
df['datetime'] = df['datetime'].apply(lambda d: Timestamp(d, tz='US/Pacific'))
2910+
2911+
exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 07:00:00',
2912+
'2011-07-19 08:00:00', '2011-07-19 08:00:00',
2913+
'2011-07-19 09:00:00', '2011-07-19 09:00:00'],
2914+
tz='US/Pacific', name='datetime')
2915+
exp_idx2 = Index(['a', 'b'] * 3, name='label')
2916+
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
2917+
expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], 'value2': [1, 2, 2, 1, 1, 2]},
2918+
index=exp_idx, columns=['value1', 'value2'])
2919+
2920+
result = df.groupby(['datetime', 'label']).sum()
2921+
assert_frame_equal(result, expected)
2922+
2923+
# by level
2924+
didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo')
2925+
df = DataFrame({'value1': range(6), 'value2': [1, 2, 3, 1, 2, 3]}, index=didx)
2926+
2927+
exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00',
2928+
'2011-07-19 09:00:00'], tz='Asia/Tokyo')
2929+
expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
2930+
index=exp_idx, columns=['value1', 'value2'])
2931+
2932+
result = df.groupby(level=0).sum()
2933+
assert_frame_equal(result, expected)
2934+
29032935
def test_groupby_reindex_inside_function(self):
29042936
from pandas.tseries.api import DatetimeIndex
29052937

pandas/tseries/index.py

-13
Original file line numberDiff line numberDiff line change
@@ -806,19 +806,6 @@ def to_period(self, freq=None):
806806

807807
return PeriodIndex(self.values, freq=freq, tz=self.tz)
808808

809-
def factorize(self, sort=False, na_sentinel=-1):
810-
"""
811-
Index.factorize with handling for DatetimeIndex metadata
812-
813-
Returns
814-
-------
815-
result : DatetimeIndex
816-
"""
817-
from pandas.core.algorithms import factorize
818-
labels, uniques = factorize(self.asi8, sort=sort, na_sentinel=na_sentinel)
819-
uniques = DatetimeIndex._simple_new(uniques, name=self.name, freq=self.freq, tz=self.tz)
820-
return labels, uniques
821-
822809
def order(self, return_indexer=False, ascending=True):
823810
"""
824811
Return sorted copy of Index

0 commit comments

Comments
 (0)