Skip to content

BUG: GroupBy doesn't preserve timezone #7099

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 13, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -497,6 +497,7 @@ Bug Fixes
- Bug in ``quantile`` with datetime values (:issue:`6965`)
- Bug in ``DataFrame.set_index``, ``reindex`` and ``pivot`` not preserving ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`3950`, :issue:`5878`, :issue:`6631`)
- Bug in ``MultiIndex.get_level_values`` not preserving ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`7092`)
- Bug in ``GroupBy`` not preserving ``tz`` (:issue:`3950`)

pandas 0.13.1
-------------
Expand Down
14 changes: 9 additions & 5 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,15 +112,17 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
Returns
-------
labels : the indexer to the original array
uniques : the unique values
uniques : ndarray (1-d) or Index
the unique values. Index is returned when passed values is Index or Series
note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex
"""
if order is not None:
warn("order is deprecated."
"See https://github.com/pydata/pandas/issues/6926", FutureWarning)

from pandas.tseries.period import PeriodIndex
from pandas.core.index import Index
from pandas.core.series import Series
vals = np.asarray(values)
is_datetime = com.is_datetime64_dtype(vals)
(hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
Expand Down Expand Up @@ -159,9 +161,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):

if is_datetime:
uniques = uniques.astype('M8[ns]')
if isinstance(values, PeriodIndex):
uniques = PeriodIndex(ordinal=uniques, freq=values.freq)

if isinstance(values, Index):
uniques = values._simple_new(uniques, None, freq=getattr(values, 'freq', None),
tz=getattr(values, 'tz', None))
elif isinstance(values, Series):
uniques = Index(uniques)
return labels, uniques


Expand Down
5 changes: 1 addition & 4 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -336,10 +336,7 @@ def factorize(self, sort=False, na_sentinel=-1):
uniques : the unique Index
"""
from pandas.core.algorithms import factorize
from pandas.core.index import Index
labels, uniques = factorize(self, sort=sort, na_sentinel=na_sentinel)
uniques = Index(uniques)
return labels, uniques
return factorize(self, sort=sort, na_sentinel=na_sentinel)

date = _field_accessor('date','Returns numpy array of datetime.date. The date part of the Timestamps')
time = _field_accessor('time','Returns numpy array of datetime.time. The time part of the Timestamps')
Expand Down
14 changes: 4 additions & 10 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,10 @@ def __init__(self, labels, levels=None, name=None):
if levels is None:
if name is None:
name = getattr(labels, 'name', None)
if hasattr(labels, 'factorize'):
try:
labels, levels = labels.factorize(sort=True)
except TypeError:
labels, levels = labels.factorize(sort=False)
else:
try:
labels, levels = factorize(labels, sort=True)
except TypeError:
labels, levels = factorize(labels, sort=False)
try:
labels, levels = factorize(labels, sort=True)
except TypeError:
labels, levels = factorize(labels, sort=False)

self.labels = labels
self.levels = levels
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1875,9 +1875,9 @@ def _make_labels(self):
if self._was_factor: # pragma: no cover
raise Exception('Should not call this method grouping by level')
else:
labs, uniques = algos.factorize(self.grouper, sort=self.sort)
labels, uniques = algos.factorize(self.grouper, sort=self.sort)
uniques = Index(uniques, name=self.name)
self._labels = labs
self._labels = labels
self._group_index = uniques

_groups = None
Expand Down
4 changes: 2 additions & 2 deletions pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,11 +116,11 @@ def test_datelike(self):
# periods are not 'sorted' as they are converted back into an index
labels, uniques = algos.factorize(x)
self.assert_numpy_array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64))
self.assert_numpy_array_equal(uniques, np.array([v1, v2],dtype=object))
self.assert_numpy_array_equal(uniques, pd.PeriodIndex([v1, v2]))

labels, uniques = algos.factorize(x,sort=True)
self.assert_numpy_array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64))
self.assert_numpy_array_equal(uniques, np.array([v1, v2],dtype=object))
self.assert_numpy_array_equal(uniques, pd.PeriodIndex([v1, v2]))

class TestUnique(tm.TestCase):
_multiprocess_can_split_ = True
Expand Down
32 changes: 32 additions & 0 deletions pandas/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -2900,6 +2900,38 @@ def test_groupby_groups_datetimeindex(self):
groups = grouped.groups
tm.assert_isinstance(list(groups.keys())[0], datetime)

def test_groupby_groups_datetimeindex_tz(self):
# GH 3950
dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00',
'2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00']
df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'],
'datetime': dates, 'value1': range(6), 'value2': [1, 2] * 3})
df['datetime'] = df['datetime'].apply(lambda d: Timestamp(d, tz='US/Pacific'))

exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 07:00:00',
'2011-07-19 08:00:00', '2011-07-19 08:00:00',
'2011-07-19 09:00:00', '2011-07-19 09:00:00'],
tz='US/Pacific', name='datetime')
exp_idx2 = Index(['a', 'b'] * 3, name='label')
exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2])
expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], 'value2': [1, 2, 2, 1, 1, 2]},
index=exp_idx, columns=['value1', 'value2'])

result = df.groupby(['datetime', 'label']).sum()
assert_frame_equal(result, expected)

# by level
didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo')
df = DataFrame({'value1': range(6), 'value2': [1, 2, 3, 1, 2, 3]}, index=didx)

exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00',
'2011-07-19 09:00:00'], tz='Asia/Tokyo')
expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]},
index=exp_idx, columns=['value1', 'value2'])

result = df.groupby(level=0).sum()
assert_frame_equal(result, expected)

def test_groupby_reindex_inside_function(self):
from pandas.tseries.api import DatetimeIndex

Expand Down
13 changes: 0 additions & 13 deletions pandas/tseries/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -806,19 +806,6 @@ def to_period(self, freq=None):

return PeriodIndex(self.values, freq=freq, tz=self.tz)

def factorize(self, sort=False, na_sentinel=-1):
"""
Index.factorize with handling for DatetimeIndex metadata

Returns
-------
result : DatetimeIndex
"""
from pandas.core.algorithms import factorize
labels, uniques = factorize(self.asi8, sort=sort, na_sentinel=na_sentinel)
uniques = DatetimeIndex._simple_new(uniques, name=self.name, freq=self.freq, tz=self.tz)
return labels, uniques

def order(self, return_indexer=False, ascending=True):
"""
Return sorted copy of Index
Expand Down