From 02ddd9e2be1da0969bf95d6408ddf455f8dd27d2 Mon Sep 17 00:00:00 2001 From: sinhrks Date: Sun, 11 May 2014 19:28:27 +0900 Subject: [PATCH] BUG: GroupBy doesn't preserve timezone --- doc/source/release.rst | 1 + pandas/core/algorithms.py | 14 +++++++++----- pandas/core/base.py | 5 +---- pandas/core/categorical.py | 14 ++++---------- pandas/core/groupby.py | 4 ++-- pandas/tests/test_algos.py | 4 ++-- pandas/tests/test_groupby.py | 32 ++++++++++++++++++++++++++++++++ pandas/tseries/index.py | 13 ------------- 8 files changed, 51 insertions(+), 36 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 429e4ea8169d1..e9fd0f1d5fd48 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -497,6 +497,7 @@ Bug Fixes - Bug in ``quantile`` with datetime values (:issue:`6965`) - Bug in ``Dataframe.set_index``, ``reindex`` and ``pivot`` don't preserve ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`3950`, :issue:`5878`, :issue:`6631`) - Bug in ``MultiIndex.get_level_values`` doesn't preserve ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`7092`) +- Bug in ``Groupby`` doesn't preserve ``tz`` (:issue:`3950`) pandas 0.13.1 ------------- diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index f0ecce0235b49..002d5480b9b7b 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -112,7 +112,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): Returns ------- labels : the indexer to the original array - uniques : the unique values + uniques : ndarray (1-d) or Index + the unique values. An Index is returned when the passed values are an Index or Series note: an array of Periods will ignore sort as it returns an always sorted PeriodIndex """ @@ -120,7 +121,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): warn("order is deprecated." 
"See https://github.com/pydata/pandas/issues/6926", FutureWarning) - from pandas.tseries.period import PeriodIndex + from pandas.core.index import Index + from pandas.core.series import Series vals = np.asarray(values) is_datetime = com.is_datetime64_dtype(vals) (hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables) @@ -159,9 +161,11 @@ def factorize(values, sort=False, order=None, na_sentinel=-1): if is_datetime: uniques = uniques.astype('M8[ns]') - if isinstance(values, PeriodIndex): - uniques = PeriodIndex(ordinal=uniques, freq=values.freq) - + if isinstance(values, Index): + uniques = values._simple_new(uniques, None, freq=getattr(values, 'freq', None), + tz=getattr(values, 'tz', None)) + elif isinstance(values, Series): + uniques = Index(uniques) return labels, uniques diff --git a/pandas/core/base.py b/pandas/core/base.py index f614516c87d50..5605e1b0bb7ce 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -336,10 +336,7 @@ def factorize(self, sort=False, na_sentinel=-1): uniques : the unique Index """ from pandas.core.algorithms import factorize - from pandas.core.index import Index - labels, uniques = factorize(self, sort=sort, na_sentinel=na_sentinel) - uniques = Index(uniques) - return labels, uniques + return factorize(self, sort=sort, na_sentinel=na_sentinel) date = _field_accessor('date','Returns numpy array of datetime.date. The date part of the Timestamps') time = _field_accessor('time','Returns numpy array of datetime.time. 
The time part of the Timestamps') diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index ee6f8f1847258..dfadd34e2d205 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -80,16 +80,10 @@ def __init__(self, labels, levels=None, name=None): if levels is None: if name is None: name = getattr(labels, 'name', None) - if hasattr(labels, 'factorize'): - try: - labels, levels = labels.factorize(sort=True) - except TypeError: - labels, levels = labels.factorize(sort=False) - else: - try: - labels, levels = factorize(labels, sort=True) - except TypeError: - labels, levels = factorize(labels, sort=False) + try: + labels, levels = factorize(labels, sort=True) + except TypeError: + labels, levels = factorize(labels, sort=False) self.labels = labels self.levels = levels diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index bce3a993171a7..258005c8a08a9 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -1875,9 +1875,9 @@ def _make_labels(self): if self._was_factor: # pragma: no cover raise Exception('Should not call this method grouping by level') else: - labs, uniques = algos.factorize(self.grouper, sort=self.sort) + labels, uniques = algos.factorize(self.grouper, sort=self.sort) uniques = Index(uniques, name=self.name) - self._labels = labs + self._labels = labels self._group_index = uniques _groups = None diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 07bf247e5aafe..7081ba50e481f 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -116,11 +116,11 @@ def test_datelike(self): # periods are not 'sorted' as they are converted back into an index labels, uniques = algos.factorize(x) self.assert_numpy_array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64)) - self.assert_numpy_array_equal(uniques, np.array([v1, v2],dtype=object)) + self.assert_numpy_array_equal(uniques, pd.PeriodIndex([v1, v2])) labels, uniques = algos.factorize(x,sort=True) 
self.assert_numpy_array_equal(labels, np.array([ 0,0,0,1,1,0],dtype=np.int64)) - self.assert_numpy_array_equal(uniques, np.array([v1, v2],dtype=object)) + self.assert_numpy_array_equal(uniques, pd.PeriodIndex([v1, v2])) class TestUnique(tm.TestCase): _multiprocess_can_split_ = True diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py index 7a8fc8a3832db..fbeb947f4bbdc 100644 --- a/pandas/tests/test_groupby.py +++ b/pandas/tests/test_groupby.py @@ -2900,6 +2900,38 @@ def test_groupby_groups_datetimeindex(self): groups = grouped.groups tm.assert_isinstance(list(groups.keys())[0], datetime) + def test_groupby_groups_datetimeindex_tz(self): + # GH 3950 + dates = ['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00', + '2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00'] + df = DataFrame({'label': ['a', 'a', 'a', 'b', 'b', 'b'], + 'datetime': dates, 'value1': range(6), 'value2': [1, 2] * 3}) + df['datetime'] = df['datetime'].apply(lambda d: Timestamp(d, tz='US/Pacific')) + + exp_idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 07:00:00', + '2011-07-19 08:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00', '2011-07-19 09:00:00'], + tz='US/Pacific', name='datetime') + exp_idx2 = Index(['a', 'b'] * 3, name='label') + exp_idx = MultiIndex.from_arrays([exp_idx1, exp_idx2]) + expected = DataFrame({'value1': [0, 3, 1, 4, 2, 5], 'value2': [1, 2, 2, 1, 1, 2]}, + index=exp_idx, columns=['value1', 'value2']) + + result = df.groupby(['datetime', 'label']).sum() + assert_frame_equal(result, expected) + + # by level + didx = pd.DatetimeIndex(dates, tz='Asia/Tokyo') + df = DataFrame({'value1': range(6), 'value2': [1, 2, 3, 1, 2, 3]}, index=didx) + + exp_idx = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00'], tz='Asia/Tokyo') + expected = DataFrame({'value1': [3, 5, 7], 'value2': [2, 4, 6]}, + index=exp_idx, columns=['value1', 'value2']) + + result = df.groupby(level=0).sum() + 
assert_frame_equal(result, expected) + def test_groupby_reindex_inside_function(self): from pandas.tseries.api import DatetimeIndex diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index b318e18fd6481..d9018ad92eb17 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -806,19 +806,6 @@ def to_period(self, freq=None): return PeriodIndex(self.values, freq=freq, tz=self.tz) - def factorize(self, sort=False, na_sentinel=-1): - """ - Index.factorize with handling for DatetimeIndex metadata - - Returns - ------- - result : DatetimeIndex - """ - from pandas.core.algorithms import factorize - labels, uniques = factorize(self.asi8, sort=sort, na_sentinel=na_sentinel) - uniques = DatetimeIndex._simple_new(uniques, name=self.name, freq=self.freq, tz=self.tz) - return labels, uniques - def order(self, return_indexer=False, ascending=True): """ Return sorted copy of Index