From b716e672ca76adbf580b3458f6bb1811d97d70ea Mon Sep 17 00:00:00 2001 From: sinhrks Date: Tue, 6 May 2014 17:34:30 +0900 Subject: [PATCH] BUG: tz info lost by set_index and reindex --- doc/source/release.rst | 2 + pandas/core/frame.py | 7 ++- pandas/core/index.py | 19 ++++--- pandas/core/reshape.py | 9 ++-- pandas/tests/test_index.py | 13 +++++ pandas/tests/test_multilevel.py | 72 +++++++++++++++++++++++++ pandas/tools/tests/test_pivot.py | 42 +++++++++++++-- pandas/tseries/period.py | 7 +++ pandas/tseries/tests/test_period.py | 19 +++++++ pandas/tseries/tests/test_timeseries.py | 11 ++++ 10 files changed, 184 insertions(+), 17 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 463cf928660dd..708300bca725d 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -495,6 +495,8 @@ Bug Fixes - Bug in ``boxplot`` and ``hist`` draws unnecessary axes (:issue:`6769`) - Regression in ``groupby.nth()`` for out-of-bounds indexers (:issue:`6621`) - Bug in ``quantile`` with datetime values (:issue:`6965`) +- Bug in ``DataFrame.set_index``, ``reindex`` and ``pivot`` don't preserve ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`3950`, :issue:`5878`, :issue:`6631`) +- Bug in ``MultiIndex.get_level_values`` doesn't preserve ``DatetimeIndex`` and ``PeriodIndex`` attributes (:issue:`7092`) pandas 0.13.1 ------------- diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 66ba061ab35ef..773270ba1d593 100755 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2220,7 +2220,7 @@ def set_index(self, keys, drop=True, append=False, inplace=False, for i in range(self.index.nlevels): arrays.append(self.index.get_level_values(i)) else: - arrays.append(np.asarray(self.index)) + arrays.append(self.index) to_remove = [] for col in keys: @@ -2232,9 +2232,12 @@ def set_index(self, keys, drop=True, append=False, inplace=False, level = col.get_level_values(col.nlevels - 1) names.extend(col.names) - elif isinstance(col, (Series, 
Index)): + elif isinstance(col, Series): level = col.values names.append(col.name) + elif isinstance(col, Index): + level = col + names.append(col.name) elif isinstance(col, (list, np.ndarray)): level = col names.append(None) diff --git a/pandas/core/index.py b/pandas/core/index.py index c3619b992028d..7a2a160e0dc33 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -114,9 +114,7 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False, # no class inference! if fastpath: - subarr = data.view(cls) - subarr.name = name - return subarr + return cls._simple_new(data, name) from pandas.tseries.period import PeriodIndex if isinstance(data, (np.ndarray, ABCSeries)): @@ -185,6 +183,12 @@ def __new__(cls, data, dtype=None, copy=False, name=None, fastpath=False, subarr._set_names([name]) return subarr + @classmethod + def _simple_new(cls, values, name, **kwargs): + result = values.view(cls) + result.name = name + return result + def is_(self, other): """ More flexible, faster check like ``is`` but that works through views @@ -2588,11 +2592,12 @@ def get_level_values(self, level): values : ndarray """ num = self._get_level_number(level) - unique_vals = self.levels[num] # .values + unique = self.levels[num] # .values labels = self.labels[num] - values = Index(com.take_1d(unique_vals.values, labels, - fill_value=unique_vals._na_value)) - values.name = self.names[num] + filled = com.take_1d(unique.values, labels, fill_value=unique._na_value) + values = unique._simple_new(filled, self.names[num], + freq=getattr(unique, 'freq', None), + tz=getattr(unique, 'tz', None)) return values def format(self, space=2, sparsify=None, adjoin=True, names=False, diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py index 65eadff002eb6..a3a2e6849bce4 100644 --- a/pandas/core/reshape.py +++ b/pandas/core/reshape.py @@ -82,11 +82,10 @@ def __init__(self, values, index, level=-1, value_columns=None): labels = index.labels def _make_index(lev, lab): - if 
isinstance(lev, PeriodIndex): - i = lev.copy() - else: - i = lev.__class__(_make_index_array_level(lev.values, lab)) - i.name = lev.name + values = _make_index_array_level(lev.values, lab) + i = lev._simple_new(values, lev.name, + freq=getattr(lev, 'freq', None), + tz=getattr(lev, 'tz', None)) return i self.new_index_levels = [_make_index(lev, lab) diff --git a/pandas/tests/test_index.py b/pandas/tests/test_index.py index dafbfd07ca51d..f12d3c505741b 100644 --- a/pandas/tests/test_index.py +++ b/pandas/tests/test_index.py @@ -180,6 +180,19 @@ def test_index_ctor_infer_periodindex(self): assert_array_equal(rs, xp) tm.assert_isinstance(rs, PeriodIndex) + def test_constructor_simple_new(self): + idx = Index([1, 2, 3, 4, 5], name='int') + result = idx._simple_new(idx, 'int') + self.assert_(result.equals(idx)) + + idx = Index([1.1, np.nan, 2.2, 3.0], name='float') + result = idx._simple_new(idx, 'float') + self.assert_(result.equals(idx)) + + idx = Index(['A', 'B', 'C', np.nan], name='obj') + result = idx._simple_new(idx, 'obj') + self.assert_(result.equals(idx)) + def test_copy(self): i = Index([], name='Foo') i_copy = i.copy() diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 63bace138884f..8a75257a71eaa 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1989,6 +1989,78 @@ def test_datetimeindex(self): self.assert_(idx.levels[0].equals(expected1)) self.assert_(idx.levels[1].equals(idx2)) + def test_set_index_datetime(self): + # GH 3950 + df = pd.DataFrame({'label':['a', 'a', 'a', 'b', 'b', 'b'], + 'datetime':['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00', '2011-07-19 07:00:00', + '2011-07-19 08:00:00', '2011-07-19 09:00:00'], + 'value':range(6)}) + df.index = pd.to_datetime(df.pop('datetime'), utc=True) + df.index = df.index.tz_localize('UTC').tz_convert('US/Pacific') + + expected = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', '2011-07-19 09:00:00']) + 
expected = expected.tz_localize('UTC').tz_convert('US/Pacific') + + df = df.set_index('label', append=True) + self.assert_(df.index.levels[0].equals(expected)) + self.assert_(df.index.levels[1].equals(pd.Index(['a', 'b']))) + + df = df.swaplevel(0, 1) + self.assert_(df.index.levels[0].equals(pd.Index(['a', 'b']))) + self.assert_(df.index.levels[1].equals(expected)) + + + df = DataFrame(np.random.random(6)) + idx1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00', '2011-07-19 07:00:00', + '2011-07-19 08:00:00', '2011-07-19 09:00:00'], tz='US/Eastern') + idx2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-01 09:00', '2012-04-01 09:00', + '2012-04-02 09:00', '2012-04-02 09:00', '2012-04-02 09:00'], + tz='US/Eastern') + idx3 = pd.date_range('2011-01-01 09:00', periods=6, tz='Asia/Tokyo') + + df = df.set_index(idx1) + df = df.set_index(idx2, append=True) + df = df.set_index(idx3, append=True) + + expected1 = pd.DatetimeIndex(['2011-07-19 07:00:00', '2011-07-19 08:00:00', + '2011-07-19 09:00:00'], tz='US/Eastern') + expected2 = pd.DatetimeIndex(['2012-04-01 09:00', '2012-04-02 09:00'], tz='US/Eastern') + + self.assert_(df.index.levels[0].equals(expected1)) + self.assert_(df.index.levels[1].equals(expected2)) + self.assert_(df.index.levels[2].equals(idx3)) + + # GH 7092 + self.assert_(df.index.get_level_values(0).equals(idx1)) + self.assert_(df.index.get_level_values(1).equals(idx2)) + self.assert_(df.index.get_level_values(2).equals(idx3)) + + def test_set_index_period(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = pd.period_range('2011-01-01', periods=3, freq='M') + idx1 = idx1.append(idx1) + idx2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') + idx2 = idx2.append(idx2).append(idx2) + idx3 = pd.period_range('2005', periods=6, freq='Y') + + df = df.set_index(idx1) + df = df.set_index(idx2, append=True) + df = df.set_index(idx3, append=True) + + expected1 = pd.period_range('2011-01-01', periods=3, 
freq='M') + expected2 = pd.period_range('2013-01-01 09:00', periods=2, freq='H') + + self.assert_(df.index.levels[0].equals(expected1)) + self.assert_(df.index.levels[1].equals(expected2)) + self.assert_(df.index.levels[2].equals(idx3)) + + self.assert_(df.index.get_level_values(0).equals(idx1)) + self.assert_(df.index.get_level_values(1).equals(idx2)) + self.assert_(df.index.get_level_values(2).equals(idx3)) + if __name__ == '__main__': diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index ed1f63a42cbae..09fdb5e3fed3e 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -3,7 +3,7 @@ import numpy as np from numpy.testing import assert_equal -import pandas +import pandas as pd from pandas import DataFrame, Series, Index, MultiIndex, Grouper from pandas.tools.merge import concat from pandas.tools.pivot import pivot_table, crosstab @@ -181,6 +181,42 @@ def test_pivot_index_with_nan(self): columns = Index(['C1','C2','C3','C4'],name='b')) tm.assert_frame_equal(result, expected) + def test_pivot_with_tz(self): + # GH 5878 + df = DataFrame({'dt1': [datetime.datetime(2013, 1, 1, 9, 0), + datetime.datetime(2013, 1, 2, 9, 0), + datetime.datetime(2013, 1, 1, 9, 0), + datetime.datetime(2013, 1, 2, 9, 0)], + 'dt2': [datetime.datetime(2014, 1, 1, 9, 0), + datetime.datetime(2014, 1, 1, 9, 0), + datetime.datetime(2014, 1, 2, 9, 0), + datetime.datetime(2014, 1, 2, 9, 0)], + 'data1': range(4), 'data2': range(4)}) + + df['dt1'] = df['dt1'].apply(lambda d: pd.Timestamp(d, tz='US/Pacific')) + df['dt2'] = df['dt2'].apply(lambda d: pd.Timestamp(d, tz='Asia/Tokyo')) + + exp_col1 = Index(['data1', 'data1', 'data2', 'data2']) + exp_col2 = pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'] * 2, + name='dt2', tz='Asia/Tokyo') + exp_col = pd.MultiIndex.from_arrays([exp_col1, exp_col2]) + expected = DataFrame([[0, 2, 0, 2], [1, 3, 1, 3]], + index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'], + name='dt1', 
tz='US/Pacific'), + columns=exp_col) + + pv = df.pivot(index='dt1', columns='dt2') + tm.assert_frame_equal(pv, expected) + + expected = DataFrame([[0, 2], [1, 3]], + index=pd.DatetimeIndex(['2013/01/01 09:00', '2013/01/02 09:00'], + name='dt1', tz='US/Pacific'), + columns=pd.DatetimeIndex(['2014/01/01 09:00', '2014/01/02 09:00'], + name='dt2', tz='Asia/Tokyo')) + + pv = df.pivot(index='dt1', columns='dt2', values='data1') + tm.assert_frame_equal(pv, expected) + def test_margins(self): def _check_output(res, col, index=['A', 'B'], columns=['C']): cmarg = res['All'][:-1] @@ -235,7 +271,7 @@ def test_pivot_integer_columns(self): d = datetime.date.min data = list(product(['foo', 'bar'], ['A', 'B', 'C'], ['x1', 'x2'], [d + datetime.timedelta(i) for i in range(20)], [1.0])) - df = pandas.DataFrame(data) + df = DataFrame(data) table = df.pivot_table(values=4, index=[0, 1, 3], columns=[2]) df2 = df.rename(columns=str) @@ -286,7 +322,7 @@ def test_pivot_columns_lexsorted(self): iproduct = np.random.randint(0, len(products), n) items['Index'] = products['Index'][iproduct] items['Symbol'] = products['Symbol'][iproduct] - dr = pandas.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31)) + dr = pd.date_range(datetime.date(2000, 1, 1), datetime.date(2010, 12, 31)) dates = dr[np.random.randint(0, len(dr), n)] items['Year'] = dates.year items['Month'] = dates.month diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 01a93b712b42c..113be28f86976 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -669,6 +669,13 @@ def _from_arraylike(cls, data, freq, tz): return data, freq + @classmethod + def _simple_new(cls, values, name, freq=None, **kwargs): + result = values.view(cls) + result.name = name + result.freq = freq + return result + def __contains__(self, key): if not isinstance(key, Period) or key.freq != self.freq: if isinstance(key, compat.string_types): diff --git a/pandas/tseries/tests/test_period.py 
b/pandas/tseries/tests/test_period.py index 1a72c7925b6ee..4117ca660db35 100644 --- a/pandas/tseries/tests/test_period.py +++ b/pandas/tseries/tests/test_period.py @@ -1194,6 +1194,14 @@ def test_constructor_datetime64arr(self): self.assertRaises(ValueError, PeriodIndex, vals, freq='D') + def test_constructor_simple_new(self): + idx = period_range('2007-01', name='p', periods=20, freq='M') + result = idx._simple_new(idx, 'p', freq=idx.freq) + self.assert_(result.equals(idx)) + + result = idx._simple_new(idx.astype('i8'), 'p', freq=idx.freq) + self.assert_(result.equals(idx)) + def test_is_(self): create_index = lambda: PeriodIndex(freq='A', start='1/1/2001', end='12/1/2009') @@ -1390,6 +1398,17 @@ def test_frame_setitem(self): tm.assert_isinstance(rs.index, PeriodIndex) self.assert_(rs.index.equals(rng)) + def test_period_set_index_reindex(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = period_range('2011/01/01', periods=6, freq='M') + idx2 = period_range('2013', periods=6, freq='A') + + df = df.set_index(idx1) + self.assert_(df.index.equals(idx1)) + df = df.reindex(idx2) + self.assert_(df.index.equals(idx2)) + def test_nested_dict_frame_constructor(self): rng = period_range('1/1/2000', periods=5) df = DataFrame(randn(10, 5), columns=rng) diff --git a/pandas/tseries/tests/test_timeseries.py b/pandas/tseries/tests/test_timeseries.py index 0c0e7692b7d4c..79fd7cc6421e2 100644 --- a/pandas/tseries/tests/test_timeseries.py +++ b/pandas/tseries/tests/test_timeseries.py @@ -2510,6 +2510,17 @@ def test_dti_reset_index_round_trip(self): self.assertEquals(df.index[0], stamp) self.assertEquals(df.reset_index()['Date'][0], stamp) + def test_dti_set_index_reindex(self): + # GH 6631 + df = DataFrame(np.random.random(6)) + idx1 = date_range('2011/01/01', periods=6, freq='M', tz='US/Eastern') + idx2 = date_range('2013', periods=6, freq='A', tz='Asia/Tokyo') + + df = df.set_index(idx1) + self.assert_(df.index.equals(idx1)) + df = df.reindex(idx2) + 
self.assert_(df.index.equals(idx2)) + def test_datetimeindex_union_join_empty(self): dti = DatetimeIndex(start='1/1/2001', end='2/1/2001', freq='D') empty = Index([])