diff --git a/RELEASE.rst b/RELEASE.rst index 69cfd1eb99d7e..f3f4d7c895931 100644 --- a/RELEASE.rst +++ b/RELEASE.rst @@ -38,6 +38,12 @@ pandas 0.11.1 - Fixed various issues with internal pprinting code, the repr() for various objects including TimeStamp and *Index now produces valid python code strings and can be used to recreate the object, (GH3038_), (GH3379_), (GH3251_) + - ``HDFStore`` + + - will retain index attributes (freq,tz,name) on recreation (GH3499_) + - will warn with a FrequencyWarning if you are attempting to append + an index with a different frequency than the existing index + - supports datelike columns with a timezone as data_columns (GH2852_) **API Changes** @@ -87,6 +93,7 @@ pandas 0.11.1 .. _GH3251: https://github.com/pydata/pandas/issues/3251 .. _GH3379: https://github.com/pydata/pandas/issues/3379 .. _GH3480: https://github.com/pydata/pandas/issues/3480 +.. _GH2852: https://github.com/pydata/pandas/issues/2852 .. _GH3454: https://github.com/pydata/pandas/issues/3454 .. _GH3457: https://github.com/pydata/pandas/issues/3457 .. _GH3491: https://github.com/pydata/pandas/issues/3491 @@ -102,7 +109,7 @@ pandas 0.11.1 .. _GH3461: https://github.com/pydata/pandas/issues/3461 .. _GH3468: https://github.com/pydata/pandas/issues/3468 .. _GH3448: https://github.com/pydata/pandas/issues/3448 -.. _GH3449: https://github.com/pydata/pandas/issues/3449 +.. _GH3499: https://github.com/pydata/pandas/issues/3499 .. _GH3495: https://github.com/pydata/pandas/issues/3495 .. _GH3492: https://github.com/pydata/pandas/issues/3492 .. _GH3493: https://github.com/pydata/pandas/issues/3493 diff --git a/doc/source/v0.11.1.txt b/doc/source/v0.11.1.txt index 4be34cdbf84eb..2e3a67ead65e0 100644 --- a/doc/source/v0.11.1.txt +++ b/doc/source/v0.11.1.txt @@ -1,6 +1,6 @@ -.. _whatsnew_0120: +.. _whatsnew_0111: -v0.12.0 (??) +v0.11.1 (??) 
------------------------ This is a major release from 0.11.0 and includes many new features and @@ -12,13 +12,21 @@ API changes Enhancements ~~~~~~~~~~~~ - - pd.read_html() can now parse HTML string, files or urls and return dataframes + - ``pd.read_html()`` can now parse HTML strings, files or URLs and return DataFrames courtesy of @cpcloud. (GH3477_) + - ``HDFStore`` + + - will retain index attributes (freq,tz,name) on recreation (GH3499_) + - will warn with a FrequencyWarning if you are attempting to append + an index with a different frequency than the existing index + - supports datelike columns with a timezone as data_columns (GH2852_) See the `full release notes <https://github.com/pydata/pandas/blob/master/RELEASE.rst>`__ or issue tracker on GitHub for a complete list. .. _GH2437: https://github.com/pydata/pandas/issues/2437 +.. _GH2852: https://github.com/pydata/pandas/issues/2852 .. _GH3477: https://github.com/pydata/pandas/issues/3477 .. _GH3492: https://github.com/pydata/pandas/issues/3492 +.. _GH3499: https://github.com/pydata/pandas/issues/3499 diff --git a/doc/source/whatsnew.rst b/doc/source/whatsnew.rst index 81bd39dd0e70f..a02e41176ced1 100644 --- a/doc/source/whatsnew.rst +++ b/doc/source/whatsnew.rst @@ -18,6 +18,8 @@ These are new features and improvements of note in each release. .. include:: v0.12.0.txt +.. include:: v0.11.1.txt + .. include:: v0.11.0.txt .. 
include:: v0.10.1.txt diff --git a/pandas/core/index.py b/pandas/core/index.py index 101b69ffc3c7e..4a7981e57c622 100644 --- a/pandas/core/index.py +++ b/pandas/core/index.py @@ -83,12 +83,12 @@ class Index(np.ndarray): _engine_type = _index.ObjectEngine - def __new__(cls, data, dtype=None, copy=False, name=None): + def __new__(cls, data, dtype=None, copy=False, name=None, **kwargs): from pandas.tseries.period import PeriodIndex if isinstance(data, np.ndarray): if issubclass(data.dtype.type, np.datetime64): from pandas.tseries.index import DatetimeIndex - result = DatetimeIndex(data, copy=copy, name=name) + result = DatetimeIndex(data, copy=copy, name=name, **kwargs) if dtype is not None and _o_dtype == dtype: return Index(result.to_pydatetime(), dtype=_o_dtype) else: @@ -102,7 +102,7 @@ def __new__(cls, data, dtype=None, copy=False, name=None): except TypeError: pass elif isinstance(data, PeriodIndex): - return PeriodIndex(data, copy=copy, name=name) + return PeriodIndex(data, copy=copy, name=name, **kwargs) if issubclass(data.dtype.type, np.integer): return Int64Index(data, copy=copy, dtype=dtype, name=name) @@ -123,10 +123,10 @@ def __new__(cls, data, dtype=None, copy=False, name=None): if (inferred.startswith('datetime') or tslib.is_timestamp_array(subarr)): from pandas.tseries.index import DatetimeIndex - return DatetimeIndex(subarr, copy=copy, name=name) + return DatetimeIndex(subarr, copy=copy, name=name, **kwargs) elif inferred == 'period': - return PeriodIndex(subarr, name=name) + return PeriodIndex(subarr, name=name, **kwargs) subarr = subarr.view(cls) subarr.name = name diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 06ae9a7f7f11f..1661080b11799 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -42,6 +42,11 @@ class IncompatibilityWarning(Warning): pass where criteria is being ignored as this version [%s] is too old (or not-defined), read the file in and write it out to a new file to upgrade (with the copy_to method) """ 
+class FrequencyWarning(Warning): pass +frequency_doc = """ +the frequency of the existing index is [%s] which conflicts with the new freq [%s], +resetting the frequency to None +""" class PerformanceWarning(Warning): pass performance_doc = """ your performance may suffer as PyTables will pickle object types that it cannot map @@ -149,9 +154,12 @@ def get_store(path, mode='a', complevel=None, complib=None, ### interface to/from ### -def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, **kwargs): +def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs): """ store this object, close it if we opened it """ - f = lambda store: store.put(key, value, **kwargs) + if append: + f = lambda store: store.append(key, value, **kwargs) + else: + f = lambda store: store.put(key, value, **kwargs) if isinstance(path_or_buf, basestring): with get_store(path_or_buf, mode=mode, complevel=complevel, complib=complib) as store: @@ -941,9 +949,11 @@ class IndexCol(object): is_an_indexable = True is_data_indexable = True is_searchable = False + _info_fields = ['freq','tz','name'] def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None, - name=None, axis=None, kind_attr=None, pos=None, **kwargs): + name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None, + index_name=None, **kwargs): self.values = values self.kind = kind self.typ = typ @@ -953,6 +963,9 @@ def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None, self.kind_attr = kind_attr self.axis = axis self.pos = pos + self.freq = freq + self.tz = tz + self.index_name = None self.table = None if name is not None: @@ -1023,7 +1036,22 @@ def convert(self, values, nan_rep): values = values[self.cname] except: pass - self.values = Index(_maybe_convert(values, self.kind)) + + kwargs = dict() + if self.freq is not None: + kwargs['freq'] = self.freq + if self.tz is not None: + kwargs['tz'] = self.tz + if self.name is not 
None: + kwargs['name'] = self.index_name + try: + self.values = Index(_maybe_convert(values, self.kind), **kwargs) + except: + + # if the output freq is different that what we recorded, then infer it + if 'freq' in kwargs: + kwargs['freq'] = 'infer' + self.values = Index(_maybe_convert(values, self.kind), **kwargs) return self def take_data(self): @@ -1098,6 +1126,46 @@ def validate_attr(self, append): raise TypeError("incompatible kind in col [%s - %s]" % (existing_kind, self.kind)) + def update_info(self, info): + """ set/update the info for this indexable with the key/value + if validate is True, then raise if an existing value does not match the value """ + + for key in self._info_fields: + + value = getattr(self,key,None) + + try: + idx = info[self.name] + except: + idx = info[self.name] = dict() + + existing_value = idx.get(key) + if key in idx and existing_value != value: + + # frequency just warn + if key == 'freq': + ws = frequency_doc % (existing_value,value) + warnings.warn(ws, FrequencyWarning) + + # reset + idx[key] = None + + else: + raise ValueError("invalid info for [%s] for [%s]""" + ", existing_value [%s] conflicts with new value [%s]" % (self.name, + key,existing_value,value)) + else: + if value is not None or existing_value is not None: + idx[key] = value + + return self + + def set_info(self, info): + """ set my state from the passed info """ + idx = info.get(self.name) + if idx is not None: + self.__dict__.update(idx) + def get_attr(self): """ set the kind for this colummn """ self.kind = getattr(self.attrs, self.kind_attr, None) @@ -1137,6 +1205,7 @@ class DataCol(IndexCol): is_an_indexable = False is_data_indexable = False is_searchable = False + _info_fields = ['tz'] @classmethod def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs): @@ -1206,7 +1275,7 @@ def set_kind(self): if self.typ is None: self.typ = getattr(self.description,self.cname,None) - def set_atom(self, block, existing_col, min_itemsize, nan_rep, 
**kwargs): + def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, **kwargs): """ create and setup my atom from the block b """ self.values = list(block.items) @@ -1221,10 +1290,27 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, **kwargs): "[date] is not implemented as a table column") elif inferred_type == 'datetime': if getattr(rvalues[0],'tzinfo',None) is not None: + + # if this block has more than one timezone, raise + if len(set([r.tzinfo for r in rvalues])) != 1: + raise TypeError( + "too many timezones in this block, create separate data columns") + + # convert this column to datetime64[ns] utc, and save the tz + index = DatetimeIndex(rvalues) + tz = getattr(index,'tz',None) + if tz is None: + raise TypeError( + "invalid timezone specification") + + values = index.tz_convert('UTC').values.view('i8') + self.tz = tz + self.update_info(info) + self.set_atom_datetime64(block, values.reshape(block.values.shape)) + + else: raise TypeError( - "timezone support on datetimes is not yet implemented as a table column") - raise TypeError( - "[datetime] is not implemented as a table column") + "[datetime] is not implemented as a table column") elif inferred_type == 'unicode': raise TypeError( "[unicode] is not implemented as a table column") @@ -1304,10 +1390,12 @@ def set_atom_data(self, block): def get_atom_datetime64(self, block): return _tables().Int64Col(shape=block.shape[0]) - def set_atom_datetime64(self, block): + def set_atom_datetime64(self, block, values = None): self.kind = 'datetime64' self.typ = self.get_atom_datetime64(block) - self.set_data(block.values.view('i8'), 'datetime64') + if values is None: + values = block.values.view('i8') + self.set_data(values, 'datetime64') @property def shape(self): @@ -1346,7 +1434,18 @@ def convert(self, values, nan_rep): # reverse converts if self.dtype == 'datetime64': - self.data = np.asarray(self.data, dtype='M8[ns]') + # recreate the timezone + if self.tz is not None: + + # data 
should be 2-dim here + # we stored as utc, so just set the tz + + index = DatetimeIndex(self.data.ravel(),tz='UTC').tz_convert(self.tz) + self.data = np.array(index.tolist(),dtype=object).reshape(self.data.shape) + + else: + self.data = np.asarray(self.data, dtype='M8[ns]') + elif self.dtype == 'date': self.data = np.array( [date.fromtimestamp(v) for v in self.data], dtype=object) @@ -2060,6 +2159,7 @@ def __init__(self, *args, **kwargs): self.non_index_axes = [] self.values_axes = [] self.data_columns = [] + self.info = dict() self.nan_rep = None self.selection = None @@ -2173,18 +2273,20 @@ def values_cols(self): def set_attrs(self): """ set our table type & indexables """ - self.attrs.table_type = self.table_type - self.attrs.index_cols = self.index_cols() - self.attrs.values_cols = self.values_cols() + self.attrs.table_type = self.table_type + self.attrs.index_cols = self.index_cols() + self.attrs.values_cols = self.values_cols() self.attrs.non_index_axes = self.non_index_axes self.attrs.data_columns = self.data_columns - self.attrs.nan_rep = self.nan_rep - self.attrs.levels = self.levels + self.attrs.info = self.info + self.attrs.nan_rep = self.nan_rep + self.attrs.levels = self.levels def get_attrs(self): """ retrieve our attributes """ self.non_index_axes = getattr(self.attrs,'non_index_axes',None) or [] self.data_columns = getattr(self.attrs,'data_columns',None) or [] + self.info = getattr(self.attrs,'info',None) or dict() self.nan_rep = getattr(self.attrs,'nan_rep',None) self.levels = getattr(self.attrs,'levels',None) or [] t = self.table @@ -2222,7 +2324,7 @@ def indexables(self): self._indexables = [] # index columns - self._indexables.extend([IndexCol(name=name, axis=axis, pos=i) for i, (axis, name) in enumerate(self.attrs.index_cols)]) + self._indexables.extend([ IndexCol(name=name,axis=axis,pos=i) for i, (axis, name) in enumerate(self.attrs.index_cols)]) # values columns dc = set(self.data_columns) @@ -2315,6 +2417,7 @@ def read_axes(self, where, 
**kwargs): # convert the data for a in self.axes: + a.set_info(self.info) a.convert(values, nan_rep=self.nan_rep) return True @@ -2379,7 +2482,8 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, existing_table.infer_axes() axes = [ a.axis for a in existing_table.index_axes] data_columns = existing_table.data_columns - nan_rep = existing_table.nan_rep + nan_rep = existing_table.nan_rep + self.info = existing_table.info else: existing_table = None @@ -2421,7 +2525,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, self.non_index_axes.append((i, append_axis)) # set axis positions (based on the axes) - self.index_axes = [index_axes_map[a].set_pos(j) for j, + self.index_axes = [index_axes_map[a].set_pos(j).update_info(self.info) for j, a in enumerate(axes)] j = len(self.index_axes) @@ -2479,6 +2583,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None, existing_col=existing_col, min_itemsize=min_itemsize, nan_rep=nan_rep, + info=self.info, **kwargs) col.set_pos(j) @@ -2598,6 +2703,7 @@ def read_column(self, column, where = None, **kwargs): # column must be an indexable or a data column c = getattr(self.table.cols, column) + a.set_info(self.info) return Series(a.convert(c[:], nan_rep=self.nan_rep).take_data()) raise KeyError("column [%s] not found in the table" % column) @@ -3042,10 +3148,10 @@ class AppendableNDimTable(AppendablePanelTable): def _convert_index(index): if isinstance(index, DatetimeIndex): converted = index.asi8 - return IndexCol(converted, 'datetime64', _tables().Int64Col()) + return IndexCol(converted, 'datetime64', _tables().Int64Col(), freq=getattr(index,'freq',None), tz=getattr(index,'tz',None)) elif isinstance(index, (Int64Index, PeriodIndex)): atom = _tables().Int64Col() - return IndexCol(index.values, 'integer', atom) + return IndexCol(index.values, 'integer', atom, freq=getattr(index,'freq',None)) if isinstance(index, MultiIndex): raise 
Exception('MultiIndex not supported here!') @@ -3309,6 +3415,8 @@ def convert_value(self, v): if self.kind == 'datetime64' or self.kind == 'datetime' : v = lib.Timestamp(v) + if v.tz is not None: + v = v.tz_convert('UTC') return [v.value, v] elif isinstance(v, datetime) or hasattr(v, 'timetuple') or self.kind == 'date': v = time.mktime(v.timetuple()) diff --git a/pandas/io/tests/data/legacy_hdf/legacy_table_0.11.h5 b/pandas/io/tests/data/legacy_hdf/legacy_table_0.11.h5 new file mode 100644 index 0000000000000..958effc2ce6f8 Binary files /dev/null and b/pandas/io/tests/data/legacy_hdf/legacy_table_0.11.h5 differ diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index d7f497648236a..3daa08a0d591a 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -7,9 +7,12 @@ import datetime import numpy as np +import pandas from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range, date_range, Index) -from pandas.io.pytables import HDFStore, get_store, Term, IncompatibilityWarning, PerformanceWarning +from pandas.io.pytables import (HDFStore, get_store, Term, + IncompatibilityWarning, PerformanceWarning, + FrequencyWarning) import pandas.util.testing as tm from pandas.tests.test_series import assert_series_equal from pandas.tests.test_frame import assert_frame_equal @@ -1259,16 +1262,48 @@ def test_unimplemented_dtypes_table_columns(self): self.assertRaises(TypeError, store.append, 'df_unimplemented', df) def test_table_append_with_timezones(self): - # not implemented yet with ensure_clean(self.path) as store: - # check with mixed dtypes - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern')),index=range(5)) - - # timezones not yet supported + def compare(a,b): + tm.assert_frame_equal(a,b) + + # compare the zones on each element + for c in a.columns: + for i in a.index: + a_e = a[c][i] + b_e = b[c][i] + if not (a_e == b_e and a_e.tz == b_e.tz): + raise AssertionError("invalid tz comparsion [%s] 
[%s]" % (a_e,b_e)) + + from datetime import timedelta + + _maybe_remove(store, 'df_tz') + df = DataFrame(dict(A = [ Timestamp('20130102 2:00:00',tz='US/Eastern') + timedelta(hours=1)*i for i in range(5) ])) + store.append('df_tz',df,data_columns=['A']) + compare(store['df_tz'],df) + + # select with tz aware + compare(store.select('df_tz',where=Term('A','>=',df.A[3])),df[df.A>=df.A[3]]) + + _maybe_remove(store, 'df_tz') + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130103',tz='US/Eastern')),index=range(5)) + store.append('df_tz',df) + compare(store['df_tz'],df) + + _maybe_remove(store, 'df_tz') + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=range(5)) self.assertRaises(TypeError, store.append, 'df_tz', df) + # this is ok + _maybe_remove(store, 'df_tz') + store.append('df_tz',df,data_columns=['A','B']) + compare(store['df_tz'],df) + + # can't append with diff timezone + df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=range(5)) + self.assertRaises(ValueError, store.append, 'df_tz', df) + def test_remove(self): with ensure_clean(self.path) as store: @@ -2041,6 +2076,51 @@ def test_select_iterator(self): result = concat(results) tm.assert_frame_equal(expected, result) + def test_retain_index_attributes(self): + + # GH 3499, losing frequency info on index recreation + df = DataFrame(dict(A = Series(xrange(3), + index=date_range('2000-1-1',periods=3,freq='H')))) + + with ensure_clean(self.path) as store: + _maybe_remove(store,'data') + store.put('data', df, table=True) + + result = store.get('data') + tm.assert_frame_equal(df,result) + + for attr in ['freq','tz']: + for idx in ['index','columns']: + self.assert_(getattr(getattr(df,idx),attr,None) == getattr(getattr(result,idx),attr,None)) + + + # try to append a table with a different frequency + warnings.filterwarnings('ignore', category=FrequencyWarning) + df2 = 
DataFrame(dict(A = Series(xrange(3), + index=date_range('2002-1-1',periods=3,freq='D')))) + store.append('data',df2) + warnings.filterwarnings('always', category=FrequencyWarning) + + self.assert_(store.get_storer('data').info['index']['freq'] is None) + + # this is ok + _maybe_remove(store,'df2') + df2 = DataFrame(dict(A = Series(xrange(3), + index=[Timestamp('20010101'),Timestamp('20010102'),Timestamp('20020101')]))) + store.append('df2',df2) + df3 = DataFrame(dict(A = Series(xrange(3),index=date_range('2002-1-1',periods=3,freq='D')))) + store.append('df2',df3) + + def test_retain_index_attributes2(self): + + with tm.ensure_clean(self.path) as path: + warnings.filterwarnings('ignore', category=FrequencyWarning) + df = DataFrame(dict(A = Series(xrange(3), index=date_range('2000-1-1',periods=3,freq='H')))) + df.to_hdf(path,'data',mode='w',append=True) + df2 = DataFrame(dict(A = Series(xrange(3), index=date_range('2002-1-1',periods=3,freq='D')))) + df2.to_hdf(path,'data',append=True) + warnings.filterwarnings('always', category=FrequencyWarning) + def test_panel_select(self): wp = tm.makePanel() @@ -2437,6 +2517,16 @@ def test_legacy_0_10_read(self): finally: safe_close(store) + def test_legacy_0_11_read(self): + # legacy from 0.11 + try: + store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_0.11.h5'), 'r') + df = store.select('df') + df1 = store.select('df1') + mi = store.select('mi') + finally: + safe_close(store) + def test_copy(self): def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): @@ -2497,14 +2587,22 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): def test_legacy_table_write(self): raise nose.SkipTest - # legacy table types + store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_%s.h5' % pandas.__version__), 'a') + df = tm.makeDataFrame() wp = tm.makePanel() - store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table.h5'), 'a') + index = MultiIndex(levels=[['foo', 'bar', 'baz', 
'qux'], + ['one', 'two', 'three']], + labels=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], + [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]], + names=['foo', 'bar']) + df = DataFrame(np.random.randn(10, 3), index=index, + columns=['A', 'B', 'C']) + store.append('mi', df) - self.assertRaises(Exception, store.append, 'df1', df) - self.assertRaises(Exception, store.append, 'wp1', wp) + df = DataFrame(dict(A = 'foo', B = 'bar'),index=range(10)) + store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 }) store.close()