Skip to content

Commit 9120b05

Browse files
committed
ENH: support timezone data_columns in HDFStore (GH2852)
DOC: update release notes/whatsnew, added whatsnew 0.11.1 to index.rst; ENH: warn with a FrequencyWarning if appending with a different frequency than the existing index
1 parent d461d41 commit 9120b05

File tree

5 files changed

+167
-36
lines changed

5 files changed

+167
-36
lines changed

RELEASE.rst

+8-2
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,12 @@ pandas 0.11.1
3838
- Fixed various issues with internal pprinting code, the repr() for various objects
3939
including TimeStamp and *Index now produces valid python code strings and
4040
can be used to recreate the object, (GH3038_), (GH3379_), (GH3251_)
41-
- ``HDFStore`` will retain index attributes (freq,tz,name) on recreation (GH3499_)
41+
- ``HDFStore``
42+
43+
- will retain index attributes (freq,tz,name) on recreation (GH3499_)
44+
- will warn with a FrequencyWarning if you are attempting to append
45+
an index with a different frequency than the existing index
46+
- will support datelike columns with a timezone as data_columns (GH2852_)
4247

4348
**API Changes**
4449

@@ -88,6 +93,7 @@ pandas 0.11.1
8893
.. _GH3251: https://github.com/pydata/pandas/issues/3251
8994
.. _GH3379: https://github.com/pydata/pandas/issues/3379
9095
.. _GH3480: https://github.com/pydata/pandas/issues/3480
96+
.. _GH2852: https://github.com/pydata/pandas/issues/2852
9197
.. _GH3454: https://github.com/pydata/pandas/issues/3454
9298
.. _GH3457: https://github.com/pydata/pandas/issues/3457
9399
.. _GH3491: https://github.com/pydata/pandas/issues/3491
@@ -103,7 +109,7 @@ pandas 0.11.1
103109
.. _GH3461: https://github.com/pydata/pandas/issues/3461
104110
.. _GH3468: https://github.com/pydata/pandas/issues/3468
105111
.. _GH3448: https://github.com/pydata/pandas/issues/3448
106-
.. _GH3449: https://github.com/pydata/pandas/issues/3449
112+
.. _GH3499: https://github.com/pydata/pandas/issues/3499
107113
.. _GH3495: https://github.com/pydata/pandas/issues/3495
108114
.. _GH3492: https://github.com/pydata/pandas/issues/3492
109115
.. _GH3493: https://github.com/pydata/pandas/issues/3493

doc/source/v0.11.1.txt

+11-3
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
1-
.. _whatsnew_0120:
1+
.. _whatsnew_0111:
22

3-
v0.12.0 (??)
3+
v0.11.1 (??)
44
------------------------
55

66
This is a major release from 0.11.0 and includes many new features and
@@ -12,13 +12,21 @@ API changes
1212

1313
Enhancements
1414
~~~~~~~~~~~~
15-
- pd.read_html() can now parse HTML string, files or urls and return dataframes
15+
- ``pd.read_html()`` can now parse HTML strings, files or URLs and return dataframes
1616
courtesy of @cpcloud. (GH3477_)
17+
- ``HDFStore``
18+
19+
- will retain index attributes (freq,tz,name) on recreation (GH3499_)
20+
- will warn with a FrequencyWarning if you are attempting to append
21+
an index with a different frequency than the existing index
22+
- will support datelike columns with a timezone as data_columns (GH2852_)
1723

1824
See the `full release notes
1925
<https://github.com/pydata/pandas/blob/master/RELEASE.rst>`__ or issue tracker
2026
on GitHub for a complete list.
2127

2228
.. _GH2437: https://github.com/pydata/pandas/issues/2437
29+
.. _GH2852: https://github.com/pydata/pandas/issues/2852
2330
.. _GH3477: https://github.com/pydata/pandas/issues/3477
2431
.. _GH3492: https://github.com/pydata/pandas/issues/3492
32+
.. _GH3499: https://github.com/pydata/pandas/issues/3499

doc/source/whatsnew.rst

+2
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@ These are new features and improvements of note in each release.
1818

1919
.. include:: v0.12.0.txt
2020

21+
.. include:: v0.11.1.txt
22+
2123
.. include:: v0.11.0.txt
2224

2325
.. include:: v0.10.1.txt

pandas/io/pytables.py

+77-25
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,11 @@ class IncompatibilityWarning(Warning): pass
4242
where criteria is being ignored as this version [%s] is too old (or not-defined),
4343
read the file in and write it out to a new file to upgrade (with the copy_to method)
4444
"""
45+
class FrequencyWarning(Warning): pass
46+
frequency_doc = """
47+
the frequency of the existing index is [%s] which conflicts with the new freq [%s],
48+
resetting the frequency to None
49+
"""
4550
class PerformanceWarning(Warning): pass
4651
performance_doc = """
4752
your performance may suffer as PyTables will pickle object types that it cannot map
@@ -149,9 +154,12 @@ def get_store(path, mode='a', complevel=None, complib=None,
149154

150155
### interface to/from ###
151156

152-
def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, **kwargs):
157+
def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs):
153158
""" store this object, close it if we opened it """
154-
f = lambda store: store.put(key, value, **kwargs)
159+
if append:
160+
f = lambda store: store.append(key, value, **kwargs)
161+
else:
162+
f = lambda store: store.put(key, value, **kwargs)
155163

156164
if isinstance(path_or_buf, basestring):
157165
with get_store(path_or_buf, mode=mode, complevel=complevel, complib=complib) as store:
@@ -941,6 +949,7 @@ class IndexCol(object):
941949
is_an_indexable = True
942950
is_data_indexable = True
943951
is_searchable = False
952+
_info_fields = ['freq','tz','name']
944953

945954
def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None,
946955
name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None,
@@ -1121,7 +1130,7 @@ def update_info(self, info):
11211130
""" set/update the info for this indexable with the key/value
11221131
if validate is True, then raise if an existing value does not match the value """
11231132

1124-
for key in ['freq','tz','name']:
1133+
for key in self._info_fields:
11251134

11261135
value = getattr(self,key,None)
11271136

@@ -1132,15 +1141,31 @@ def update_info(self, info):
11321141

11331142
existing_value = idx.get(key)
11341143
if key in idx and existing_value != value:
1135-
raise ValueError("invalid info for [%s] for [%s]"""
1136-
", existing_value [%s] conflicts with new value [%s]" % (self.name,
1137-
key,existing_value,value))
11381144

1139-
if value is not None or existing_value is not None:
1140-
idx[key] = value
1145+
# frequency just warn
1146+
if key == 'freq':
1147+
ws = frequency_doc % (existing_value,value)
1148+
warnings.warn(ws, FrequencyWarning)
1149+
1150+
# reset
1151+
idx[key] = None
1152+
1153+
else:
1154+
raise ValueError("invalid info for [%s] for [%s]"""
1155+
", existing_value [%s] conflicts with new value [%s]" % (self.name,
1156+
key,existing_value,value))
1157+
else:
1158+
if value is not None or existing_value is not None:
1159+
idx[key] = value
11411160

11421161
return self
11431162

1163+
def set_info(self, info):
1164+
""" set my state from the passed info """
1165+
idx = info.get(self.name)
1166+
if idx is not None:
1167+
self.__dict__.update(idx)
1168+
11441169
def get_attr(self):
11451170
""" set the kind for this colummn """
11461171
self.kind = getattr(self.attrs, self.kind_attr, None)
@@ -1180,6 +1205,7 @@ class DataCol(IndexCol):
11801205
is_an_indexable = False
11811206
is_data_indexable = False
11821207
is_searchable = False
1208+
_info_fields = ['tz']
11831209

11841210
@classmethod
11851211
def create_for_block(cls, i=None, name=None, cname=None, version=None, **kwargs):
@@ -1249,7 +1275,7 @@ def set_kind(self):
12491275
if self.typ is None:
12501276
self.typ = getattr(self.description,self.cname,None)
12511277

1252-
def set_atom(self, block, existing_col, min_itemsize, nan_rep, **kwargs):
1278+
def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, **kwargs):
12531279
""" create and setup my atom from the block b """
12541280

12551281
self.values = list(block.items)
@@ -1264,10 +1290,27 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, **kwargs):
12641290
"[date] is not implemented as a table column")
12651291
elif inferred_type == 'datetime':
12661292
if getattr(rvalues[0],'tzinfo',None) is not None:
1293+
1294+
# if this block has more than one timezone, raise
1295+
if len(set([r.tzinfo for r in rvalues])) != 1:
1296+
raise TypeError(
1297+
"too many timezones in this block, create separate data columns")
1298+
1299+
# convert this column to datetime64[ns] utc, and save the tz
1300+
index = DatetimeIndex(rvalues)
1301+
tz = getattr(index,'tz',None)
1302+
if tz is None:
1303+
raise TypeError(
1304+
"invalid timezone specification")
1305+
1306+
values = index.tz_convert('UTC').values.view('i8')
1307+
self.tz = tz
1308+
self.update_info(info)
1309+
self.set_atom_datetime64(block, values.reshape(block.values.shape))
1310+
1311+
else:
12671312
raise TypeError(
1268-
"timezone support on datetimes is not yet implemented as a table column")
1269-
raise TypeError(
1270-
"[datetime] is not implemented as a table column")
1313+
"[datetime] is not implemented as a table column")
12711314
elif inferred_type == 'unicode':
12721315
raise TypeError(
12731316
"[unicode] is not implemented as a table column")
@@ -1347,10 +1390,12 @@ def set_atom_data(self, block):
13471390
def get_atom_datetime64(self, block):
13481391
return _tables().Int64Col(shape=block.shape[0])
13491392

1350-
def set_atom_datetime64(self, block):
1393+
def set_atom_datetime64(self, block, values = None):
13511394
self.kind = 'datetime64'
13521395
self.typ = self.get_atom_datetime64(block)
1353-
self.set_data(block.values.view('i8'), 'datetime64')
1396+
if values is None:
1397+
values = block.values.view('i8')
1398+
self.set_data(values, 'datetime64')
13541399

13551400
@property
13561401
def shape(self):
@@ -1389,7 +1434,18 @@ def convert(self, values, nan_rep):
13891434

13901435
# reverse converts
13911436
if self.dtype == 'datetime64':
1392-
self.data = np.asarray(self.data, dtype='M8[ns]')
1437+
# recreate the timezone
1438+
if self.tz is not None:
1439+
1440+
# data should be 2-dim here
1441+
# we stored as utc, so just set the tz
1442+
1443+
index = DatetimeIndex(self.data.ravel(),tz='UTC').tz_convert(self.tz)
1444+
self.data = np.array(index.tolist(),dtype=object).reshape(self.data.shape)
1445+
1446+
else:
1447+
self.data = np.asarray(self.data, dtype='M8[ns]')
1448+
13931449
elif self.dtype == 'date':
13941450
self.data = np.array(
13951451
[date.fromtimestamp(v) for v in self.data], dtype=object)
@@ -2267,17 +2323,8 @@ def indexables(self):
22672323
d = self.description
22682324
self._indexables = []
22692325

2270-
# info
2271-
info = getattr(self.attrs,'info',None) or dict()
2272-
22732326
# index columns
2274-
def create_index(i, axis, name):
2275-
kwargs = dict( name=name, axis=axis, pos=i )
2276-
i = info.get(name)
2277-
if i is not None and len(i):
2278-
kwargs.update(i)
2279-
return IndexCol(**kwargs)
2280-
self._indexables.extend([ create_index(i,axis,name) for i, (axis, name) in enumerate(self.attrs.index_cols)])
2327+
self._indexables.extend([ IndexCol(name=name,axis=axis,pos=i) for i, (axis, name) in enumerate(self.attrs.index_cols)])
22812328

22822329
# values columns
22832330
dc = set(self.data_columns)
@@ -2370,6 +2417,7 @@ def read_axes(self, where, **kwargs):
23702417

23712418
# convert the data
23722419
for a in self.axes:
2420+
a.set_info(self.info)
23732421
a.convert(values, nan_rep=self.nan_rep)
23742422

23752423
return True
@@ -2535,6 +2583,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
25352583
existing_col=existing_col,
25362584
min_itemsize=min_itemsize,
25372585
nan_rep=nan_rep,
2586+
info=self.info,
25382587
**kwargs)
25392588
col.set_pos(j)
25402589

@@ -2654,6 +2703,7 @@ def read_column(self, column, where = None, **kwargs):
26542703

26552704
# column must be an indexable or a data column
26562705
c = getattr(self.table.cols, column)
2706+
a.set_info(self.info)
26572707
return Series(a.convert(c[:], nan_rep=self.nan_rep).take_data())
26582708

26592709
raise KeyError("column [%s] not found in the table" % column)
@@ -3365,6 +3415,8 @@ def convert_value(self, v):
33653415

33663416
if self.kind == 'datetime64' or self.kind == 'datetime' :
33673417
v = lib.Timestamp(v)
3418+
if v.tz is not None:
3419+
v = v.tz_convert('UTC')
33683420
return [v.value, v]
33693421
elif isinstance(v, datetime) or hasattr(v, 'timetuple') or self.kind == 'date':
33703422
v = time.mktime(v.timetuple())

pandas/io/tests/test_pytables.py

+69-6
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
import pandas
1111
from pandas import (Series, DataFrame, Panel, MultiIndex, bdate_range,
1212
date_range, Index)
13-
from pandas.io.pytables import HDFStore, get_store, Term, IncompatibilityWarning, PerformanceWarning
13+
from pandas.io.pytables import (HDFStore, get_store, Term,
14+
IncompatibilityWarning, PerformanceWarning,
15+
FrequencyWarning)
1416
import pandas.util.testing as tm
1517
from pandas.tests.test_series import assert_series_equal
1618
from pandas.tests.test_frame import assert_frame_equal
@@ -1260,16 +1262,48 @@ def test_unimplemented_dtypes_table_columns(self):
12601262
self.assertRaises(TypeError, store.append, 'df_unimplemented', df)
12611263

12621264
def test_table_append_with_timezones(self):
1263-
# not implemented yet
12641265

12651266
with ensure_clean(self.path) as store:
12661267

1267-
# check with mixed dtypes
1268-
df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern')),index=range(5))
1269-
1270-
# timezones not yet supported
1268+
def compare(a,b):
1269+
tm.assert_frame_equal(a,b)
1270+
1271+
# compare the zones on each element
1272+
for c in a.columns:
1273+
for i in a.index:
1274+
a_e = a[c][i]
1275+
b_e = b[c][i]
1276+
if not (a_e == b_e and a_e.tz == b_e.tz):
1277+
raise AssertionError("invalid tz comparsion [%s] [%s]" % (a_e,b_e))
1278+
1279+
from datetime import timedelta
1280+
1281+
_maybe_remove(store, 'df_tz')
1282+
df = DataFrame(dict(A = [ Timestamp('20130102 2:00:00',tz='US/Eastern') + timedelta(hours=1)*i for i in range(5) ]))
1283+
store.append('df_tz',df,data_columns=['A'])
1284+
compare(store['df_tz'],df)
1285+
1286+
# select with tz aware
1287+
compare(store.select('df_tz',where=Term('A','>=',df.A[3])),df[df.A>=df.A[3]])
1288+
1289+
_maybe_remove(store, 'df_tz')
1290+
df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130103',tz='US/Eastern')),index=range(5))
1291+
store.append('df_tz',df)
1292+
compare(store['df_tz'],df)
1293+
1294+
_maybe_remove(store, 'df_tz')
1295+
df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=range(5))
12711296
self.assertRaises(TypeError, store.append, 'df_tz', df)
12721297

1298+
# this is ok
1299+
_maybe_remove(store, 'df_tz')
1300+
store.append('df_tz',df,data_columns=['A','B'])
1301+
compare(store['df_tz'],df)
1302+
1303+
# can't append with diff timezone
1304+
df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=range(5))
1305+
self.assertRaises(ValueError, store.append, 'df_tz', df)
1306+
12731307
def test_remove(self):
12741308

12751309
with ensure_clean(self.path) as store:
@@ -2049,6 +2083,7 @@ def test_retain_index_attributes(self):
20492083
index=date_range('2000-1-1',periods=3,freq='H'))))
20502084

20512085
with ensure_clean(self.path) as store:
2086+
_maybe_remove(store,'data')
20522087
store.put('data', df, table=True)
20532088

20542089
result = store.get('data')
@@ -2058,6 +2093,34 @@ def test_retain_index_attributes(self):
20582093
for idx in ['index','columns']:
20592094
self.assert_(getattr(getattr(df,idx),attr,None) == getattr(getattr(result,idx),attr,None))
20602095

2096+
2097+
# try to append a table with a different frequency
2098+
warnings.filterwarnings('ignore', category=FrequencyWarning)
2099+
df2 = DataFrame(dict(A = Series(xrange(3),
2100+
index=date_range('2002-1-1',periods=3,freq='D'))))
2101+
store.append('data',df2)
2102+
warnings.filterwarnings('always', category=FrequencyWarning)
2103+
2104+
self.assert_(store.get_storer('data').info['index']['freq'] is None)
2105+
2106+
# this is ok
2107+
_maybe_remove(store,'df2')
2108+
df2 = DataFrame(dict(A = Series(xrange(3),
2109+
index=[Timestamp('20010101'),Timestamp('20010102'),Timestamp('20020101')])))
2110+
store.append('df2',df2)
2111+
df3 = DataFrame(dict(A = Series(xrange(3),index=date_range('2002-1-1',periods=3,freq='D'))))
2112+
store.append('df2',df3)
2113+
2114+
def test_retain_index_attributes2(self):
2115+
2116+
with tm.ensure_clean(self.path) as path:
2117+
warnings.filterwarnings('ignore', category=FrequencyWarning)
2118+
df = DataFrame(dict(A = Series(xrange(3), index=date_range('2000-1-1',periods=3,freq='H'))))
2119+
df.to_hdf(path,'data',mode='w',append=True)
2120+
df2 = DataFrame(dict(A = Series(xrange(3), index=date_range('2002-1-1',periods=3,freq='D'))))
2121+
df2.to_hdf(path,'data',append=True)
2122+
warnings.filterwarnings('always', category=FrequencyWarning)
2123+
20612124
def test_panel_select(self):
20622125

20632126
wp = tm.makePanel()

0 commit comments

Comments
 (0)