Skip to content

Commit addcec2

Browse files
committed
ENH: Implement export of datetime64[ns, tz] dtypes with a fixed HDF5 store #11411
1 parent 91407ff commit addcec2

File tree

3 files changed

+76
-47
lines changed

3 files changed

+76
-47
lines changed

doc/source/whatsnew/v0.17.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ Enhancements
6666
pd.Index([1, np.nan, 3]).fillna(2)
6767

6868
- ``pivot_table`` now has a ``margins_name`` argument so you can use something other than the default of 'All' (:issue:`3335`)
69+
- Implement export of ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`11411`)
6970

7071
.. _whatsnew_0171.api:
7172

@@ -159,4 +160,3 @@ Bug Fixes
159160
- Bug in ``DataFrame.join()`` with ``how='right'`` producing a ``TypeError`` (:issue:`11519`)
160161
- Bug in ``Series.quantile`` where an empty list of quantiles gives a result whose ``Index`` has ``object`` dtype (:issue:`11588`)
161162
- Bug in ``pd.merge`` results in empty ``Int64Index`` rather than ``Index(dtype=object)`` when the merge result is empty (:issue:`11588`)
162-

pandas/io/pytables.py

+56-38
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
# versioning attribute
4848
_version = '0.15.2'
4949

50+
### encoding ###
5051
# PY3 encoding if we don't specify
5152
_default_encoding = 'UTF-8'
5253

@@ -64,22 +65,8 @@ def _ensure_encoding(encoding):
6465
encoding = _default_encoding
6566
return encoding
6667

67-
def _set_tz(values, tz, preserve_UTC=False):
68-
""" set the timezone if values are an Index """
69-
if tz is not None and isinstance(values, Index):
70-
tz = _ensure_decoded(tz)
71-
if values.tz is None:
72-
values = values.tz_localize('UTC').tz_convert(tz)
73-
if preserve_UTC:
74-
if tslib.get_timezone(tz) == 'UTC':
75-
values = list(values)
76-
77-
return values
78-
79-
8068
Term = Expr
8169

82-
8370
def _ensure_term(where, scope_level):
8471
"""
8572
ensure that the where is a Term or a list of Term
@@ -1947,14 +1934,11 @@ def set_atom_datetime64tz(self, block, info, values=None):
19471934
if values is None:
19481935
values = block.values
19491936

1950-
# convert this column to datetime64[ns] utc, and save the tz
1951-
values = values.tz_convert('UTC').values.view('i8').reshape(block.shape)
1937+
# convert this column to i8 in UTC, and save the tz
1938+
values = values.asi8.reshape(block.shape)
19521939

19531940
# store a converted timezone
1954-
zone = tslib.get_timezone(block.values.tz)
1955-
if zone is None:
1956-
zone = tslib.tot_seconds(block.values.tz.utcoffset())
1957-
self.tz = zone
1941+
self.tz = _get_tz(block.values.tz)
19581942
self.update_info(info)
19591943

19601944
self.kind = 'datetime64'
@@ -2015,18 +1999,9 @@ def convert(self, values, nan_rep, encoding):
20151999

20162000
# reverse converts
20172001
if dtype == u('datetime64'):
2018-
# recreate the timezone
2019-
if self.tz is not None:
2020-
2021-
# data should be 2-dim here
2022-
# we stored as utc, so just set the tz
20232002

2024-
index = DatetimeIndex(
2025-
self.data.ravel(), tz='UTC').tz_convert(tslib.maybe_get_tz(self.tz))
2026-
self.data = index
2027-
2028-
else:
2029-
self.data = np.asarray(self.data, dtype='M8[ns]')
2003+
# recreate with tz if indicated
2004+
self.data = _set_tz(self.data, self.tz, coerce=True)
20302005

20312006
elif dtype == u('timedelta64'):
20322007
self.data = np.asarray(self.data, dtype='m8[ns]')
@@ -2347,7 +2322,10 @@ def read_array(self, key):
23472322
ret = data
23482323

23492324
if dtype == u('datetime64'):
2350-
ret = np.asarray(ret, dtype='M8[ns]')
2325+
2326+
# reconstruct a timezone if indicated
2327+
ret = _set_tz(ret, getattr(attrs, 'tz', None), coerce=True)
2328+
23512329
elif dtype == u('timedelta64'):
23522330
ret = np.asarray(ret, dtype='m8[ns]')
23532331

@@ -2397,10 +2375,7 @@ def write_index(self, key, index):
23972375
node._v_attrs.freq = index.freq
23982376

23992377
if hasattr(index, 'tz') and index.tz is not None:
2400-
zone = tslib.get_timezone(index.tz)
2401-
if zone is None:
2402-
zone = tslib.tot_seconds(index.tz.utcoffset())
2403-
node._v_attrs.tz = zone
2378+
node._v_attrs.tz = _get_tz(index.tz)
24042379

24052380
def write_block_index(self, key, index):
24062381
self.write_array('%s_blocs' % key, index.blocs)
@@ -2574,11 +2549,20 @@ def write_array(self, key, value, items=None):
25742549
if empty_array:
25752550
self.write_array_empty(key, value)
25762551
else:
2577-
if value.dtype.type == np.datetime64:
2552+
if com.is_datetime64_dtype(value.dtype):
25782553
self._handle.create_array(self.group, key, value.view('i8'))
25792554
getattr(
25802555
self.group, key)._v_attrs.value_type = 'datetime64'
2581-
elif value.dtype.type == np.timedelta64:
2556+
elif com.is_datetime64tz_dtype(value.dtype):
2557+
# store as UTC
2558+
# with a zone
2559+
self._handle.create_array(self.group, key,
2560+
value.asi8)
2561+
2562+
node = getattr(self.group, key)
2563+
node._v_attrs.tz = _get_tz(value.tz)
2564+
node._v_attrs.value_type = 'datetime64'
2565+
elif com.is_timedelta64_dtype(value.dtype):
25822566
self._handle.create_array(self.group, key, value.view('i8'))
25832567
getattr(
25842568
self.group, key)._v_attrs.value_type = 'timedelta64'
@@ -4248,6 +4232,40 @@ def _get_info(info, name):
42484232
idx = info[name] = dict()
42494233
return idx
42504234

4235+
### tz to/from coercion ###
4236+
def _get_tz(tz):
    """
    for a tz-aware type, return an encoded zone

    Parameters
    ----------
    tz : timezone object of the tz-aware type

    Returns
    -------
    the value of ``tslib.get_timezone(tz)``; if that is None, the result
    of ``tslib.tot_seconds(tz.utcoffset())`` is returned instead
    """
    encoded = tslib.get_timezone(tz)
    if encoded is not None:
        return encoded

    # no zone was reported for this tz, so fall back to its utc offset
    return tslib.tot_seconds(tz.utcoffset())
4242+
4243+
def _set_tz(values, tz, preserve_UTC=False, coerce=False):
4244+
"""
4245+
coerce the values to a DatetimeIndex if tz is set
4246+
preserve the input shape if possible
4247+
4248+
Parameters
4249+
----------
4250+
values : ndarray
4251+
tz : string/pickled tz object
4252+
preserve_UTC : boolean,
4253+
preserve the UTC of the result
4254+
coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray
4255+
"""
4256+
if tz is not None:
4257+
values = values.ravel()
4258+
tz = tslib.get_timezone(_ensure_decoded(tz))
4259+
values = DatetimeIndex(values)
4260+
if values.tz is None:
4261+
values = values.tz_localize('UTC').tz_convert(tz)
4262+
if preserve_UTC:
4263+
if tz == 'UTC':
4264+
values = list(values)
4265+
elif coerce:
4266+
values = np.asarray(values, dtype='M8[ns]')
4267+
4268+
return values
42514269

42524270
def _convert_index(index, encoding=None, format_type=None):
42534271
index_name = getattr(index, 'name', None)

pandas/io/tests/test_pytables.py

+19-8
Original file line numberDiff line numberDiff line change
@@ -4909,15 +4909,26 @@ def test_tseries_select_index_column(self):
49094909
result = store.select_column('frame', 'index')
49104910
self.assertEqual(rng.tz, result.dt.tz)
49114911

4912-
def test_timezones(self):
4913-
rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
4914-
frame = DataFrame(np.random.randn(len(rng), 4), index=rng)
4915-
4912+
def test_timezones_fixed(self):
49164913
with ensure_clean_store(self.path) as store:
4917-
store['frame'] = frame
4918-
recons = store['frame']
4919-
self.assertTrue(recons.index.equals(rng))
4920-
self.assertEqual(rng.tz, recons.index.tz)
4914+
4915+
# index
4916+
rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern')
4917+
df = DataFrame(np.random.randn(len(rng), 4), index=rng)
4918+
store['df'] = df
4919+
result = store['df']
4920+
assert_frame_equal(result, df)
4921+
4922+
# as data
4923+
# GH11411
4924+
_maybe_remove(store, 'df')
4925+
df = DataFrame({'A' : rng,
4926+
'B' : rng.tz_convert('UTC').tz_localize(None),
4927+
'C' : rng.tz_convert('CET'),
4928+
'D' : range(len(rng))}, index=rng)
4929+
store['df'] = df
4930+
result = store['df']
4931+
assert_frame_equal(result, df)
49214932

49224933
def test_fixed_offset_tz(self):
49234934
rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00')

0 commit comments

Comments
 (0)