From addcec2eb8a8f990234c19f2a0a0bb471e73fe43 Mon Sep 17 00:00:00 2001 From: Jeff Reback Date: Tue, 17 Nov 2015 07:40:55 -0500 Subject: [PATCH] ENH: Implement export of datetime64[ns, tz] dtypes with a fixed HDF5 store #11411 --- doc/source/whatsnew/v0.17.1.txt | 2 +- pandas/io/pytables.py | 94 +++++++++++++++++++------------- pandas/io/tests/test_pytables.py | 27 ++++++--- 3 files changed, 76 insertions(+), 47 deletions(-) diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index 96f936c58c5d5..d32725449c49b 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -66,6 +66,7 @@ Enhancements pd.Index([1, np.nan, 3]).fillna(2) - ``pivot_table`` now has a ``margins_name`` argument so you can use something other than the default of 'All' (:issue:`3335`) +- Implement export of ``datetime64[ns, tz]`` dtypes with a fixed HDF5 store (:issue:`11411`) .. _whatsnew_0171.api: @@ -159,4 +160,3 @@ Bug Fixes - Bug in ``DataFrame.join()`` with ``how='right'`` producing a ``TypeError`` (:issue:`11519`) - Bug in ``Series.quantile`` with empty list results has ``Index`` with ``object`` dtype (:issue:`11588`) - Bug in ``pd.merge`` results in empty ``Int64Index`` rather than ``Index(dtype=object)`` when the merge result is empty (:issue:`11588`) - diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 4e25b546bddf2..fb57a7f8bd838 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -47,6 +47,7 @@ # versioning attribute _version = '0.15.2' +### encoding ### # PY3 encoding if we don't specify _default_encoding = 'UTF-8' @@ -64,22 +65,8 @@ def _ensure_encoding(encoding): encoding = _default_encoding return encoding -def _set_tz(values, tz, preserve_UTC=False): - """ set the timezone if values are an Index """ - if tz is not None and isinstance(values, Index): - tz = _ensure_decoded(tz) - if values.tz is None: - values = values.tz_localize('UTC').tz_convert(tz) - if preserve_UTC: - if tslib.get_timezone(tz) == 'UTC': - values = list(values) - - return values - - Term = Expr - def _ensure_term(where, scope_level): """ ensure that the where is a Term or a list of Term @@ -1947,14 +1934,11 @@ def set_atom_datetime64tz(self, block, info, values=None): if values is None: values = block.values - # convert this column to datetime64[ns] utc, and save the tz - values = values.tz_convert('UTC').values.view('i8').reshape(block.shape) + # convert this column to i8 in UTC, and save the tz + values = values.asi8.reshape(block.shape) # store a converted timezone - zone = tslib.get_timezone(block.values.tz) - if zone is None: - zone = tslib.tot_seconds(block.values.tz.utcoffset()) - self.tz = zone + self.tz = _get_tz(block.values.tz) self.update_info(info) self.kind = 'datetime64' @@ -2015,18 +1999,9 @@ def convert(self, values, nan_rep, encoding): # reverse converts if dtype == u('datetime64'): - # recreate the timezone - if self.tz is not None: - - # data should be 2-dim here - # we stored as utc, so just set the tz - index = DatetimeIndex( - self.data.ravel(), tz='UTC').tz_convert(tslib.maybe_get_tz(self.tz)) - self.data = index - - else: - self.data = np.asarray(self.data, dtype='M8[ns]') + # recreate with tz if indicated + self.data = _set_tz(self.data, self.tz, coerce=True) elif dtype == u('timedelta64'): self.data = np.asarray(self.data, dtype='m8[ns]') @@ -2347,7 +2322,10 @@ def read_array(self, key): ret = data if dtype == u('datetime64'): - ret = np.asarray(ret, dtype='M8[ns]') + + # reconstruct a timezone if indicated + ret = _set_tz(ret, getattr(attrs, 'tz', None), coerce=True) + elif dtype == u('timedelta64'): ret = np.asarray(ret, dtype='m8[ns]') @@ -2397,10 +2375,7 @@ def write_index(self, key, index): node._v_attrs.freq = index.freq if hasattr(index, 'tz') and index.tz is not None: - zone = tslib.get_timezone(index.tz) - if zone is None: - zone = tslib.tot_seconds(index.tz.utcoffset()) - node._v_attrs.tz = zone + node._v_attrs.tz = _get_tz(index.tz) def write_block_index(self, key, index): self.write_array('%s_blocs' % key, index.blocs) @@ -2574,11 +2549,20 @@ def write_array(self, key, value, items=None): if empty_array: self.write_array_empty(key, value) else: - if value.dtype.type == np.datetime64: + if com.is_datetime64_dtype(value.dtype): self._handle.create_array(self.group, key, value.view('i8')) getattr( self.group, key)._v_attrs.value_type = 'datetime64' - elif value.dtype.type == np.timedelta64: + elif com.is_datetime64tz_dtype(value.dtype): + # store as UTC + # with a zone + self._handle.create_array(self.group, key, + value.asi8) + + node = getattr(self.group, key) + node._v_attrs.tz = _get_tz(value.tz) + node._v_attrs.value_type = 'datetime64' + elif com.is_timedelta64_dtype(value.dtype): self._handle.create_array(self.group, key, value.view('i8')) getattr( self.group, key)._v_attrs.value_type = 'timedelta64' @@ -4248,6 +4232,40 @@ def _get_info(info, name): idx = info[name] = dict() return idx +### tz to/from coercion ### +def _get_tz(tz): + """ for a tz-aware type, return an encoded zone """ + zone = tslib.get_timezone(tz) + if zone is None: + zone = tslib.tot_seconds(tz.utcoffset()) + return zone + +def _set_tz(values, tz, preserve_UTC=False, coerce=False): + """ + coerce the values to a DatetimeIndex if tz is set + preserve the input shape if possible + + Parameters + ---------- + values : ndarray + tz : string/pickled tz object + preserve_UTC : boolean, + preserve the UTC of the result + coerce : if we do not have a passed timezone, coerce to M8[ns] ndarray + """ + if tz is not None: + values = values.ravel() + tz = tslib.get_timezone(_ensure_decoded(tz)) + values = DatetimeIndex(values) + if values.tz is None: + values = values.tz_localize('UTC').tz_convert(tz) + if preserve_UTC: + if tz == 'UTC': + values = list(values) + elif coerce: + values = np.asarray(values, dtype='M8[ns]') + + return values def _convert_index(index, encoding=None, format_type=None): index_name = getattr(index, 'name', None) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 9ffb0bfe79b8d..fd8c28bfe0f85 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4909,15 +4909,26 @@ def test_tseries_select_index_column(self): result = store.select_column('frame', 'index') self.assertEqual(rng.tz, result.dt.tz) - def test_timezones(self): - rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') - frame = DataFrame(np.random.randn(len(rng), 4), index=rng) - + def test_timezones_fixed(self): with ensure_clean_store(self.path) as store: - store['frame'] = frame - recons = store['frame'] - self.assertTrue(recons.index.equals(rng)) - self.assertEqual(rng.tz, recons.index.tz) + + # index + rng = date_range('1/1/2000', '1/30/2000', tz='US/Eastern') + df = DataFrame(np.random.randn(len(rng), 4), index=rng) + store['df'] = df + result = store['df'] + assert_frame_equal(result, df) + + # as data + # GH11411 + _maybe_remove(store, 'df') + df = DataFrame({'A' : rng, + 'B' : rng.tz_convert('UTC').tz_localize(None), + 'C' : rng.tz_convert('CET'), + 'D' : range(len(rng))}, index=rng) + store['df'] = df + result = store['df'] + assert_frame_equal(result, df) def test_fixed_offset_tz(self): rng = date_range('1/1/2000 00:00:00-07:00', '1/30/2000 00:00:00-07:00')