Skip to content

Commit b8c5c67

Browse files
committed
Merge pull request #4099 from jreback/hdf_tz
BUG: GH4098, HDFStore not recreating a datetime index properly when has a timezone
2 parents cdb3b2c + 134daed commit b8c5c67

File tree

3 files changed

+253
-218
lines changed

3 files changed

+253
-218
lines changed

doc/source/release.rst

+7-7
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ pandas 0.12
5252
- A ``filter`` method on grouped Series or DataFrames returns a subset of
5353
the original (:issue:`3680`, :issue:`919`)
5454
- Access to historical Google Finance data in pandas.io.data (:issue:`3814`)
55-
- DataFrame plotting methods can sample column colors from a Matplotlib
55+
- DataFrame plotting methods can sample column colors from a Matplotlib
5656
colormap via the ``colormap`` keyword. (:issue:`3860`)
5757

5858
**Improvements to existing features**
@@ -63,7 +63,7 @@ pandas 0.12
6363
- ``convert_objects`` now accepts a ``copy`` parameter (defaults to ``True``)
6464
- ``HDFStore``
6565

66-
- will retain index attributes (freq,tz,name) on recreation (:issue:`3499`)
66+
- will retain index attributes (freq,tz,name) on recreation (:issue:`3499`,:issue:`4098`)
6767
- will warn with an ``AttributeConflictWarning`` if you are attempting to append
6868
an index with a different frequency than the existing, or attempting
6969
to append an index with a different name than the existing
@@ -158,7 +158,7 @@ pandas 0.12
158158
- removed ``clipboard`` support to ``pandas.io.clipboard``
159159
- replace top-level and instance methods ``save`` and ``load`` with
160160
top-level ``read_pickle`` and ``to_pickle`` instance method, ``save`` and
161-
``load`` will give deprecation warning.
161+
``load`` will give deprecation warning.
162162
- the ``method`` and ``axis`` arguments of ``DataFrame.replace()`` are
163163
deprecated
164164
- set FutureWarning to require data_source, and to replace year/month with
@@ -215,7 +215,7 @@ pandas 0.12
215215
- Extend ``reindex`` to correctly deal with non-unique indices (:issue:`3679`)
216216
- ``DataFrame.itertuples()`` now works with frames with duplicate column
217217
names (:issue:`3873`)
218-
- Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
218+
- Bug in non-unique indexing via ``iloc`` (:issue:`4017`); added ``takeable`` argument to
219219
``reindex`` for location-based taking
220220

221221
- Fixed bug in groupby with empty series referencing a variable before assignment. (:issue:`3510`)
@@ -272,16 +272,16 @@ pandas 0.12
272272
- Correctly parse when passed the ``dtype=str`` (or other variable-len string dtypes)
273273
in ``read_csv`` (:issue:`3795`)
274274
- Fix index name not propagating when using ``loc/ix`` (:issue:`3880`)
275-
- Fix groupby when applying a custom function resulting in a returned DataFrame was
275+
- Fix groupby when applying a custom function resulting in a returned DataFrame was
276276
not converting dtypes (:issue:`3911`)
277277
- Fixed a bug where ``DataFrame.replace`` with a compiled regular expression
278278
in the ``to_replace`` argument wasn't working (:issue:`3907`)
279279
- Fixed ``__truediv__`` in Python 2.7 with ``numexpr`` installed to actually do true division when dividing
280280
two integer arrays with at least 10000 cells total (:issue:`3764`)
281281
- Indexing with a string with seconds resolution not selecting from a time index (:issue:`3925`)
282-
- csv parsers would loop infinitely if ``iterator=True`` but no ``chunksize`` was
282+
- csv parsers would loop infinitely if ``iterator=True`` but no ``chunksize`` was
283283
specified (:issue:`3967`), python parser failing with ``chunksize=1``
284-
- Fix index name not propagating when using ``shift``
284+
- Fix index name not propagating when using ``shift``
285285
- Fixed dropna=False being ignored with multi-index stack (:issue:`3997`)
286286
- Fixed flattening of columns when renaming MultiIndex columns DataFrame (:issue:`4004`)
287287
- Fix ``Series.clip`` for datetime series. NA/NaN threshold values will now throw ValueError (:issue:`3996`)

pandas/io/pytables.py

+44-32
Original file line numberDiff line numberDiff line change
@@ -151,8 +151,8 @@ def _tables():
151151
def h5_open(path, mode):
152152
tables = _tables()
153153
return tables.openFile(path, mode)
154-
155-
154+
155+
156156
@contextmanager
157157
def get_store(path, mode='a', complevel=None, complib=None,
158158
fletcher32=False):
@@ -217,7 +217,7 @@ def read_hdf(path_or_buf, key, **kwargs):
217217

218218
# a passed store; user controls open/close
219219
f(path_or_buf, False)
220-
220+
221221
class HDFStore(object):
222222
"""
223223
dict-like IO interface for storing pandas objects in PyTables
@@ -757,7 +757,7 @@ def get_node(self, key):
757757
def get_storer(self, key):
758758
""" return the storer object for a key, raise if not in the file """
759759
group = self.get_node(key)
760-
if group is None:
760+
if group is None:
761761
return None
762762
s = self._create_storer(group)
763763
s.infer_axes()
@@ -810,9 +810,9 @@ def _create_storer(self, group, value = None, table = False, append = False, **k
810810
""" return a suitable Storer class to operate """
811811

812812
def error(t):
813-
raise TypeError("cannot properly create the storer for: [%s] [group->%s,value->%s,table->%s,append->%s,kwargs->%s]" %
813+
raise TypeError("cannot properly create the storer for: [%s] [group->%s,value->%s,table->%s,append->%s,kwargs->%s]" %
814814
(t,group,type(value),table,append,kwargs))
815-
815+
816816
pt = _ensure_decoded(getattr(group._v_attrs,'pandas_type',None))
817817
tt = _ensure_decoded(getattr(group._v_attrs,'table_type',None))
818818

@@ -863,7 +863,7 @@ def error(t):
863863
tt = u'appendable_ndim'
864864

865865
else:
866-
866+
867867
# distinguish between a frame/table
868868
tt = u'legacy_panel'
869869
try:
@@ -930,7 +930,7 @@ def _read_group(self, group, **kwargs):
930930

931931
class TableIterator(object):
932932
""" define the iteration interface on a table
933-
933+
934934
Parameters
935935
----------
936936
@@ -974,7 +974,7 @@ def __iter__(self):
974974
yield v
975975

976976
self.close()
977-
977+
978978
def close(self):
979979
if self.auto_close:
980980
self.store.close()
@@ -1003,7 +1003,7 @@ class IndexCol(object):
10031003
_info_fields = ['freq','tz','index_name']
10041004

10051005
def __init__(self, values=None, kind=None, typ=None, cname=None, itemsize=None,
1006-
name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None,
1006+
name=None, axis=None, kind_attr=None, pos=None, freq=None, tz=None,
10071007
index_name=None, **kwargs):
10081008
self.values = values
10091009
self.kind = kind
@@ -1088,21 +1088,27 @@ def convert(self, values, nan_rep, encoding):
10881088
except:
10891089
pass
10901090

1091+
values =_maybe_convert(values, self.kind, encoding)
1092+
10911093
kwargs = dict()
10921094
if self.freq is not None:
10931095
kwargs['freq'] = _ensure_decoded(self.freq)
1094-
if self.tz is not None:
1095-
kwargs['tz'] = _ensure_decoded(self.tz)
10961096
if self.index_name is not None:
10971097
kwargs['name'] = _ensure_decoded(self.index_name)
10981098
try:
1099-
self.values = Index(_maybe_convert(values, self.kind, self.encoding), **kwargs)
1099+
self.values = Index(values, **kwargs)
11001100
except:
11011101

11021102
# if the output freq is different than what we recorded, then infer it
11031103
if 'freq' in kwargs:
11041104
kwargs['freq'] = 'infer'
11051105
self.values = Index(_maybe_convert(values, self.kind, encoding), **kwargs)
1106+
1107+
# set the timezone if indicated
1108+
# we stored in utc, so reverse to local timezone
1109+
if self.tz is not None:
1110+
self.values = self.values.tz_localize('UTC').tz_convert(_ensure_decoded(self.tz))
1111+
11061112
return self
11071113

11081114
def take_data(self):
@@ -1189,7 +1195,7 @@ def update_info(self, info):
11891195
idx = info[self.name]
11901196
except:
11911197
idx = info[self.name] = dict()
1192-
1198+
11931199
existing_value = idx.get(key)
11941200
if key in idx and value is not None and existing_value != value:
11951201

@@ -1235,7 +1241,7 @@ def is_indexed(self):
12351241

12361242
def convert(self, values, nan_rep, encoding):
12371243
""" set the values from this selection: take = take ownership """
1238-
1244+
12391245
self.values = Int64Index(np.arange(self.table.nrows))
12401246
return self
12411247

@@ -1359,7 +1365,13 @@ def set_atom(self, block, existing_col, min_itemsize, nan_rep, info, encoding=No
13591365
"invalid timezone specification")
13601366

13611367
values = index.tz_convert('UTC').values.view('i8')
1362-
self.tz = tz
1368+
1369+
# store a converted timezone
1370+
zone = tslib.get_timezone(index.tz)
1371+
if zone is None:
1372+
zone = tslib.tot_seconds(index.tz.utcoffset())
1373+
self.tz = zone
1374+
13631375
self.update_info(info)
13641376
self.set_atom_datetime64(block, values.reshape(block.values.shape))
13651377

@@ -1398,7 +1410,7 @@ def set_atom_string(self, block, existing_col, min_itemsize, nan_rep, encoding):
13981410
inferred_type = lib.infer_dtype(col.ravel())
13991411
if inferred_type != 'string':
14001412
raise TypeError("Cannot serialize the column [%s] because\n"
1401-
"its data contents are [%s] object dtype" %
1413+
"its data contents are [%s] object dtype" %
14021414
(item,inferred_type))
14031415

14041416

@@ -1607,7 +1619,7 @@ def __repr__(self):
16071619
s = "[%s]" % ','.join([ str(x) for x in s ])
16081620
return "%-12.12s (shape->%s)" % (self.pandas_type,s)
16091621
return self.pandas_type
1610-
1622+
16111623
def __str__(self):
16121624
return self.__repr__()
16131625

@@ -1929,7 +1941,7 @@ def write_array_empty(self, key, value):
19291941
self._handle.createArray(self.group, key, arr)
19301942
getattr(self.group, key)._v_attrs.value_type = str(value.dtype)
19311943
getattr(self.group, key)._v_attrs.shape = value.shape
1932-
1944+
19331945
def write_array(self, key, value, items=None):
19341946
if key in self.group:
19351947
self._handle.removeNode(self.group, key)
@@ -2142,7 +2154,7 @@ def shape(self):
21422154
try:
21432155
ndim = self.ndim
21442156

2145-
# items
2157+
# items
21462158
items = 0
21472159
for i in range(self.nblocks):
21482160
node = getattr(self.group, 'block%d_items' % i)
@@ -2212,7 +2224,7 @@ class PanelStorer(BlockManagerStorer):
22122224
pandas_kind = u'wide'
22132225
obj_type = Panel
22142226
is_shape_reversed = True
2215-
2227+
22162228
def write(self, obj, **kwargs):
22172229
obj._consolidate_inplace()
22182230
return super(PanelStorer, self).write(obj, **kwargs)
@@ -2270,7 +2282,7 @@ def __repr__(self):
22702282
self.ncols,
22712283
','.join([ a.name for a in self.index_axes ]),
22722284
dc)
2273-
2285+
22742286
def __getitem__(self, c):
22752287
""" return the axis for c """
22762288
for a in self.axes:
@@ -2568,7 +2580,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
25682580
try:
25692581
axes = _AXES_MAP[type(obj)]
25702582
except:
2571-
raise TypeError("cannot properly create the storer for: [group->%s,value->%s]" %
2583+
raise TypeError("cannot properly create the storer for: [group->%s,value->%s]" %
25722584
(self.group._v_name,type(obj)))
25732585

25742586
# map axes to numbers
@@ -2597,7 +2609,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
25972609
# nan_representation
25982610
if nan_rep is None:
25992611
nan_rep = 'nan'
2600-
2612+
26012613
self.nan_rep = nan_rep
26022614

26032615
# create axes to index and non_index
@@ -2665,7 +2677,7 @@ def create_axes(self, axes, obj, validate=True, nan_rep=None, data_columns=None,
26652677
name = b.items[0]
26662678
self.data_columns.append(name)
26672679

2668-
# make sure that we match up the existing columns
2680+
# make sure that we match up the existing columns
26692681
# if we have an existing table
26702682
if existing_table is not None and validate:
26712683
try:
@@ -2740,7 +2752,7 @@ def process_filter(field, filt):
27402752
return obj.ix._getitem_axis(takers,axis=axis_number)
27412753

27422754
raise ValueError("cannot find the field [%s] for filtering!" % field)
2743-
2755+
27442756
obj = process_filter(field, filt)
27452757

27462758
return obj
@@ -3053,7 +3065,7 @@ def write_data_chunk(self, indexes, mask, search, values):
30533065
self.table.flush()
30543066
except (Exception), detail:
30553067
raise Exception("tables cannot write this data -> %s" % str(detail))
3056-
3068+
30573069
def delete(self, where=None, **kwargs):
30583070

30593071
# delete all rows (and return the nrows)
@@ -3113,7 +3125,7 @@ class AppendableFrameTable(AppendableTable):
31133125
table_type = u'appendable_frame'
31143126
ndim = 2
31153127
obj_type = DataFrame
3116-
3128+
31173129
@property
31183130
def is_transposed(self):
31193131
return self.index_axes[0].axis == 1
@@ -3266,7 +3278,7 @@ def _convert_index(index, encoding=None):
32663278

32673279
if isinstance(index, DatetimeIndex):
32683280
converted = index.asi8
3269-
return IndexCol(converted, 'datetime64', _tables().Int64Col(),
3281+
return IndexCol(converted, 'datetime64', _tables().Int64Col(),
32703282
freq=getattr(index,'freq',None), tz=getattr(index,'tz',None),
32713283
index_name=index_name)
32723284
elif isinstance(index, (Int64Index, PeriodIndex)):
@@ -3382,7 +3394,7 @@ def _unconvert_string_array(data, nan_rep=None, encoding=None):
33823394

33833395
if nan_rep is None:
33843396
nan_rep = 'nan'
3385-
3397+
33863398
data = lib.string_array_replace_from_nan_rep(data, nan_rep)
33873399
return data.reshape(shape)
33883400

@@ -3421,7 +3433,7 @@ class Term(object):
34213433
value : a value or list of values (required)
34223434
queryables : a kinds map (dict of column name -> kind), or None if column is non-indexable
34233435
encoding : an encoding that will encode the query terms
3424-
3436+
34253437
Returns
34263438
-------
34273439
a Term object
@@ -3582,7 +3594,7 @@ def eval(self):
35823594
if self.is_in_table:
35833595

35843596
self.condition = self.generate(values[0])
3585-
3597+
35863598
else:
35873599

35883600
raise TypeError("passing a filterable condition to a non-table indexer [%s]" % str(self))

0 commit comments

Comments
 (0)