BUG: raise a clear error for invalid (non-string) column names in an HDF5 table format

jameshiebert · jameshiebert · commit 8273356cf5d3 · 2015-06-02T08:05:07.000-07:00
Have DataFrame.to_hdf() raise a ValueError when writing the PyTables table format with non-string column names. Fixes #9057
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -73,7 +73,7 @@ Bug Fixes
 - Bug in ``Timestamp``'s' ``microsecond``, ``quarter``, ``dayofyear``, ``week`` and ``daysinmonth`` properties return ``np.int`` type, not built-in ``int``. (:issue:`10050`)
 - Bug in ``NaT`` raises ``AttributeError`` when accessing to ``daysinmonth``, ``dayofweek`` properties. (:issue:`10096`)
 
-
+- Bug in ``DataFrame.to_hdf()`` where writing in table format with invalid (non-string) column names raised a seemingly unrelated error. Such column names are now rejected explicitly with a ``ValueError``. (:issue:`9057`)
 - Bug in getting timezone data with ``dateutil`` on various platforms ( :issue:`9059`, :issue:`8639`, :issue:`9663`, :issue:`10121`)
 - Bug in display datetimes with mixed frequencies uniformly; display 'ms' datetimes to the proper precision. (:issue:`10170`)
 
@@ -94,4 +94,3 @@ Bug Fixes
 
 
 - Bug where infer_freq infers timerule (WOM-5XXX) unsupported by to_offset (:issue:`9425`)
-
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -257,6 +257,7 @@ def _tables():
 def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None,
            append=None, **kwargs):
     """ store this object, close it if we opened it """
+
     if append:
         f = lambda store: store.append(key, value, **kwargs)
     else:
@@ -1535,6 +1536,12 @@ def maybe_set_size(self, min_itemsize=None, **kwargs):
                 self.typ = _tables(
                 ).StringCol(itemsize=min_itemsize, pos=self.pos)
 
+    def validate(self, handler, append, **kwargs):
+        self.validate_names()
+
+    def validate_names(self):
+        pass
+
     def validate_and_set(self, handler, append, **kwargs):
         self.set_table(handler.table)
         self.validate_col()
@@ -2080,6 +2087,10 @@ class DataIndexableCol(DataCol):
     """ represent a data column that can be indexed """
     is_data_indexable = True
 
+    def validate_names(self):
+        if not Index(self.values).is_object():
+            raise ValueError("cannot have non-object label DataIndexableCol")
+
     def get_atom_string(self, block, itemsize):
         return _tables().StringCol(itemsize=itemsize)
 
@@ -3756,6 +3767,9 @@ def write(self, obj, axes=None, append=False, complib=None,
                          min_itemsize=min_itemsize,
                          **kwargs)
 
+        for a in self.axes:
+            a.validate(self, append)
+
         if not self.is_exists:
 
             # create the table
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -4640,6 +4640,35 @@ def test_colums_multiindex_modified(self):
             df_loaded = read_hdf(path, 'df', columns=cols2load)
             self.assertTrue(cols2load_original == cols2load)
 
+    def test_to_hdf_with_object_column_names(self):
+        # GH9057
+        # Writing HDF5 table format should only work for string-like
+        # column types
+
+        types_should_fail = [ tm.makeIntIndex, tm.makeFloatIndex,
+                                tm.makeDateIndex, tm.makeTimedeltaIndex,
+                                tm.makePeriodIndex ]
+        types_should_run = [ tm.makeStringIndex, tm.makeCategoricalIndex ]
+
+        if compat.PY3:
+            types_should_run.append(tm.makeUnicodeIndex)
+        else:
+            types_should_fail.append(tm.makeUnicodeIndex)
+
+        for index in types_should_fail:
+            df = DataFrame(np.random.randn(10, 2), columns=index(2))
+            with ensure_clean_path(self.path) as path:
+                with self.assertRaises(ValueError,
+                        msg="cannot have non-object label DataIndexableCol"):
+                    df.to_hdf(path, 'df', format='table', data_columns=True)
+
+        for index in types_should_run:
+            df = DataFrame(np.random.randn(10, 2), columns=index(2))
+            with ensure_clean_path(self.path) as path:
+                df.to_hdf(path, 'df', format='table', data_columns=True)
+                result = pd.read_hdf(path, 'df', where="index = [{0}]".format(df.index[0]))
+                assert(len(result))
+
 
 def _test_sort(obj):
     if isinstance(obj, DataFrame):