BUG: Make pd.read_hdf('data.h5') work when the stored pandas object contains categorical columns

chrish42 · jreback · commit 5a9b498e43a4 · 2016-06-05T10:06:00.000-04:00
closes #13231 Author: Christian Hudon <chrish@pianocktail.org> Closes #13359 from chrish42/gh13231 and squashes the following commits: e839638 [Christian Hudon] Raise a better exception when the HDF file is empty and key=None. 611aa28 [Christian Hudon] Formatting fixes. e7c8313 [Christian Hudon] Add changelog entry. df10016 [Christian Hudon] Make logic that detects if there is only one dataset in an HDF5 file work when storing a dataframe that contains categorical data. 2f41aef [Christian Hudon] Tweak comment to be clearer. b3a5773 [Christian Hudon] Add test that fails for GitHub bug #13231 02f90d5 [Christian Hudon] Use if-expression.
diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt
@@ -342,6 +342,7 @@ Bug Fixes
 - Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name with an empty ``DataFrame``appropriately when empty (:issue:`13212`)
 - Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`)
 - Bug in ``.tz_convert`` on a tz-aware ``DateTimeIndex`` that relied on index being sorted for correct results (:issue: `13306`)
+- Bug in ``pd.read_hdf()`` where loading an HDF file containing a single dataset with one or more categorical columns failed unless the ``key`` argument was set to the name of the dataset (:issue:`13231`)
 
 
 
diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py
@@ -331,11 +331,20 @@ def read_hdf(path_or_buf, key=None, **kwargs):
 
     try:
         if key is None:
-            keys = store.keys()
-            if len(keys) != 1:
-                raise ValueError('key must be provided when HDF file contains '
-                                 'multiple datasets.')
-            key = keys[0]
+            groups = store.groups()
+            if len(groups) == 0:
+                raise ValueError('No dataset in HDF5 file.')
+            candidate_only_group = groups[0]
+
+            # For the HDF file to have only one dataset, all other groups
+            # should then be metadata groups for that candidate group. (This
+            # assumes that the groups() method enumerates parent groups
+            # before their children.)
+            for group_to_check in groups[1:]:
+                if not _is_metadata_of(group_to_check, candidate_only_group):
+                    raise ValueError('key must be provided when HDF5 file '
+                                     'contains multiple datasets.')
+            key = candidate_only_group._v_pathname
         return store.select(key, auto_close=auto_close, **kwargs)
     except:
         # if there is an error, close the store
@@ -347,6 +356,20 @@ def read_hdf(path_or_buf, key=None, **kwargs):
         raise
 
 
+def _is_metadata_of(group, parent_group):
+    """True if group is the 'meta' child of parent_group or lies beneath it."""
+    if group._v_depth <= parent_group._v_depth:  # cannot be a descendant
+        return False
+
+    current = group
+    while current._v_depth > 1:  # walk up; nodes at depth <= 1 are not tested
+        parent = current._v_parent
+        if parent == parent_group and current._v_name == 'meta':
+            return True  # reached parent_group's 'meta' child
+        current = current._v_parent
+    return False
+
+
 class HDFStore(StringMixin):
 
     """
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -46,8 +46,8 @@
 
 from distutils.version import LooseVersion
 
-_default_compressor = LooseVersion(tables.__version__) >= '2.2' \
-    and 'blosc' or 'zlib'
+_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2'
+                       else 'zlib')
 
 _multiprocess_can_split_ = False
 
@@ -4877,13 +4877,34 @@ def test_read_nokey(self):
         df = DataFrame(np.random.rand(4, 5),
                        index=list('abcd'),
                        columns=list('ABCDE'))
+
+        # Categorical dtype not supported for "fixed" format. So no need
+        # to test with that dtype in the dataframe here.
         with ensure_clean_path(self.path) as path:
             df.to_hdf(path, 'df', mode='a')
             reread = read_hdf(path)
             assert_frame_equal(df, reread)
             df.to_hdf(path, 'df2', mode='a')
             self.assertRaises(ValueError, read_hdf, path)
 
+    def test_read_nokey_table(self):
+        # GH13231: read_hdf() without key on a table with a categorical column
+        df = DataFrame({'i': range(5),
+                        'c': Series(list('abacd'), dtype='category')})
+
+        with ensure_clean_path(self.path) as path:
+            df.to_hdf(path, 'df', mode='a', format='table')
+            reread = read_hdf(path)  # only one dataset stored, no key passed
+            assert_frame_equal(df, reread)
+            df.to_hdf(path, 'df2', mode='a', format='table')  # second dataset
+            self.assertRaises(ValueError, read_hdf, path)
+
+    def test_read_nokey_empty(self):
+        with ensure_clean_path(self.path) as path:
+            store = HDFStore(path)  # opened, then closed with nothing stored
+            store.close()
+            self.assertRaises(ValueError, read_hdf, path)  # no key to infer
+
     def test_read_from_pathlib_path(self):
 
         # GH11773