From 02f90d5b3a4afef95b8fcb8e0e7529acaf5ad9a3 Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Thu, 2 Jun 2016 16:46:16 -0700 Subject: [PATCH 1/7] Use if-expression. --- pandas/io/tests/test_pytables.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 96b66265ea586..28d687b00d73f 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -46,8 +46,8 @@ from distutils.version import LooseVersion -_default_compressor = LooseVersion(tables.__version__) >= '2.2' \ - and 'blosc' or 'zlib' +_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2' + else 'zlib') _multiprocess_can_split_ = False From b3a577319a0a4c00cd8a3c6211e910b5a160424d Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Thu, 2 Jun 2016 16:47:01 -0700 Subject: [PATCH 2/7] Add test that fails for GitHub bug #13231 --- pandas/io/tests/test_pytables.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 28d687b00d73f..14f5b867280a3 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4877,6 +4877,8 @@ def test_read_nokey(self): df = DataFrame(np.random.rand(4, 5), index=list('abcd'), columns=list('ABCDE')) + # Categorical dtype not supported for "fixed" format. So no need + # to test for that. 
with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', mode='a') reread = read_hdf(path) @@ -4884,6 +4886,17 @@ def test_read_nokey(self): df.to_hdf(path, 'df2', mode='a') self.assertRaises(ValueError, read_hdf, path) + def test_read_nokey_table(self): + # GH13231 + df = DataFrame({'i': range(5), + 'c': Series(list('abacd'), dtype='category')}) + with ensure_clean_path(self.path) as path: + df.to_hdf(path, 'df', mode='a', format='table') + reread = read_hdf(path) + assert_frame_equal(df, reread) + df.to_hdf(path, 'df2', mode='a', format='table') + self.assertRaises(ValueError, read_hdf, path) + def test_read_from_pathlib_path(self): # GH11773 From 2f41aef53d97bfbba2cb5f4a01e5aa2928b9f4e5 Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Fri, 3 Jun 2016 15:28:06 -0700 Subject: [PATCH 3/7] Tweak comment to be clearer. --- pandas/io/tests/test_pytables.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 14f5b867280a3..6dbc2746ebf38 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4878,7 +4878,7 @@ def test_read_nokey(self): index=list('abcd'), columns=list('ABCDE')) # Categorical dtype not supported for "fixed" format. So no need - # to test for that. + # to test with that dtype in the dataframe here. with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', mode='a') reread = read_hdf(path) From df100160ded4d36e957e701de9289e54275e00c1 Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Fri, 3 Jun 2016 15:29:45 -0700 Subject: [PATCH 4/7] Make the logic that detects whether an HDF5 file contains only one dataset work when the stored dataframe contains categorical data.
--- pandas/io/pytables.py | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index fcf5125d956c6..072f634618c58 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -331,11 +331,17 @@ def read_hdf(path_or_buf, key=None, **kwargs): try: if key is None: - keys = store.keys() - if len(keys) != 1: - raise ValueError('key must be provided when HDF file contains ' - 'multiple datasets.') - key = keys[0] + groups = store.groups() + candidate_only_group = groups[0] + # For the HDF file to have only one dataset, all other groups + # should then be metadata groups for that candidate group. (This + # assumes that the groups() method enumerates parent groups + # before their children.) + for group_to_check in groups[1:]: + if not _is_metadata_of(group_to_check, candidate_only_group): + raise ValueError('key must be provided when HDF file ' + 'contains multiple datasets.') + key = candidate_only_group._v_pathname return store.select(key, auto_close=auto_close, **kwargs) except: # if there is an error, close the store @@ -347,6 +353,20 @@ def read_hdf(path_or_buf, key=None, **kwargs): raise +def _is_metadata_of(group, parent_group): + """Check if a given group is a metadata group for a given parent_group.""" + if group._v_depth <= parent_group._v_depth: + return False + + current = group + while current._v_depth > 1: + parent = current._v_parent + if parent == parent_group and current._v_name == 'meta': + return True + current = current._v_parent + return False + + class HDFStore(StringMixin): """ From e7c8313e611ca13024f31ce179a4758155047835 Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Fri, 3 Jun 2016 16:03:17 -0700 Subject: [PATCH 5/7] Add changelog entry. 
--- doc/source/whatsnew/v0.18.2.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/doc/source/whatsnew/v0.18.2.txt b/doc/source/whatsnew/v0.18.2.txt index 950bf397f43b5..7cf27d13a44ac 100644 --- a/doc/source/whatsnew/v0.18.2.txt +++ b/doc/source/whatsnew/v0.18.2.txt @@ -374,3 +374,6 @@ Bug Fixes - Bug in ``Categorical.remove_unused_categories()`` changes ``.codes`` dtype to platform int (:issue:`13261`) + +- Bug in ``pd.read_hdf()`` where attempting to load an HDF file with a single dataset (that had one or more categorical columns) failed unless the ``key`` argument was set to the name of the dataset (:issue:`13231`) + From 611aa284695af9c6a5b22f155961795729664432 Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Sat, 4 Jun 2016 12:11:26 -0700 Subject: [PATCH 6/7] Formatting fixes. --- pandas/io/pytables.py | 1 + pandas/io/tests/test_pytables.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 072f634618c58..2d26ab06207a8 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -333,6 +333,7 @@ def read_hdf(path_or_buf, key=None, **kwargs): if key is None: groups = store.groups() candidate_only_group = groups[0] + # For the HDF file to have only one dataset, all other groups # should then be metadata groups for that candidate group. (This # assumes that the groups() method enumerates parent groups diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 6dbc2746ebf38..d1d665a7f1a43 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4877,6 +4877,7 @@ def test_read_nokey(self): df = DataFrame(np.random.rand(4, 5), index=list('abcd'), columns=list('ABCDE')) + # Categorical dtype not supported for "fixed" format. So no need # to test with that dtype in the dataframe here.
with ensure_clean_path(self.path) as path: @@ -4890,6 +4891,7 @@ def test_read_nokey_table(self): # GH13231 df = DataFrame({'i': range(5), 'c': Series(list('abacd'), dtype='category')}) + with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', mode='a', format='table') reread = read_hdf(path) From e8396382fa164573614950e3d0f16e0bcb22c47d Mon Sep 17 00:00:00 2001 From: Christian Hudon Date: Sat, 4 Jun 2016 12:22:42 -0700 Subject: [PATCH 7/7] Raise a better exception when the HDF file is empty and key=None. --- pandas/io/pytables.py | 2 ++ pandas/io/tests/test_pytables.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index 2d26ab06207a8..6c7623ec7ed4a 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -332,6 +332,8 @@ def read_hdf(path_or_buf, key=None, **kwargs): try: if key is None: groups = store.groups() + if len(groups) == 0: + raise ValueError('No dataset in HDF file.') candidate_only_group = groups[0] # For the HDF file to have only one dataset, all other groups diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index d1d665a7f1a43..9c13162bd774c 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -4899,6 +4899,12 @@ def test_read_nokey_table(self): df.to_hdf(path, 'df2', mode='a', format='table') self.assertRaises(ValueError, read_hdf, path) + def test_read_nokey_empty(self): + with ensure_clean_path(self.path) as path: + store = HDFStore(path) + store.close() + self.assertRaises(ValueError, read_hdf, path) + def test_read_from_pathlib_path(self): # GH11773