Skip to content

Commit 5a9b498

Browse files
chrish42 authored and jreback committed
BUG: Make pd.read_hdf('data.h5') work when pandas object stored contained categorical columns
closes #13231 Author: Christian Hudon <[email protected]> Closes #13359 from chrish42/gh13231 and squashes the following commits: e839638 [Christian Hudon] Raise a better exception when the HDF file is empty and key=None. 611aa28 [Christian Hudon] Formatting fixes. e7c8313 [Christian Hudon] Add changelog entry. df10016 [Christian Hudon] Make logic that detects if there is only one dataset in a HDF5 file work when storing a dataframe that contains categorical data. 2f41aef [Christian Hudon] Tweak comment to be clearer. b3a5773 [Christian Hudon] Add test that fails for GitHub bug #13231 02f90d5 [Christian Hudon] Use if-expression.
1 parent 863cbc5 commit 5a9b498

File tree

3 files changed

+52
-7
lines changed

3 files changed

+52
-7
lines changed

doc/source/whatsnew/v0.18.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,7 @@ Bug Fixes
342342
- Bug in ``.resample(..)`` with a ``PeriodIndex`` not retaining its type or name appropriately when used with an empty ``DataFrame`` (:issue:`13212`)
343343
- Bug in ``groupby(..).resample(..)`` where passing some keywords would raise an exception (:issue:`13235`)
344344
- Bug in ``.tz_convert`` on a tz-aware ``DatetimeIndex`` that relied on index being sorted for correct results (:issue:`13306`)
345+
- Bug in ``pd.read_hdf()`` where loading an HDF file containing a single dataset with one or more categorical columns failed unless the ``key`` argument was set to the name of the dataset (:issue:`13231`)
345346

346347

347348

pandas/io/pytables.py

+28-5
Original file line numberDiff line numberDiff line change
@@ -331,11 +331,20 @@ def read_hdf(path_or_buf, key=None, **kwargs):
331331

332332
try:
333333
if key is None:
334-
keys = store.keys()
335-
if len(keys) != 1:
336-
raise ValueError('key must be provided when HDF file contains '
337-
'multiple datasets.')
338-
key = keys[0]
334+
groups = store.groups()
335+
if len(groups) == 0:
336+
raise ValueError('No dataset in HDF5 file.')
337+
candidate_only_group = groups[0]
338+
339+
# For the HDF file to have only one dataset, all other groups
340+
# should then be metadata groups for that candidate group. (This
341+
# assumes that the groups() method enumerates parent groups
342+
# before their children.)
343+
for group_to_check in groups[1:]:
344+
if not _is_metadata_of(group_to_check, candidate_only_group):
345+
raise ValueError('key must be provided when HDF5 file '
346+
'contains multiple datasets.')
347+
key = candidate_only_group._v_pathname
339348
return store.select(key, auto_close=auto_close, **kwargs)
340349
except:
341350
# if there is an error, close the store
@@ -347,6 +356,20 @@ def read_hdf(path_or_buf, key=None, **kwargs):
347356
raise
348357

349358

359+
def _is_metadata_of(group, parent_group):
    """
    Check whether ``group`` is a metadata group of ``parent_group``.

    ``group`` qualifies when it sits deeper in the hierarchy than
    ``parent_group`` and either it or one of its ancestors is a node named
    ``'meta'`` whose direct parent is ``parent_group``.

    Parameters
    ----------
    group : node
        Candidate node; must expose ``_v_depth``, ``_v_parent`` and
        ``_v_name``.
    parent_group : node
        Node the candidate is tested against; must expose ``_v_depth``.

    Returns
    -------
    bool
        True if a matching ``'meta'`` node is found, False otherwise.
    """
    if not group._v_depth > parent_group._v_depth:
        # A node at the same level or higher up cannot belong to parent_group.
        return False

    node = group
    # Climb towards the root; nodes at depth 1 or less are never inspected.
    while node._v_depth > 1:
        owner = node._v_parent
        if owner == parent_group and node._v_name == 'meta':
            return True
        node = owner
    return False
371+
372+
350373
class HDFStore(StringMixin):
351374

352375
"""

pandas/io/tests/test_pytables.py

+23-2
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@
4646

4747
from distutils.version import LooseVersion
4848

49-
_default_compressor = LooseVersion(tables.__version__) >= '2.2' \
50-
and 'blosc' or 'zlib'
49+
_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2'
50+
else 'zlib')
5151

5252
_multiprocess_can_split_ = False
5353

@@ -4877,13 +4877,34 @@ def test_read_nokey(self):
48774877
df = DataFrame(np.random.rand(4, 5),
48784878
index=list('abcd'),
48794879
columns=list('ABCDE'))
4880+
4881+
# Categorical dtype not supported for "fixed" format. So no need
4882+
# to test with that dtype in the dataframe here.
48804883
with ensure_clean_path(self.path) as path:
48814884
df.to_hdf(path, 'df', mode='a')
48824885
reread = read_hdf(path)
48834886
assert_frame_equal(df, reread)
48844887
df.to_hdf(path, 'df2', mode='a')
48854888
self.assertRaises(ValueError, read_hdf, path)
48864889

4890+
def test_read_nokey_table(self):
    """Round-trip a table-format frame with a categorical column, no key."""
    # GH13231
    categorical = Series(list('abacd'), dtype='category')
    df = DataFrame({'i': range(5),
                    'c': categorical})

    with ensure_clean_path(self.path) as path:
        # With a single dataset in the file, read_hdf is called without
        # a key and the result must equal what was written.
        df.to_hdf(path, 'df', mode='a', format='table')
        roundtripped = read_hdf(path)
        assert_frame_equal(df, roundtripped)

        # After a second dataset is appended, a key-less read must raise.
        df.to_hdf(path, 'df2', mode='a', format='table')
        with self.assertRaises(ValueError):
            read_hdf(path)
4901+
4902+
def test_read_nokey_empty(self):
    """A key-less read_hdf on a store with nothing written raises."""
    with ensure_clean_path(self.path) as path:
        # Open and immediately close a store so the file exists but
        # nothing has been written to it.
        HDFStore(path).close()
        with self.assertRaises(ValueError):
            read_hdf(path)
4907+
48874908
def test_read_from_pathlib_path(self):
48884909

48894910
# GH11773

0 commit comments

Comments
 (0)