-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Make pd.read_hdf('data.h5') work when pandas object stored contained categorical columns #13359
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 5 commits
02f90d5
b3a5773
2f41aef
df10016
e7c8313
611aa28
e839638
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -331,11 +331,17 @@ def read_hdf(path_or_buf, key=None, **kwargs): | |
|
||
try: | ||
if key is None: | ||
keys = store.keys() | ||
if len(keys) != 1: | ||
raise ValueError('key must be provided when HDF file contains ' | ||
'multiple datasets.') | ||
key = keys[0] | ||
groups = store.groups() | ||
candidate_only_group = groups[0] | ||
# For the HDF file to have only one dataset, all other groups | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blank line |
||
# should then be metadata groups for that candidate group. (This | ||
# assumes that the groups() method enumerates parent groups | ||
# before their children.) | ||
for group_to_check in groups[1:]: | ||
if not _is_metadata_of(group_to_check, candidate_only_group): | ||
raise ValueError('key must be provided when HDF file ' | ||
'contains multiple datasets.') | ||
key = candidate_only_group._v_pathname | ||
return store.select(key, auto_close=auto_close, **kwargs) | ||
except: | ||
# if there is an error, close the store | ||
|
@@ -347,6 +353,20 @@ def read_hdf(path_or_buf, key=None, **kwargs): | |
raise | ||
|
||
|
||
def _is_metadata_of(group, parent_group): | ||
"""Check if a given group is a metadata group for a given parent_group.""" | ||
if group._v_depth <= parent_group._v_depth: | ||
return False | ||
|
||
current = group | ||
while current._v_depth > 1: | ||
parent = current._v_parent | ||
if parent == parent_group and current._v_name == 'meta': | ||
return True | ||
current = current._v_parent | ||
return False | ||
|
||
|
||
class HDFStore(StringMixin): | ||
|
||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,8 +46,8 @@ | |
|
||
from distutils.version import LooseVersion | ||
|
||
_default_compressor = LooseVersion(tables.__version__) >= '2.2' \ | ||
and 'blosc' or 'zlib' | ||
_default_compressor = ('blosc' if LooseVersion(tables.__version__) >= '2.2' | ||
else 'zlib') | ||
|
||
_multiprocess_can_split_ = False | ||
|
||
|
@@ -4877,13 +4877,26 @@ def test_read_nokey(self): | |
df = DataFrame(np.random.rand(4, 5), | ||
index=list('abcd'), | ||
columns=list('ABCDE')) | ||
# Categorical dtype not supported for "fixed" format. So no need | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blank line |
||
# to test with that dtype in the dataframe here. | ||
with ensure_clean_path(self.path) as path: | ||
df.to_hdf(path, 'df', mode='a') | ||
reread = read_hdf(path) | ||
assert_frame_equal(df, reread) | ||
df.to_hdf(path, 'df2', mode='a') | ||
self.assertRaises(ValueError, read_hdf, path) | ||
|
||
def test_read_nokey_table(self): | ||
# GH13231 | ||
df = DataFrame({'i': range(5), | ||
'c': Series(list('abacd'), dtype='category')}) | ||
with ensure_clean_path(self.path) as path: | ||
df.to_hdf(path, 'df', mode='a', format='table') | ||
reread = read_hdf(path) | ||
assert_frame_equal(df, reread) | ||
df.to_hdf(path, 'df2', mode='a', format='table') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. blank line There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do you really mean "add a blank line before the last assert (assertRaises) of test_read_nokey_table()?" Because then that with-block would not look the same as the pretty much identical one in the very similar test_read_nokey(). I'm assuming for consistency's sake that you meant a blank like before the with-block (like for test_read_nokey()). If not, please let me know. |
||
self.assertRaises(ValueError, read_hdf, path) | ||
|
||
def test_read_from_pathlib_path(self): | ||
|
||
# GH11773 | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this will fail on an empty store (and the error it raises will be odd, e.g.
IndexError
).