Skip to content

Commit a461208

Browse files
committed
BUG: fix categories in HDFStore not filtering correctly (#13322)
1 parent cc216ad commit a461208

File tree

3 files changed

+36
-0
lines changed

3 files changed

+36
-0
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -788,3 +788,4 @@ Bug Fixes
788788
- Bugs in ``Index.difference`` and ``DataFrame.join`` raise in Python3 when using mixed-integer indexes (:issue:`13432`, :issue:`12814`)
789789

790790
- Bug in ``.to_excel()`` when DataFrame contains a MultiIndex which contains a label with a NaN value (:issue:`13511`)
791+
- Bug in ``pd.read_hdf()`` where incorrect results are returned when the ``HDFStore`` contains a ``DataFrame`` with a categorical column and the query doesn't match any values (:issue:`13792`)

pandas/computation/pytables.py

+5
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,11 @@ def stringify(value):
198198
elif meta == u('category'):
199199
metadata = com._values_from_object(self.metadata)
200200
result = metadata.searchsorted(v, side='left')
201+
202+
# searchsorted returns 0 if v is the first element, but also if v is absent and sorts before every element of metadata
203+
# check that metadata contains v
204+
if not result and v not in metadata:
205+
result = -1
201206
return TermValue(result, result, u('integer'))
202207
elif kind == u('integer'):
203208
v = int(float(v))

pandas/io/tests/test_pytables.py

+30
Original file line numberDiff line numberDiff line change
@@ -4733,6 +4733,36 @@ def test_categorical(self):
47334733
self.assertRaises(
47344734
KeyError, lambda: store.select('df3/meta/s/meta'))
47354735

4736+
def test_categorical_conversion(self):
4737+
4738+
# GH13322
4739+
# Check that read_hdf with categorical columns doesn't return rows if
4740+
# where criteria isn't met.
4741+
obsids = ['ESP_012345_6789', 'ESP_987654_3210']
4742+
imgids = ['APF00006np', 'APF0001imm']
4743+
data = [4.3, 9.8]
4744+
4745+
# Test without categories
4746+
df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data))
4747+
4748+
# We are expecting an empty DataFrame matching types of df
4749+
expected = df.iloc[[], :]
4750+
with ensure_clean_path(self.path) as path:
4751+
df.to_hdf(path, 'df', format='table', data_columns=True)
4752+
result = read_hdf(path, 'df', where='obsids=B')
4753+
tm.assert_frame_equal(result, expected)
4754+
4755+
# Test with categories
4756+
df.obsids = df.obsids.astype('category')
4757+
df.imgids = df.imgids.astype('category')
4758+
4759+
# We are expecting an empty DataFrame matching types of df
4760+
expected = df.iloc[[], :]
4761+
with ensure_clean_path(self.path) as path:
4762+
df.to_hdf(path, 'df', format='table', data_columns=True)
4763+
result = read_hdf(path, 'df', where='obsids=B')
4764+
tm.assert_frame_equal(result, expected)
4765+
47364766
def test_duplicate_column_name(self):
47374767
df = DataFrame(columns=["a", "a"], data=[[0, 0]])
47384768

0 commit comments

Comments
 (0)