Skip to content

Commit e908733

Browse files
shawnheide authored and jreback committed
BUG: fix categories in HDFStore not filtering correctly
closes #13322 closes #13792
1 parent 5b0d947 commit e908733

File tree

3 files changed

+36
-0
lines changed

3 files changed

+36
-0
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -816,6 +816,7 @@ Bug Fixes
816816
- Clean some compile time warnings in datetime parsing (:issue:`13607`)
817817
- Bug in ``factorize`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13750`)
818818
- Bug in ``.set_index`` raises ``AmbiguousTimeError`` if new index contains DST boundary and multi levels (:issue:`12920`)
819+
- Bug in ``pd.read_hdf()`` where an incorrect result was returned when reading a ``DataFrame`` with a ``categorical`` column using a query that doesn't match any values (:issue:`13792`)
819820

820821

821822
- Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`)

pandas/computation/pytables.py

+5
Original file line numberDiff line numberDiff line change
@@ -198,6 +198,11 @@ def stringify(value):
198198
elif meta == u('category'):
199199
metadata = com._values_from_object(self.metadata)
200200
result = metadata.searchsorted(v, side='left')
201+
202+
# result returns 0 if v is first element or if v is not in metadata
203+
# check that metadata contains v
204+
if not result and v not in metadata:
205+
result = -1
201206
return TermValue(result, result, u('integer'))
202207
elif kind == u('integer'):
203208
v = int(float(v))

pandas/io/tests/test_pytables.py

+30
Original file line numberDiff line numberDiff line change
@@ -4733,6 +4733,36 @@ def test_categorical(self):
47334733
self.assertRaises(
47344734
KeyError, lambda: store.select('df3/meta/s/meta'))
47354735

4736+
def test_categorical_conversion(self):
    """Selecting with a ``where`` that matches nothing yields an empty frame.

    Regression test for GH13322. The same table is written and queried
    twice: first with plain string columns, then with ``obsids`` and
    ``imgids`` converted to ``category``. In both cases the query
    ``obsids=B`` matches no stored value, so ``read_hdf`` is expected to
    return an empty DataFrame matching the types of the frame written.
    """
    frame = DataFrame(dict(
        obsids=['ESP_012345_6789', 'ESP_987654_3210'],
        imgids=['APF00006np', 'APF0001imm'],
        data=[4.3, 9.8]))

    def assert_no_rows_selected(df):
        # Expected result: an empty positional slice of the frame written.
        empty = df.iloc[[], :]
        with ensure_clean_path(self.path) as path:
            df.to_hdf(path, 'df', format='table', data_columns=True)
            selected = read_hdf(path, 'df', where='obsids=B')
            tm.assert_frame_equal(selected, empty)

    # Without categories.
    assert_no_rows_selected(frame)

    # With categories: same data, string columns converted in place.
    frame.obsids = frame.obsids.astype('category')
    frame.imgids = frame.imgids.astype('category')
    assert_no_rows_selected(frame)
4765+
47364766
def test_duplicate_column_name(self):
47374767
df = DataFrame(columns=["a", "a"], data=[[0, 0]])
47384768

0 commit comments

Comments
 (0)