BUG: fix categories in HDFStore not filtering correctly

shawnheide · jreback · commit e9087339cd6e · 2016-07-28T20:19:30.000-04:00
closes #13322 closes #13792
diff --git a/doc/source/whatsnew/v0.19.0.txt b/doc/source/whatsnew/v0.19.0.txt
@@ -816,6 +816,7 @@ Bug Fixes
 - Clean some compile time warnings in datetime parsing (:issue:`13607`)
 - Bug in ``factorize`` raises ``AmbiguousTimeError`` if data contains datetime near DST boundary (:issue:`13750`)
 - Bug in ``.set_index`` raises ``AmbiguousTimeError`` if new index contains DST boundary and multi levels (:issue:`12920`)
+- Bug in ``pd.read_hdf()`` where an incorrect result was returned when reading a ``DataFrame`` with a ``categorical`` column using a query that doesn't match any values (:issue:`13792`)
 
 
 - Bug in ``Series`` comparison operators when dealing with zero dim NumPy arrays (:issue:`13006`)
diff --git a/pandas/computation/pytables.py b/pandas/computation/pytables.py
@@ -198,6 +198,11 @@ def stringify(value):
         elif meta == u('category'):
             metadata = com._values_from_object(self.metadata)
             result = metadata.searchsorted(v, side='left')
+
+            # result is 0 if v is the first element, but also if v is absent
+            # and sorts before it; check that metadata actually contains v
+            if not result and v not in metadata:
+                result = -1
             return TermValue(result, result, u('integer'))
         elif kind == u('integer'):
             v = int(float(v))
diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py
@@ -4733,6 +4733,36 @@ def test_categorical(self):
             self.assertRaises(
                 KeyError, lambda: store.select('df3/meta/s/meta'))
 
+    def test_categorical_conversion(self):
+
+        # GH13322
+        # Check that read_hdf with categorical columns doesn't return rows if
+        # the where criteria are not met.
+        obsids = ['ESP_012345_6789', 'ESP_987654_3210']
+        imgids = ['APF00006np', 'APF0001imm']
+        data = [4.3, 9.8]
+
+        # Test without categories
+        df = DataFrame(dict(obsids=obsids, imgids=imgids, data=data))
+
+        # We are expecting an empty DataFrame matching types of df
+        expected = df.iloc[[], :]
+        with ensure_clean_path(self.path) as path:
+            df.to_hdf(path, 'df', format='table', data_columns=True)
+            result = read_hdf(path, 'df', where='obsids=B')
+            tm.assert_frame_equal(result, expected)
+
+        # Test with categories
+        df.obsids = df.obsids.astype('category')
+        df.imgids = df.imgids.astype('category')
+
+        # We are expecting an empty DataFrame matching types of df
+        expected = df.iloc[[], :]
+        with ensure_clean_path(self.path) as path:
+            df.to_hdf(path, 'df', format='table', data_columns=True)
+            result = read_hdf(path, 'df', where='obsids=B')
+            tm.assert_frame_equal(result, expected)
+
     def test_duplicate_column_name(self):
         df = DataFrame(columns=["a", "a"], data=[[0, 0]])