diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 1dcde2000fc89..3be38e123b5d5 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -330,6 +330,7 @@ I/O - Bug in :func:`read_csv` not switching ``true_values`` and ``false_values`` for nullable ``boolean`` dtype (:issue:`34655`) - Bug in :func:`read_json` when ``orient="split"`` does not maintain numeric string index (:issue:`28556`) - :meth:`read_sql` returned an empty generator if ``chunksize`` was no-zero and the query returned no results. Now returns a generator with a single empty dataframe (:issue:`34411`) +- Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using ``where`` parameter (:issue:`39189`) Period ^^^^^^ diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index db2385de06e93..6a3b95186d666 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -210,12 +210,10 @@ def stringify(value): return TermValue(int(v), v, kind) elif meta == "category": metadata = extract_array(self.metadata, extract_numpy=True) - result = metadata.searchsorted(v, side="left") - - # result returns 0 if v is first element or if v is not in metadata - # check that metadata contains v - if not result and v not in metadata: + if v not in metadata: result = -1 + else: + result = metadata.searchsorted(v, side="left") return TermValue(result, result, "integer") elif kind == "integer": v = int(float(v)) diff --git a/pandas/tests/io/pytables/test_categorical.py b/pandas/tests/io/pytables/test_categorical.py index 67209c2bc0d57..b873811de616c 100644 --- a/pandas/tests/io/pytables/test_categorical.py +++ b/pandas/tests/io/pytables/test_categorical.py @@ -184,3 +184,25 @@ def test_categorical_nan_only_columns(setup_path): df.to_hdf(path, "df", format="table", data_columns=True) result = read_hdf(path, "df") tm.assert_frame_equal(result, expected) + + 
+@pytest.mark.parametrize(
+    "where, df, expected",
+    [
+        ('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})),
+        ('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})),
+    ],
+)
+def test_convert_value(setup_path, where: str, df: DataFrame, expected: DataFrame):
+    # GH39420
+    # read_hdf must apply a ``where`` filter on a categorical string column correctly.
+    df = df.astype({"col": "category"})  # new frame; param input untouched
+    max_widths = {"col": 1}
+    categorical_values = sorted(df["col"].unique())
+    expected = expected.astype({"col": "category"})
+    expected["col"] = expected["col"].cat.set_categories(categorical_values)
+
+    with ensure_clean_path(setup_path) as path:
+        df.to_hdf(path, "df", format="table", min_itemsize=max_widths)
+        result = read_hdf(path, "df", where=where)  # explicit key, as in sibling tests
+        tm.assert_frame_equal(result, expected)