Skip to content

Commit 0b16fb3

Browse files
authored
Read hdf returns unexpected values for categorical (pandas-dev#39420)
1 parent 2075d20 commit 0b16fb3

File tree

3 files changed

+26
-5
lines changed

3 files changed

+26
-5
lines changed

doc/source/whatsnew/v1.3.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ I/O
375375
- Bug in :func:`read_csv` not switching ``true_values`` and ``false_values`` for nullable ``boolean`` dtype (:issue:`34655`)
376376
- Bug in :func:`read_json` when ``orient="split"`` does not maintain numeric string index (:issue:`28556`)
377377
- :meth:`read_sql` returned an empty generator if ``chunksize`` was non-zero and the query returned no results. Now returns a generator with a single empty dataframe (:issue:`34411`)
378+
- Bug in :func:`read_hdf` returning unexpected records when filtering on categorical string columns using the ``where`` parameter (:issue:`39189`)
378379

379380
Period
380381
^^^^^^

pandas/core/computation/pytables.py

+3-5
Original file line numberDiff line numberDiff line change
@@ -210,12 +210,10 @@ def stringify(value):
210210
return TermValue(int(v), v, kind)
211211
elif meta == "category":
212212
metadata = extract_array(self.metadata, extract_numpy=True)
213-
result = metadata.searchsorted(v, side="left")
214-
215-
# result returns 0 if v is first element or if v is not in metadata
216-
# check that metadata contains v
217-
if not result and v not in metadata:
213+
if v not in metadata:
218214
result = -1
215+
else:
216+
result = metadata.searchsorted(v, side="left")
219217
return TermValue(result, result, "integer")
220218
elif kind == "integer":
221219
v = int(float(v))

pandas/tests/io/pytables/test_categorical.py

+22
Original file line numberDiff line numberDiff line change
@@ -184,3 +184,25 @@ def test_categorical_nan_only_columns(setup_path):
184184
df.to_hdf(path, "df", format="table", data_columns=True)
185185
result = read_hdf(path, "df")
186186
tm.assert_frame_equal(result, expected)
187+
188+
189+
@pytest.mark.parametrize(
    "where, df, expected",
    [
        # "q" is not one of the categories, so no rows may be returned.
        ('col=="q"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": []})),
        # "a" is the first category; exactly that row must be returned.
        ('col=="a"', DataFrame({"col": ["a", "b", "s"]}), DataFrame({"col": ["a"]})),
    ],
)
def test_convert_value(setup_path, where: str, df: DataFrame, expected: DataFrame):
    """Check that ``read_hdf`` filters a categorical column via ``where``.

    Parameters
    ----------
    setup_path
        Fixture value handed to ``ensure_clean_path`` to obtain a scratch file.
    where : str
        Selection expression passed to ``read_hdf``.
    df : DataFrame
        Frame written to the store; its ``col`` column is cast to category.
    expected : DataFrame
        Rows expected back from the filtered read, before the categorical
        dtype is applied.
    """
    # GH39420
    df.col = df.col.astype("category")
    max_widths = {"col": 1}
    categorical_values = sorted(df.col.unique())

    # Give the expected column the same category set as the stored column.
    # The result of ``set_categories`` is assigned back rather than relying on
    # ``inplace=True``, which is deprecated and absent from newer pandas.
    expected.col = expected.col.astype("category")
    expected.col = expected.col.cat.set_categories(categorical_values)

    with ensure_clean_path(setup_path) as path:
        df.to_hdf(path, "df", format="table", min_itemsize=max_widths)
        result = read_hdf(path, where=where)
        tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)