Skip to content

Commit 2db1cc0

Browse files
ssche authored and jreback committed
BUG: Categorical data fails to load from hdf when all columns are NaN (#18652)
1 parent 1c1f507 commit 2db1cc0

File tree

4 files changed

+32
-4
lines changed

4 files changed

+32
-4
lines changed

.gitignore

+1
Original file line numberDiff line numberDiff line change
@@ -106,3 +106,4 @@ doc/build/html/index.html
106106
doc/tmp.sv
107107
doc/source/styled.xlsx
108108
doc/source/templates/
109+
env/

doc/source/whatsnew/v0.21.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ I/O
126126
- Bug in :func:`read_json` not decoding when reading line delimited JSON from S3 (:issue:`17200`)
127127
- Bug in :func:`pandas.io.json.json_normalize` to avoid modification of ``meta`` (:issue:`18610`)
128128
- Bug in :func:`to_latex` where repeated multi-index values were not printed even though a higher level index differed from the previous row (:issue:`14484`)
129+
- Bug when reading NaN-only categorical columns in :class:`HDFStore` (:issue:`18413`)
129130

130131
Plotting
131132
^^^^^^^^

pandas/io/pytables.py

+11-4
Original file line numberDiff line numberDiff line change
@@ -2137,10 +2137,17 @@ def convert(self, values, nan_rep, encoding):
21372137
# if we have stored a NaN in the categories
21382138
# then strip it; in theory we could have BOTH
21392139
# -1s in the codes and nulls :<
2140-
mask = isna(categories)
2141-
if mask.any():
2142-
categories = categories[~mask]
2143-
codes[codes != -1] -= mask.astype(int).cumsum().values
2140+
if categories is None:
2141+
# Handle case of NaN-only categorical columns in which case
2142+
# the categories are an empty array; when this is stored,
2143+
# pytables cannot write a zero-len array, so on readback
2144+
# the categories would be None and `read_hdf()` would fail.
2145+
categories = Index([], dtype=np.float64)
2146+
else:
2147+
mask = isna(categories)
2148+
if mask.any():
2149+
categories = categories[~mask]
2150+
codes[codes != -1] -= mask.astype(int).cumsum().values
21442151

21452152
self.data = Categorical.from_codes(codes,
21462153
categories=categories,

pandas/tests/io/test_pytables.py

+19
Original file line numberDiff line numberDiff line change
@@ -4928,6 +4928,25 @@ def test_categorical_conversion(self):
49284928
result = read_hdf(path, 'df', where='obsids=B')
49294929
tm.assert_frame_equal(result, expected)
49304930

4931+
def test_categorical_nan_only_columns(self):
4932+
# GH18413
4933+
# Check that read_hdf with categorical columns with NaN-only values can
4934+
# be read back.
4935+
df = pd.DataFrame({
4936+
'a': ['a', 'b', 'c', np.nan],
4937+
'b': [np.nan, np.nan, np.nan, np.nan],
4938+
'c': [1, 2, 3, 4],
4939+
'd': pd.Series([None] * 4, dtype=object)
4940+
})
4941+
df['a'] = df.a.astype('category')
4942+
df['b'] = df.b.astype('category')
4943+
df['d'] = df.b.astype('category')
4944+
expected = df
4945+
with ensure_clean_path(self.path) as path:
4946+
df.to_hdf(path, 'df', format='table', data_columns=True)
4947+
result = read_hdf(path, 'df')
4948+
tm.assert_frame_equal(result, expected)
4949+
49314950
def test_duplicate_column_name(self):
49324951
df = DataFrame(columns=["a", "a"], data=[[0, 0]])
49334952

0 commit comments

Comments
 (0)