|
| 1 | +import numpy as np |
| 2 | +import pytest |
| 3 | + |
| 4 | +from pandas import Categorical, DataFrame, Series, _testing as tm, concat, read_hdf |
| 5 | +from pandas.tests.io.pytables.common import ( |
| 6 | + _maybe_remove, |
| 7 | + ensure_clean_path, |
| 8 | + ensure_clean_store, |
| 9 | +) |
| 10 | + |
| 11 | +pytestmark = pytest.mark.single |
| 12 | + |
| 13 | + |
def test_categorical(setup_path):
    """Exercise categorical data stored in an HDF store.

    Covers, in order: round-tripping categorical Series and DataFrames,
    the metadata entries listed by ``store.info()``, ``where`` queries on a
    categorical data column, appending frames with matching and mismatching
    categories, and removal of the per-column metadata node together with
    its parent key.
    """

    def make_letters(ordered):
        # Four declared categories, of which "d" never appears in the values.
        return Series(
            Categorical(
                ["a", "b", "b", "a", "a", "c"],
                categories=["a", "b", "c", "d"],
                ordered=ordered,
            )
        )

    with ensure_clean_store(setup_path) as store:

        # Basic round trip, unordered categorical
        _maybe_remove(store, "s")
        ser = make_letters(ordered=False)
        store.append("s", ser, format="table")
        tm.assert_series_equal(ser, store.select("s"))

        # Same data, ordered categorical
        _maybe_remove(store, "s_ordered")
        ser = make_letters(ordered=True)
        store.append("s_ordered", ser, format="table")
        tm.assert_series_equal(ser, store.select("s_ordered"))

        # Frame holding the ordered categorical next to a plain int column
        _maybe_remove(store, "df")
        frame = DataFrame({"s": ser, "vals": [1, 2, 3, 4, 5, 6]})
        store.append("df", frame, format="table")
        tm.assert_frame_equal(store.select("df"), frame)

        # Dtypes: integer-valued categories
        _maybe_remove(store, "si")
        ser = Series([1, 1, 2, 2, 3, 4, 5]).astype("category")
        store.append("si", ser)
        tm.assert_series_equal(store.select("si"), ser)

        # Dtypes: numeric categories with a missing value
        _maybe_remove(store, "si2")
        ser = Series([1, 1, np.nan, 2, 3, 4, 5]).astype("category")
        store.append("si2", ser)
        tm.assert_series_equal(store.select("si2"), ser)

        # Multiple categorical columns in one frame
        _maybe_remove(store, "df2")
        wide = frame.copy()
        wide["s2"] = Series(list("abcdefg")).astype("category")
        store.append("df2", wide)
        tm.assert_frame_equal(store.select("df2"), wide)

        # The store listing must mention the key and its category metadata.
        # (The analogous check for values_block_0 was disabled in the
        # original test:
        #   assert '/df2/meta/values_block_0/meta' in info)
        listing = store.info()
        assert "/df2 " in listing
        assert "/df2/meta/values_block_1/meta" in listing

        # Unordered categorical under a fresh key
        _maybe_remove(store, "s2")
        ser = make_letters(ordered=False)
        store.append("s2", ser, format="table")
        tm.assert_series_equal(store.select("s2"), ser)

        # Query on the categorical data column. "d" is a declared category
        # with no rows and "f" is not a category at all; the expectation is
        # computed with isin() in every case.
        _maybe_remove(store, "df3")
        store.append("df3", frame, data_columns=["s"])
        queries = [
            ('s in ["b","c"]', ["b", "c"]),
            ('s = ["b","c"]', ["b", "c"]),
            ('s in ["d"]', ["d"]),
            ('s in ["f"]', ["f"]),
        ]
        for clause, wanted in queries:
            expected = frame[frame.s.isin(wanted)]
            result = store.select("df3", where=[clause])
            tm.assert_frame_equal(result, expected)

        # Appending with same categories is ok
        store.append("df3", frame)

        frame = concat([frame, frame])
        expected = frame[frame.s.isin(["b", "c"])]
        result = store.select("df3", where=['s in ["b","c"]'])
        tm.assert_frame_equal(result, expected)

        # Appending must have the same categories: dropping the unused "d"
        # category makes the frame incompatible with what is stored.
        mismatched = frame.copy()
        mismatched["s"] = mismatched["s"].cat.remove_unused_categories()

        msg = "cannot append a categorical with different categories to the existing"
        with pytest.raises(ValueError, match=msg):
            store.append("df3", mismatched)

        # Remove, and make sure the metadata is removed as well (the removal
        # is recursive, so it should be).
        assert store.select("df3/meta/s/meta") is not None
        store.remove("df3")

        with pytest.raises(
            KeyError, match="'No object named df3/meta/s/meta in the file'"
        ):
            store.select("df3/meta/s/meta")
| 134 | + |
| 135 | + |
def test_categorical_conversion(setup_path):
    """Check that an unmatched ``where`` yields an empty frame (GH13322).

    ``read_hdf`` must not return rows when the where criteria isn't met,
    both for plain string columns and after those columns have been
    converted to categoricals.
    """

    def assert_no_rows_selected(frame):
        # We expect an empty DataFrame whose dtypes match those of `frame`.
        expected = frame.iloc[[], :]
        with ensure_clean_path(setup_path) as path:
            frame.to_hdf(path, "df", format="table", data_columns=True)
            result = read_hdf(path, "df", where="obsids=B")
            tm.assert_frame_equal(result, expected)

    df = DataFrame(
        {
            "obsids": ["ESP_012345_6789", "ESP_987654_3210"],
            "imgids": ["APF00006np", "APF0001imm"],
            "data": [4.3, 9.8],
        }
    )

    # Test without categories
    assert_no_rows_selected(df)

    # Test with categories
    df["obsids"] = df["obsids"].astype("category")
    df["imgids"] = df["imgids"].astype("category")
    assert_no_rows_selected(df)
| 165 | + |
| 166 | + |
def test_categorical_nan_only_columns(setup_path):
    """Check that NaN-only categorical columns can be read back (GH18413).

    The frame is written in table format with every column as a data column
    and the result of ``read_hdf`` is compared against the original frame.
    """
    columns = {
        "a": ["a", "b", "c", np.nan],
        "b": [np.nan, np.nan, np.nan, np.nan],
        "c": [1, 2, 3, 4],
        "d": Series([None] * 4, dtype=object),
    }
    df = DataFrame(columns)

    for name in ("a", "b"):
        df[name] = df[name].astype("category")
    # NOTE(review): "d" is rebuilt from column "b", not from its own
    # object-dtype values -- possibly a copy/paste slip; confirm the intent
    # before changing it, since it affects the expected category dtype.
    df["d"] = df["b"].astype("category")

    expected = df
    with ensure_clean_path(setup_path) as path:
        df.to_hdf(path, "df", format="table", data_columns=True)
        result = read_hdf(path, "df")
        tm.assert_frame_equal(result, expected)
0 commit comments