|
| 1 | +import numpy as np |
| 2 | +import pytest |
| 3 | + |
| 4 | +from pandas.core.dtypes.dtypes import CategoricalDtype |
| 5 | + |
| 6 | +import pandas as pd |
| 7 | +from pandas import Categorical, DataFrame, Index, Series |
| 8 | +import pandas.util.testing as tm |
| 9 | + |
| 10 | + |
class TestDataFrameIndexingCategorical:
    """Indexing and assignment tests for DataFrames with categorical data."""

    def test_assignment(self):
        """Assign a Categorical and a categorical Series as new columns.

        Verifies the resulting column dtypes and that the stored values equal
        what was assigned.
        """
        df = DataFrame(
            {"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")}
        )
        labels = Categorical(
            ["{0} - {1}".format(i, i + 499) for i in range(0, 10000, 500)]
        )

        df = df.sort_values(by=["value"], ascending=True)
        s = pd.cut(df.value, range(0, 10500, 500), right=False, labels=labels)
        d = s.values
        df["D"] = d
        # Result is discarded; this only checks that rendering does not raise.
        str(df)

        result = df.dtypes
        expected = Series(
            [np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)],
            index=["value", "D"],
        )
        tm.assert_series_equal(result, expected)

        df["E"] = s
        str(df)

        result = df.dtypes
        expected = Series(
            [
                np.dtype("int32"),
                CategoricalDtype(categories=labels, ordered=False),
                CategoricalDtype(categories=labels, ordered=False),
            ],
            index=["value", "D", "E"],
        )
        tm.assert_series_equal(result, expected)

        result1 = df["D"]
        result2 = df["E"]
        # Use the public accessor rather than the private block manager; for a
        # categorical column ``.values`` is the underlying Categorical.
        tm.assert_categorical_equal(result1.values, d)

        # sorting
        s.name = "E"
        tm.assert_series_equal(result2.sort_index(), s.sort_index())

        # Result is unused; kept only as a construction check for a frame
        # built from a categorical Series.
        DataFrame(Series(Categorical([1, 2, 3, 10], categories=[1, 2, 3, 4, 10])))

    def test_assigning_ops(self):
        """Exercise every setter (iloc/loc/iat/at/boolean mask) on a categorical.

        For each slicing op, and for a value inside and outside the categories:

        - assign a single value                 -> exp_single_cats_value
        - assign a complete row (mixed values)  -> exp_single_row
        - assign multiple rows (mixed values)   -> exp_multi_row
        - assign part of a column from a Categorical      -> exp_parts_cats_col
        - assign part of a column from a non-categorical  -> exp_parts_cats_col

        Assigning a value that is not among the column's categories is
        expected to raise ``ValueError``.
        """
        # All frames below share the same (immutable) index.
        idx = Index(["h", "i", "j", "k", "l", "m", "n"])

        cats = Categorical(["a", "a", "a", "a", "a", "a", "a"], categories=["a", "b"])
        values = [1, 1, 1, 1, 1, 1, 1]
        orig = DataFrame({"cats": cats, "values": values}, index=idx)

        # the expected values
        # changed single row
        cats1 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
        values1 = [1, 1, 2, 1, 1, 1, 1]
        exp_single_row = DataFrame({"cats": cats1, "values": values1}, index=idx)

        # changed multiple rows
        cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
        values2 = [1, 1, 2, 2, 1, 1, 1]
        exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx)

        # changed part of the cats column
        cats3 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
        values3 = [1, 1, 1, 1, 1, 1, 1]
        exp_parts_cats_col = DataFrame({"cats": cats3, "values": values3}, index=idx)

        # changed single value in cats col
        cats4 = Categorical(["a", "a", "b", "a", "a", "a", "a"], categories=["a", "b"])
        values4 = [1, 1, 1, 1, 1, 1, 1]
        exp_single_cats_value = DataFrame(
            {"cats": cats4, "values": values4}, index=idx
        )

        # NOTE: in the failure cases below the frame is copied *before*
        # entering ``pytest.raises`` so that only the assignment itself is
        # expected to raise.

        # iloc
        # ###############
        # - assign a single value -> exp_single_cats_value
        df = orig.copy()
        df.iloc[2, 0] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        df = orig.copy()
        df.iloc[df.index == "j", 0] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        # - assign a single value not in the current categories set
        df = orig.copy()
        with pytest.raises(ValueError):
            df.iloc[2, 0] = "c"

        # - assign a complete row (mixed values) -> exp_single_row
        df = orig.copy()
        df.iloc[2, :] = ["b", 2]
        tm.assert_frame_equal(df, exp_single_row)

        # - assign a complete row (mixed values) not in categories set
        df = orig.copy()
        with pytest.raises(ValueError):
            df.iloc[2, :] = ["c", 2]

        # - assign multiple rows (mixed values) -> exp_multi_row
        df = orig.copy()
        df.iloc[2:4, :] = [["b", 2], ["b", 2]]
        tm.assert_frame_equal(df, exp_multi_row)

        df = orig.copy()
        with pytest.raises(ValueError):
            df.iloc[2:4, :] = [["c", 2], ["c", 2]]

        # - assign a part of a column with dtype == categorical ->
        #   exp_parts_cats_col
        df = orig.copy()
        df.iloc[2:4, 0] = Categorical(["b", "b"], categories=["a", "b"])
        tm.assert_frame_equal(df, exp_parts_cats_col)

        # different categories -> not sure if this should fail or pass
        df = orig.copy()
        with pytest.raises(ValueError):
            df.iloc[2:4, 0] = Categorical(list("bb"), categories=list("abc"))

        # different values
        df = orig.copy()
        with pytest.raises(ValueError):
            df.iloc[2:4, 0] = Categorical(list("cc"), categories=list("abc"))

        # - assign a part of a column with dtype != categorical ->
        #   exp_parts_cats_col
        df = orig.copy()
        df.iloc[2:4, 0] = ["b", "b"]
        tm.assert_frame_equal(df, exp_parts_cats_col)

        # deliberately reuses the frame modified just above
        with pytest.raises(ValueError):
            df.iloc[2:4, 0] = ["c", "c"]

        # loc (column selected by label)
        # ##############
        # - assign a single value -> exp_single_cats_value
        df = orig.copy()
        df.loc["j", "cats"] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        df = orig.copy()
        df.loc[df.index == "j", "cats"] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        # - assign a single value not in the current categories set
        df = orig.copy()
        with pytest.raises(ValueError):
            df.loc["j", "cats"] = "c"

        # - assign a complete row (mixed values) -> exp_single_row
        df = orig.copy()
        df.loc["j", :] = ["b", 2]
        tm.assert_frame_equal(df, exp_single_row)

        # - assign a complete row (mixed values) not in categories set
        df = orig.copy()
        with pytest.raises(ValueError):
            df.loc["j", :] = ["c", 2]

        # - assign multiple rows (mixed values) -> exp_multi_row
        df = orig.copy()
        df.loc["j":"k", :] = [["b", 2], ["b", 2]]
        tm.assert_frame_equal(df, exp_multi_row)

        df = orig.copy()
        with pytest.raises(ValueError):
            df.loc["j":"k", :] = [["c", 2], ["c", 2]]

        # - assign a part of a column with dtype == categorical ->
        #   exp_parts_cats_col
        df = orig.copy()
        df.loc["j":"k", "cats"] = Categorical(["b", "b"], categories=["a", "b"])
        tm.assert_frame_equal(df, exp_parts_cats_col)

        # different categories -> not sure if this should fail or pass
        df = orig.copy()
        with pytest.raises(ValueError):
            df.loc["j":"k", "cats"] = Categorical(
                ["b", "b"], categories=["a", "b", "c"]
            )

        # different values
        df = orig.copy()
        with pytest.raises(ValueError):
            df.loc["j":"k", "cats"] = Categorical(
                ["c", "c"], categories=["a", "b", "c"]
            )

        # - assign a part of a column with dtype != categorical ->
        #   exp_parts_cats_col
        df = orig.copy()
        df.loc["j":"k", "cats"] = ["b", "b"]
        tm.assert_frame_equal(df, exp_parts_cats_col)

        # deliberately reuses the frame modified just above
        with pytest.raises(ValueError):
            df.loc["j":"k", "cats"] = ["c", "c"]

        # loc (column selected through df.columns[0])
        # ##############
        # - assign a single value -> exp_single_cats_value
        df = orig.copy()
        df.loc["j", df.columns[0]] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        df = orig.copy()
        df.loc[df.index == "j", df.columns[0]] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        # - assign a single value not in the current categories set
        df = orig.copy()
        with pytest.raises(ValueError):
            df.loc["j", df.columns[0]] = "c"

        # - assign a complete row (mixed values) -> exp_single_row
        df = orig.copy()
        df.loc["j", :] = ["b", 2]
        tm.assert_frame_equal(df, exp_single_row)

        # - assign a complete row (mixed values) not in categories set
        df = orig.copy()
        with pytest.raises(ValueError):
            df.loc["j", :] = ["c", 2]

        # - assign multiple rows (mixed values) -> exp_multi_row
        df = orig.copy()
        df.loc["j":"k", :] = [["b", 2], ["b", 2]]
        tm.assert_frame_equal(df, exp_multi_row)

        df = orig.copy()
        with pytest.raises(ValueError):
            df.loc["j":"k", :] = [["c", 2], ["c", 2]]

        # - assign a part of a column with dtype == categorical ->
        #   exp_parts_cats_col
        df = orig.copy()
        df.loc["j":"k", df.columns[0]] = Categorical(["b", "b"], categories=["a", "b"])
        tm.assert_frame_equal(df, exp_parts_cats_col)

        # different categories -> not sure if this should fail or pass
        df = orig.copy()
        with pytest.raises(ValueError):
            df.loc["j":"k", df.columns[0]] = Categorical(
                ["b", "b"], categories=["a", "b", "c"]
            )

        # different values
        df = orig.copy()
        with pytest.raises(ValueError):
            df.loc["j":"k", df.columns[0]] = Categorical(
                ["c", "c"], categories=["a", "b", "c"]
            )

        # - assign a part of a column with dtype != categorical ->
        #   exp_parts_cats_col
        df = orig.copy()
        df.loc["j":"k", df.columns[0]] = ["b", "b"]
        tm.assert_frame_equal(df, exp_parts_cats_col)

        # deliberately reuses the frame modified just above
        with pytest.raises(ValueError):
            df.loc["j":"k", df.columns[0]] = ["c", "c"]

        # iat
        # - assign a single value -> exp_single_cats_value
        df = orig.copy()
        df.iat[2, 0] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        # - assign a single value not in the current categories set
        df = orig.copy()
        with pytest.raises(ValueError):
            df.iat[2, 0] = "c"

        # at
        # - assign a single value -> exp_single_cats_value
        df = orig.copy()
        df.at["j", "cats"] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        # - assign a single value not in the current categories set
        df = orig.copy()
        with pytest.raises(ValueError):
            df.at["j", "cats"] = "c"

        # fancy indexing (boolean row mask)
        catsf = Categorical(
            ["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"]
        )
        valuesf = [1, 1, 3, 3, 1, 1, 1]
        df = DataFrame({"cats": catsf, "values": valuesf}, index=idx)

        exp_fancy = exp_multi_row.copy()
        # Reassign instead of mutating through ``inplace=True``: the in-place
        # form is deprecated and depends on the column accessor returning a
        # cached object rather than a copy.
        exp_fancy["cats"] = exp_fancy["cats"].cat.set_categories(["a", "b", "c"])

        df[df["cats"] == "c"] = ["b", 2]
        # category c is kept in .categories
        tm.assert_frame_equal(df, exp_fancy)

        # scalar setter once more; this section was labelled "set_value" but
        # the code exercises ``.at``
        df = orig.copy()
        df.at["j", "cats"] = "b"
        tm.assert_frame_equal(df, exp_single_cats_value)

        df = orig.copy()
        with pytest.raises(ValueError):
            df.at["j", "cats"] = "c"

        # Assigning a Category to parts of a int/... column uses the values of
        # the Categorical
        df = DataFrame({"a": [1, 1, 1, 1, 1], "b": list("aaaaa")})
        exp = DataFrame({"a": [1, "b", "b", 1, 1], "b": list("aabba")})
        df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
        df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
        tm.assert_frame_equal(df, exp)

    def test_functions_no_warnings(self):
        """Assigning the result of ``pd.cut`` as a column emits no warning."""
        df = DataFrame({"value": np.random.randint(0, 100, 20)})
        labels = ["{0} - {1}".format(i, i + 9) for i in range(0, 100, 10)]
        with tm.assert_produces_warning(False):
            df["group"] = pd.cut(
                df.value, range(0, 105, 10), right=False, labels=labels
            )

    def test_loc_indexing_preserves_index_category_dtype(self):
        """``.loc`` on a MultiIndex keeps a CategoricalIndex level categorical."""
        # GH 15166
        df = DataFrame(
            data=np.arange(2, 22, 2),
            index=pd.MultiIndex(
                levels=[pd.CategoricalIndex(["a", "b"]), range(10)],
                codes=[[0] * 5 + [1] * 5, range(10)],
                names=["Index1", "Index2"],
            ),
        )

        expected = pd.CategoricalIndex(
            ["a", "b"],
            categories=["a", "b"],
            ordered=False,
            name="Index1",
            dtype="category",
        )

        # level dtype before any indexing
        result = df.index.levels[0]
        tm.assert_index_equal(result, expected)

        # level dtype after list-based .loc selection
        result = df.loc[["a"]].index.levels[0]
        tm.assert_index_equal(result, expected)

    def test_wrong_length_cat_dtype_raises(self):
        """Assigning a 6-element Categorical to a 10-row frame raises."""
        # GH29523
        cat = pd.Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"])
        df = pd.DataFrame({"bar": range(10)})
        err = "Length of values does not match length of index"
        with pytest.raises(ValueError, match=err):
            df["foo"] = cat
0 commit comments