|
1 | 1 | # -*- coding: utf-8 -*-
|
2 | 2 | # pylint: disable=E1101,E1103,W0232
|
3 | 3 |
|
| 4 | +import pytest |
4 | 5 | import sys
|
5 | 6 | from datetime import datetime
|
6 | 7 | from distutils.version import LooseVersion
|
|
17 | 18 | import pandas.compat as compat
|
18 | 19 | import pandas.util.testing as tm
|
19 | 20 | from pandas import (Categorical, Index, Series, DataFrame, PeriodIndex,
|
20 |
| - Timestamp, CategoricalIndex, isnull) |
| 21 | + Timestamp, CategoricalIndex, DatetimeIndex, |
| 22 | + isnull, NaT) |
21 | 23 | from pandas.compat import range, lrange, u, PY3
|
22 | 24 | from pandas.core.config import option_context
|
23 | 25 |
|
@@ -160,12 +162,6 @@ def f():
|
160 | 162 |
|
161 | 163 | self.assertRaises(ValueError, f)
|
162 | 164 |
|
163 |
| - def f(): |
164 |
| - with tm.assert_produces_warning(FutureWarning): |
165 |
| - Categorical([1, 2], [1, 2, np.nan, np.nan]) |
166 |
| - |
167 |
| - self.assertRaises(ValueError, f) |
168 |
| - |
169 | 165 | # The default should be unordered
|
170 | 166 | c1 = Categorical(["a", "b", "c", "a"])
|
171 | 167 | self.assertFalse(c1.ordered)
|
@@ -222,29 +218,12 @@ def f():
|
222 | 218 | cat = pd.Categorical([np.nan, 1., 2., 3.])
|
223 | 219 | self.assertTrue(is_float_dtype(cat.categories))
|
224 | 220 |
|
225 |
| - # Deprecating NaNs in categoires (GH #10748) |
226 |
| - # preserve int as far as possible by converting to object if NaN is in |
227 |
| - # categories |
228 |
| - with tm.assert_produces_warning(FutureWarning): |
229 |
| - cat = pd.Categorical([np.nan, 1, 2, 3], |
230 |
| - categories=[np.nan, 1, 2, 3]) |
231 |
| - self.assertTrue(is_object_dtype(cat.categories)) |
232 |
| - |
233 | 221 | # This doesn't work -> this would probably need some kind of "remember
|
234 | 222 | # the original type" feature to try to cast the array interface result
|
235 | 223 | # to...
|
236 | 224 |
|
237 | 225 | # vals = np.asarray(cat[cat.notnull()])
|
238 | 226 | # self.assertTrue(is_integer_dtype(vals))
|
239 |
| - with tm.assert_produces_warning(FutureWarning): |
240 |
| - cat = pd.Categorical([np.nan, "a", "b", "c"], |
241 |
| - categories=[np.nan, "a", "b", "c"]) |
242 |
| - self.assertTrue(is_object_dtype(cat.categories)) |
243 |
| - # but don't do it for floats |
244 |
| - with tm.assert_produces_warning(FutureWarning): |
245 |
| - cat = pd.Categorical([np.nan, 1., 2., 3.], |
246 |
| - categories=[np.nan, 1., 2., 3.]) |
247 |
| - self.assertTrue(is_float_dtype(cat.categories)) |
248 | 227 |
|
249 | 228 | # corner cases
|
250 | 229 | cat = pd.Categorical([1])
|
@@ -295,6 +274,22 @@ def f():
|
295 | 274 | c = Categorical(np.array([], dtype='int64'), # noqa
|
296 | 275 | categories=[3, 2, 1], ordered=True)
|
297 | 276 |
|
| 277 | + def test_constructor_with_null(self): |
| 278 | + |
| 279 | + # Cannot have NaN in categories |
| 280 | + with pytest.raises(ValueError): |
| 281 | + pd.Categorical([np.nan, "a", "b", "c"], |
| 282 | + categories=[np.nan, "a", "b", "c"]) |
| 283 | + |
| 284 | + with pytest.raises(ValueError): |
| 285 | + pd.Categorical([None, "a", "b", "c"], |
| 286 | + categories=[None, "a", "b", "c"]) |
| 287 | + |
| 288 | + with pytest.raises(ValueError): |
| 289 | + pd.Categorical(DatetimeIndex(['nat', '20160101']), |
| 290 | + categories=[NaT, Timestamp('20160101')]) |
| 291 | + |
| 292 | + |
298 | 293 | def test_constructor_with_index(self):
|
299 | 294 | ci = CategoricalIndex(list('aabbca'), categories=list('cab'))
|
300 | 295 | tm.assert_categorical_equal(ci.values, Categorical(ci))
|
@@ -418,6 +413,12 @@ def f():
|
418 | 413 |
|
419 | 414 | self.assertRaises(ValueError, f)
|
420 | 415 |
|
| 416 | + # NaN categories included |
| 417 | + def f(): |
| 418 | + Categorical.from_codes([0, 1, 2], ["a", "b", np.nan]) |
| 419 | + |
| 420 | + self.assertRaises(ValueError, f) |
| 421 | + |
421 | 422 | # too negative
|
422 | 423 | def f():
|
423 | 424 | Categorical.from_codes([-2, 1, 2], ["a", "b", "c"])
|
@@ -649,30 +650,6 @@ def test_describe(self):
|
649 | 650 | name='categories'))
|
650 | 651 | tm.assert_frame_equal(desc, expected)
|
651 | 652 |
|
652 |
| - # NA as a category |
653 |
| - with tm.assert_produces_warning(FutureWarning): |
654 |
| - cat = pd.Categorical(["a", "c", "c", np.nan], |
655 |
| - categories=["b", "a", "c", np.nan]) |
656 |
| - result = cat.describe() |
657 |
| - |
658 |
| - expected = DataFrame([[0, 0], [1, 0.25], [2, 0.5], [1, 0.25]], |
659 |
| - columns=['counts', 'freqs'], |
660 |
| - index=pd.CategoricalIndex(['b', 'a', 'c', np.nan], |
661 |
| - name='categories')) |
662 |
| - tm.assert_frame_equal(result, expected, check_categorical=False) |
663 |
| - |
664 |
| - # NA as an unused category |
665 |
| - with tm.assert_produces_warning(FutureWarning): |
666 |
| - cat = pd.Categorical(["a", "c", "c"], |
667 |
| - categories=["b", "a", "c", np.nan]) |
668 |
| - result = cat.describe() |
669 |
| - |
670 |
| - exp_idx = pd.CategoricalIndex( |
671 |
| - ['b', 'a', 'c', np.nan], name='categories') |
672 |
| - expected = DataFrame([[0, 0], [1, 1 / 3.], [2, 2 / 3.], [0, 0]], |
673 |
| - columns=['counts', 'freqs'], index=exp_idx) |
674 |
| - tm.assert_frame_equal(result, expected, check_categorical=False) |
675 |
| - |
676 | 653 | def test_print(self):
|
677 | 654 | expected = ["[a, b, b, a, a, c, c, c]",
|
678 | 655 | "Categories (3, object): [a < b < c]"]
|
@@ -1119,90 +1096,18 @@ def test_nan_handling(self):
|
1119 | 1096 | self.assert_numpy_array_equal(c._codes,
|
1120 | 1097 | np.array([0, -1, -1, 0], dtype=np.int8))
|
1121 | 1098 |
|
1122 |
| - # If categories have nan included, the code should point to that |
1123 |
| - # instead |
1124 |
| - with tm.assert_produces_warning(FutureWarning): |
1125 |
| - c = Categorical(["a", "b", np.nan, "a"], |
1126 |
| - categories=["a", "b", np.nan]) |
1127 |
| - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) |
1128 |
| - self.assert_numpy_array_equal(c._codes, |
1129 |
| - np.array([0, 1, 2, 0], dtype=np.int8)) |
1130 |
| - c[1] = np.nan |
1131 |
| - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) |
1132 |
| - self.assert_numpy_array_equal(c._codes, |
1133 |
| - np.array([0, 2, 2, 0], dtype=np.int8)) |
1134 |
| - |
1135 |
| - # Changing categories should also make the replaced category np.nan |
1136 |
| - c = Categorical(["a", "b", "c", "a"]) |
1137 |
| - with tm.assert_produces_warning(FutureWarning): |
1138 |
| - c.categories = ["a", "b", np.nan] # noqa |
1139 |
| - |
1140 |
| - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) |
1141 |
| - self.assert_numpy_array_equal(c._codes, |
1142 |
| - np.array([0, 1, 2, 0], dtype=np.int8)) |
1143 |
| - |
1144 | 1099 | # NaN is never stored as a category; missing values are encoded
|
1145 | 1100 | # with the sentinel code -1
|
1146 | 1101 | c = Categorical(["a", "b", np.nan, "a"])
|
1147 | 1102 | self.assert_index_equal(c.categories, Index(["a", "b"]))
|
1148 | 1103 | self.assert_numpy_array_equal(c._codes,
|
1149 | 1104 | np.array([0, 1, -1, 0], dtype=np.int8))
|
1150 |
| - with tm.assert_produces_warning(FutureWarning): |
1151 |
| - c.set_categories(["a", "b", np.nan], rename=True, inplace=True) |
1152 |
| - |
1153 |
| - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) |
1154 |
| - self.assert_numpy_array_equal(c._codes, |
1155 |
| - np.array([0, 1, -1, 0], dtype=np.int8)) |
1156 |
| - c[1] = np.nan |
1157 |
| - self.assert_index_equal(c.categories, Index(["a", "b", np.nan])) |
1158 |
| - self.assert_numpy_array_equal(c._codes, |
1159 |
| - np.array([0, 2, -1, 0], dtype=np.int8)) |
1160 |
| - |
1161 |
| - # Remove null categories (GH 10156) |
1162 |
| - cases = [([1.0, 2.0, np.nan], [1.0, 2.0]), |
1163 |
| - (['a', 'b', None], ['a', 'b']), |
1164 |
| - ([pd.Timestamp('2012-05-01'), pd.NaT], |
1165 |
| - [pd.Timestamp('2012-05-01')])] |
1166 |
| - |
1167 |
| - null_values = [np.nan, None, pd.NaT] |
1168 |
| - |
1169 |
| - for with_null, without in cases: |
1170 |
| - with tm.assert_produces_warning(FutureWarning): |
1171 |
| - base = Categorical([], with_null) |
1172 |
| - expected = Categorical([], without) |
1173 |
| - |
1174 |
| - for nullval in null_values: |
1175 |
| - result = base.remove_categories(nullval) |
1176 |
| - self.assert_categorical_equal(result, expected) |
1177 |
| - |
1178 |
| - # Different null values are indistinguishable |
1179 |
| - for i, j in [(0, 1), (0, 2), (1, 2)]: |
1180 |
| - nulls = [null_values[i], null_values[j]] |
1181 |
| - |
1182 |
| - def f(): |
1183 |
| - with tm.assert_produces_warning(FutureWarning): |
1184 |
| - Categorical([], categories=nulls) |
1185 |
| - |
1186 |
| - self.assertRaises(ValueError, f) |
1187 | 1105 |
|
1188 | 1106 | def test_isnull(self):
|
1189 | 1107 | exp = np.array([False, False, True])
|
1190 | 1108 | c = Categorical(["a", "b", np.nan])
|
1191 | 1109 | res = c.isnull()
|
1192 |
| - self.assert_numpy_array_equal(res, exp) |
1193 | 1110 |
|
1194 |
| - with tm.assert_produces_warning(FutureWarning): |
1195 |
| - c = Categorical(["a", "b", np.nan], categories=["a", "b", np.nan]) |
1196 |
| - res = c.isnull() |
1197 |
| - self.assert_numpy_array_equal(res, exp) |
1198 |
| - |
1199 |
| - # test both nan in categories and as -1 |
1200 |
| - exp = np.array([True, False, True]) |
1201 |
| - c = Categorical(["a", "b", np.nan]) |
1202 |
| - with tm.assert_produces_warning(FutureWarning): |
1203 |
| - c.set_categories(["a", "b", np.nan], rename=True, inplace=True) |
1204 |
| - c[0] = np.nan |
1205 |
| - res = c.isnull() |
1206 | 1111 | self.assert_numpy_array_equal(res, exp)
|
1207 | 1112 |
|
1208 | 1113 | def test_codes_immutable(self):
|
@@ -1487,45 +1392,10 @@ def test_slicing_directly(self):
|
1487 | 1392 |
|
1488 | 1393 | def test_set_item_nan(self):
|
1489 | 1394 | cat = pd.Categorical([1, 2, 3])
|
1490 |
| - exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3]) |
1491 | 1395 | cat[1] = np.nan
|
1492 |
| - tm.assert_categorical_equal(cat, exp) |
1493 | 1396 |
|
1494 |
| - # if nan in categories, the proper code should be set! |
1495 |
| - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) |
1496 |
| - with tm.assert_produces_warning(FutureWarning): |
1497 |
| - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) |
1498 |
| - cat[1] = np.nan |
1499 |
| - exp = np.array([0, 3, 2, -1], dtype=np.int8) |
1500 |
| - self.assert_numpy_array_equal(cat.codes, exp) |
1501 |
| - |
1502 |
| - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) |
1503 |
| - with tm.assert_produces_warning(FutureWarning): |
1504 |
| - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) |
1505 |
| - cat[1:3] = np.nan |
1506 |
| - exp = np.array([0, 3, 3, -1], dtype=np.int8) |
1507 |
| - self.assert_numpy_array_equal(cat.codes, exp) |
1508 |
| - |
1509 |
| - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) |
1510 |
| - with tm.assert_produces_warning(FutureWarning): |
1511 |
| - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) |
1512 |
| - cat[1:3] = [np.nan, 1] |
1513 |
| - exp = np.array([0, 3, 0, -1], dtype=np.int8) |
1514 |
| - self.assert_numpy_array_equal(cat.codes, exp) |
1515 |
| - |
1516 |
| - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) |
1517 |
| - with tm.assert_produces_warning(FutureWarning): |
1518 |
| - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) |
1519 |
| - cat[1:3] = [np.nan, np.nan] |
1520 |
| - exp = np.array([0, 3, 3, -1], dtype=np.int8) |
1521 |
| - self.assert_numpy_array_equal(cat.codes, exp) |
1522 |
| - |
1523 |
| - cat = pd.Categorical([1, 2, np.nan, 3], categories=[1, 2, 3]) |
1524 |
| - with tm.assert_produces_warning(FutureWarning): |
1525 |
| - cat.set_categories([1, 2, 3, np.nan], rename=True, inplace=True) |
1526 |
| - cat[pd.isnull(cat)] = np.nan |
1527 |
| - exp = np.array([0, 1, 3, 2], dtype=np.int8) |
1528 |
| - self.assert_numpy_array_equal(cat.codes, exp) |
| 1397 | + exp = pd.Categorical([1, np.nan, 3], categories=[1, 2, 3]) |
| 1398 | + tm.assert_categorical_equal(cat, exp) |
1529 | 1399 |
|
1530 | 1400 | def test_shift(self):
|
1531 | 1401 | # GH 9416
|
@@ -2026,33 +1896,12 @@ def test_sideeffects_free(self):
|
2026 | 1896 |
|
2027 | 1897 | def test_nan_handling(self):
|
2028 | 1898 |
|
2029 |
| - # Nans are represented as -1 in labels |
| 1899 | + # NaNs are represented as -1 in labels |
2030 | 1900 | s = Series(Categorical(["a", "b", np.nan, "a"]))
|
2031 | 1901 | self.assert_index_equal(s.cat.categories, Index(["a", "b"]))
|
2032 | 1902 | self.assert_numpy_array_equal(s.values.codes,
|
2033 | 1903 | np.array([0, 1, -1, 0], dtype=np.int8))
|
2034 | 1904 |
|
2035 |
| - # If categories have nan included, the label should point to that |
2036 |
| - # instead |
2037 |
| - with tm.assert_produces_warning(FutureWarning): |
2038 |
| - s2 = Series(Categorical(["a", "b", np.nan, "a"], |
2039 |
| - categories=["a", "b", np.nan])) |
2040 |
| - |
2041 |
| - exp_cat = Index(["a", "b", np.nan]) |
2042 |
| - self.assert_index_equal(s2.cat.categories, exp_cat) |
2043 |
| - self.assert_numpy_array_equal(s2.values.codes, |
2044 |
| - np.array([0, 1, 2, 0], dtype=np.int8)) |
2045 |
| - |
2046 |
| - # Changing categories should also make the replaced category np.nan |
2047 |
| - s3 = Series(Categorical(["a", "b", "c", "a"])) |
2048 |
| - with tm.assert_produces_warning(FutureWarning, check_stacklevel=False): |
2049 |
| - s3.cat.categories = ["a", "b", np.nan] |
2050 |
| - |
2051 |
| - exp_cat = Index(["a", "b", np.nan]) |
2052 |
| - self.assert_index_equal(s3.cat.categories, exp_cat) |
2053 |
| - self.assert_numpy_array_equal(s3.values.codes, |
2054 |
| - np.array([0, 1, 2, 0], dtype=np.int8)) |
2055 |
| - |
2056 | 1905 | def test_cat_accessor(self):
|
2057 | 1906 | s = Series(Categorical(["a", "b", np.nan, "a"]))
|
2058 | 1907 | self.assert_index_equal(s.cat.categories, Index(["a", "b"]))
|
|
0 commit comments