Skip to content
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.

Commit c8886dc

Browse files
committed Nov 26, 2017
Split test_categorical into subpackage (#18497)
1 parent 674fb96 commit c8886dc

13 files changed

+4962
-4831
lines changed
 

‎pandas/tests/categorical/__init__.py

Whitespace-only changes.

‎pandas/tests/categorical/test_api.py

Lines changed: 1679 additions & 0 deletions
Large diffs are not rendered by default.

‎pandas/tests/categorical/test_constructors.py

Lines changed: 625 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import pytest
4+
5+
import pandas as pd
6+
import pandas.util.testing as tm
7+
from pandas.core.dtypes.dtypes import CategoricalDtype
8+
from pandas import (Categorical, Index, Series, DataFrame, CategoricalIndex)
9+
10+
11+
class TestCategoricalDtypes(object):
12+
13+
def test_is_equal_dtype(self):
14+
15+
# test dtype comparisons between cats
16+
17+
c1 = Categorical(list('aabca'), categories=list('abc'), ordered=False)
18+
c2 = Categorical(list('aabca'), categories=list('cab'), ordered=False)
19+
c3 = Categorical(list('aabca'), categories=list('cab'), ordered=True)
20+
assert c1.is_dtype_equal(c1)
21+
assert c2.is_dtype_equal(c2)
22+
assert c3.is_dtype_equal(c3)
23+
assert c1.is_dtype_equal(c2)
24+
assert not c1.is_dtype_equal(c3)
25+
assert not c1.is_dtype_equal(Index(list('aabca')))
26+
assert not c1.is_dtype_equal(c1.astype(object))
27+
assert c1.is_dtype_equal(CategoricalIndex(c1))
28+
assert (c1.is_dtype_equal(
29+
CategoricalIndex(c1, categories=list('cab'))))
30+
assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True))
31+
32+
def test_set_dtype_same(self):
33+
c = Categorical(['a', 'b', 'c'])
34+
result = c._set_dtype(CategoricalDtype(['a', 'b', 'c']))
35+
tm.assert_categorical_equal(result, c)
36+
37+
def test_set_dtype_new_categories(self):
38+
c = Categorical(['a', 'b', 'c'])
39+
result = c._set_dtype(CategoricalDtype(list('abcd')))
40+
tm.assert_numpy_array_equal(result.codes, c.codes)
41+
tm.assert_index_equal(result.dtype.categories, Index(list('abcd')))
42+
43+
@pytest.mark.parametrize('values, categories, new_categories', [
44+
# No NaNs, same cats, same order
45+
(['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
46+
# No NaNs, same cats, different order
47+
(['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
48+
# Same, unsorted
49+
(['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
50+
# No NaNs, same cats, different order
51+
(['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
52+
# NaNs
53+
(['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
54+
(['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
55+
(['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
56+
(['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
57+
# Introduce NaNs
58+
(['a', 'b', 'c'], ['a', 'b'], ['a']),
59+
(['a', 'b', 'c'], ['a', 'b'], ['b']),
60+
(['b', 'a', 'c'], ['a', 'b'], ['a']),
61+
(['b', 'a', 'c'], ['a', 'b'], ['a']),
62+
# No overlap
63+
(['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
64+
])
65+
@pytest.mark.parametrize('ordered', [True, False])
66+
def test_set_dtype_many(self, values, categories, new_categories,
67+
ordered):
68+
c = Categorical(values, categories)
69+
expected = Categorical(values, new_categories, ordered)
70+
result = c._set_dtype(expected.dtype)
71+
tm.assert_categorical_equal(result, expected)
72+
73+
def test_set_dtype_no_overlap(self):
74+
c = Categorical(['a', 'b', 'c'], ['d', 'e'])
75+
result = c._set_dtype(CategoricalDtype(['a', 'b']))
76+
expected = Categorical([None, None, None], categories=['a', 'b'])
77+
tm.assert_categorical_equal(result, expected)
78+
79+
80+
class TestCategoricalBlockDtypes(object):
81+
82+
def test_dtypes(self):
83+
84+
# GH8143
85+
index = ['cat', 'obj', 'num']
86+
cat = Categorical(['a', 'b', 'c'])
87+
obj = Series(['a', 'b', 'c'])
88+
num = Series([1, 2, 3])
89+
df = pd.concat([Series(cat), obj, num], axis=1, keys=index)
90+
91+
result = df.dtypes == 'object'
92+
expected = Series([False, True, False], index=index)
93+
tm.assert_series_equal(result, expected)
94+
95+
result = df.dtypes == 'int64'
96+
expected = Series([False, False, True], index=index)
97+
tm.assert_series_equal(result, expected)
98+
99+
result = df.dtypes == 'category'
100+
expected = Series([True, False, False], index=index)
101+
tm.assert_series_equal(result, expected)
102+
103+
def test_codes_dtypes(self):
104+
105+
# GH 8453
106+
result = Categorical(['foo', 'bar', 'baz'])
107+
assert result.codes.dtype == 'int8'
108+
109+
result = Categorical(['foo%05d' % i for i in range(400)])
110+
assert result.codes.dtype == 'int16'
111+
112+
result = Categorical(['foo%05d' % i for i in range(40000)])
113+
assert result.codes.dtype == 'int32'
114+
115+
# adding cats
116+
result = Categorical(['foo', 'bar', 'baz'])
117+
assert result.codes.dtype == 'int8'
118+
result = result.add_categories(['foo%05d' % i for i in range(400)])
119+
assert result.codes.dtype == 'int16'
120+
121+
# removing cats
122+
result = result.remove_categories(['foo%05d' % i for i in range(300)])
123+
assert result.codes.dtype == 'int8'
124+
125+
@pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']])
126+
def test_empty_astype(self, columns):
127+
# GH 18004
128+
msg = '> 1 ndim Categorical are not supported at this time'
129+
with tm.assert_raises_regex(NotImplementedError, msg):
130+
DataFrame(columns=columns).astype('category')
Lines changed: 396 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,396 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import pytest
4+
from distutils.version import LooseVersion
5+
6+
import numpy as np
7+
8+
import pandas as pd
9+
import pandas.util.testing as tm
10+
from pandas import (Categorical, Index, Series, DataFrame, CategoricalIndex)
11+
from pandas.core.dtypes.dtypes import CategoricalDtype
12+
13+
14+
class TestCategoricalGeneric(object):
15+
16+
def setup_method(self, method):
17+
self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
18+
ordered=True)
19+
20+
def test_categories_none(self):
21+
factor = Categorical(['a', 'b', 'b', 'a',
22+
'a', 'c', 'c', 'c'], ordered=True)
23+
tm.assert_categorical_equal(factor, self.factor)
24+
25+
def test_describe(self):
26+
# string type
27+
desc = self.factor.describe()
28+
assert self.factor.ordered
29+
exp_index = CategoricalIndex(['a', 'b', 'c'], name='categories',
30+
ordered=self.factor.ordered)
31+
expected = DataFrame({'counts': [3, 2, 3],
32+
'freqs': [3 / 8., 2 / 8., 3 / 8.]},
33+
index=exp_index)
34+
tm.assert_frame_equal(desc, expected)
35+
36+
# check unused categories
37+
cat = self.factor.copy()
38+
cat.set_categories(["a", "b", "c", "d"], inplace=True)
39+
desc = cat.describe()
40+
41+
exp_index = CategoricalIndex(
42+
list('abcd'), ordered=self.factor.ordered, name='categories')
43+
expected = DataFrame({'counts': [3, 2, 3, 0],
44+
'freqs': [3 / 8., 2 / 8., 3 / 8., 0]},
45+
index=exp_index)
46+
tm.assert_frame_equal(desc, expected)
47+
48+
# check an integer one
49+
cat = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
50+
desc = cat.describe()
51+
exp_index = CategoricalIndex([1, 2, 3], ordered=cat.ordered,
52+
name='categories')
53+
expected = DataFrame({'counts': [5, 3, 3],
54+
'freqs': [5 / 11., 3 / 11., 3 / 11.]},
55+
index=exp_index)
56+
tm.assert_frame_equal(desc, expected)
57+
58+
# https://github.com/pandas-dev/pandas/issues/3678
59+
# describe should work with NaN
60+
cat = Categorical([np.nan, 1, 2, 2])
61+
desc = cat.describe()
62+
expected = DataFrame({'counts': [1, 2, 1],
63+
'freqs': [1 / 4., 2 / 4., 1 / 4.]},
64+
index=CategoricalIndex([1, 2, np.nan],
65+
categories=[1, 2],
66+
name='categories'))
67+
tm.assert_frame_equal(desc, expected)
68+
69+
def test_getitem(self):
70+
assert self.factor[0] == 'a'
71+
assert self.factor[-1] == 'c'
72+
73+
subf = self.factor[[0, 1, 2]]
74+
tm.assert_numpy_array_equal(subf._codes,
75+
np.array([0, 1, 1], dtype=np.int8))
76+
77+
subf = self.factor[np.asarray(self.factor) == 'c']
78+
tm.assert_numpy_array_equal(subf._codes,
79+
np.array([2, 2, 2], dtype=np.int8))
80+
81+
def test_setitem(self):
82+
83+
# int/positional
84+
c = self.factor.copy()
85+
c[0] = 'b'
86+
assert c[0] == 'b'
87+
c[-1] = 'a'
88+
assert c[-1] == 'a'
89+
90+
# boolean
91+
c = self.factor.copy()
92+
indexer = np.zeros(len(c), dtype='bool')
93+
indexer[0] = True
94+
indexer[-1] = True
95+
c[indexer] = 'c'
96+
expected = Categorical(['c', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
97+
ordered=True)
98+
99+
tm.assert_categorical_equal(c, expected)
100+
101+
def test_set_categories_inplace(self):
102+
cat = self.factor.copy()
103+
cat.set_categories(['a', 'b', 'c', 'd'], inplace=True)
104+
tm.assert_index_equal(cat.categories, Index(['a', 'b', 'c', 'd']))
105+
106+
def test_comparisons(self):
107+
108+
result = self.factor[self.factor == 'a']
109+
expected = self.factor[np.asarray(self.factor) == 'a']
110+
tm.assert_categorical_equal(result, expected)
111+
112+
result = self.factor[self.factor != 'a']
113+
expected = self.factor[np.asarray(self.factor) != 'a']
114+
tm.assert_categorical_equal(result, expected)
115+
116+
result = self.factor[self.factor < 'c']
117+
expected = self.factor[np.asarray(self.factor) < 'c']
118+
tm.assert_categorical_equal(result, expected)
119+
120+
result = self.factor[self.factor > 'a']
121+
expected = self.factor[np.asarray(self.factor) > 'a']
122+
tm.assert_categorical_equal(result, expected)
123+
124+
result = self.factor[self.factor >= 'b']
125+
expected = self.factor[np.asarray(self.factor) >= 'b']
126+
tm.assert_categorical_equal(result, expected)
127+
128+
result = self.factor[self.factor <= 'b']
129+
expected = self.factor[np.asarray(self.factor) <= 'b']
130+
tm.assert_categorical_equal(result, expected)
131+
132+
n = len(self.factor)
133+
134+
other = self.factor[np.random.permutation(n)]
135+
result = self.factor == other
136+
expected = np.asarray(self.factor) == np.asarray(other)
137+
tm.assert_numpy_array_equal(result, expected)
138+
139+
result = self.factor == 'd'
140+
expected = np.repeat(False, len(self.factor))
141+
tm.assert_numpy_array_equal(result, expected)
142+
143+
# comparisons with categoricals
144+
cat_rev = Categorical(
145+
["a", "b", "c"], categories=["c", "b", "a"], ordered=True)
146+
cat_rev_base = Categorical(
147+
["b", "b", "b"], categories=["c", "b", "a"], ordered=True)
148+
cat = Categorical(["a", "b", "c"], ordered=True)
149+
cat_base = Categorical(
150+
["b", "b", "b"], categories=cat.categories, ordered=True)
151+
152+
# comparisons need to take categories ordering into account
153+
res_rev = cat_rev > cat_rev_base
154+
exp_rev = np.array([True, False, False])
155+
tm.assert_numpy_array_equal(res_rev, exp_rev)
156+
157+
res_rev = cat_rev < cat_rev_base
158+
exp_rev = np.array([False, False, True])
159+
tm.assert_numpy_array_equal(res_rev, exp_rev)
160+
161+
res = cat > cat_base
162+
exp = np.array([False, False, True])
163+
tm.assert_numpy_array_equal(res, exp)
164+
165+
# Only categories with same categories can be compared
166+
def f():
167+
cat > cat_rev
168+
169+
pytest.raises(TypeError, f)
170+
171+
cat_rev_base2 = Categorical(
172+
["b", "b", "b"], categories=["c", "b", "a", "d"])
173+
174+
def f():
175+
cat_rev > cat_rev_base2
176+
177+
pytest.raises(TypeError, f)
178+
179+
# Only categories with same ordering information can be compared
180+
cat_unorderd = cat.set_ordered(False)
181+
assert not (cat > cat).any()
182+
183+
def f():
184+
cat > cat_unorderd
185+
186+
pytest.raises(TypeError, f)
187+
188+
# comparison (in both directions) with Series will raise
189+
s = Series(["b", "b", "b"])
190+
pytest.raises(TypeError, lambda: cat > s)
191+
pytest.raises(TypeError, lambda: cat_rev > s)
192+
pytest.raises(TypeError, lambda: s < cat)
193+
pytest.raises(TypeError, lambda: s < cat_rev)
194+
195+
# comparison with numpy.array will raise in both direction, but only on
196+
# newer numpy versions
197+
a = np.array(["b", "b", "b"])
198+
pytest.raises(TypeError, lambda: cat > a)
199+
pytest.raises(TypeError, lambda: cat_rev > a)
200+
201+
# The following work via '__array_priority__ = 1000'
202+
# works only on numpy >= 1.7.1
203+
if LooseVersion(np.__version__) > "1.7.1":
204+
pytest.raises(TypeError, lambda: a < cat)
205+
pytest.raises(TypeError, lambda: a < cat_rev)
206+
207+
# Make sure that unequal comparison take the categories order in
208+
# account
209+
cat_rev = Categorical(
210+
list("abc"), categories=list("cba"), ordered=True)
211+
exp = np.array([True, False, False])
212+
res = cat_rev > "b"
213+
tm.assert_numpy_array_equal(res, exp)
214+
215+
def test_print(self):
216+
expected = ["[a, b, b, a, a, c, c, c]",
217+
"Categories (3, object): [a < b < c]"]
218+
expected = "\n".join(expected)
219+
actual = repr(self.factor)
220+
assert actual == expected
221+
222+
223+
class TestCategoricalGenericBlock(object):
    """Categorical data stored inside Series and DataFrame containers."""

    def setup_method(self, method):
        self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])

        # random values, each tagged with the label of its 500-wide bin
        frame = DataFrame({'value': np.random.randint(0, 10000, 100)})
        bin_labels = ["{0} - {1}".format(start, start + 499)
                      for start in range(0, 10000, 500)]
        label_cat = Categorical(bin_labels, bin_labels)

        frame = frame.sort_values(by=['value'], ascending=True)
        frame['value_group'] = pd.cut(frame.value, range(0, 10500, 500),
                                      right=False, labels=label_cat)
        self.cat = frame

    def test_basic(self):

        # test basic creation / coercion of categoricals
        n = len(self.factor)
        ser = Series(self.factor, name='A')
        assert ser.dtype == 'category'
        assert len(ser) == n
        str(ser.values)
        str(ser)

        # in a frame
        frame = DataFrame({'A': self.factor})
        tm.assert_series_equal(frame['A'], ser)
        tm.assert_series_equal(frame.iloc[:, 0], ser)
        assert len(frame) == n
        str(frame.values)
        str(frame)

        frame = DataFrame({'A': ser})
        tm.assert_series_equal(frame['A'], ser)
        assert len(frame) == n
        str(frame.values)
        str(frame)

        # multiples
        frame = DataFrame({'A': ser, 'B': ser, 'C': 1})
        col_a = frame['A']
        col_b = frame['B']
        tm.assert_series_equal(col_a, ser)
        tm.assert_series_equal(col_b, ser, check_names=False)
        assert col_b.name == 'B'
        assert len(frame) == n
        str(frame.values)
        str(frame)

        # GH8623
        people = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
                            [1, 'John P. Doe']],
                           columns=['person_id', 'person_name'])
        # doing this breaks transform
        people['person_name'] = Categorical(people.person_name)

        # every access path has to return the same scalar
        expected = people.iloc[0].person_name
        assert people.person_name.iloc[0] == expected
        assert people.person_name[0] == expected
        assert people.person_name.loc[0] == expected

    def test_describe(self):

        # Categoricals should not show up together with numerical columns
        numeric_summary = self.cat.describe()
        assert len(numeric_summary.columns) == 1

        # In a frame, describe() for the cat should be the same as for string
        # arrays (count, unique, top, freq)

        ordered_cat = Categorical(["a", "b", "b", "b"],
                                  categories=['a', 'b', 'c'], ordered=True)
        result = Series(ordered_cat).describe()
        expected = Series([4, 2, "b", 3],
                          index=['count', 'unique', 'top', 'freq'])
        tm.assert_series_equal(result, expected)

        cat_col = Series(Categorical(["a", "b", "c", "c"]))
        mixed = DataFrame({"cat": cat_col, "s": ["a", "b", "c", "c"]})
        summary = mixed.describe()
        tm.assert_numpy_array_equal(summary["cat"].values,
                                    summary["s"].values)

    def test_groupby_sort(self):

        # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby
        # This should result in a properly sorted Series so that the plot
        # has a sorted x axis
        # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')

        counts = self.cat.groupby(['value_group'])['value_group'].count()
        by_lower_bound = sorted(counts.index,
                                key=lambda x: float(x.split()[0]))
        expected = counts[by_lower_bound]
        expected.index = CategoricalIndex(expected.index,
                                          name=expected.index.name)
        tm.assert_series_equal(counts, expected)

    def test_astype_to_other(self):

        ser = self.cat['value_group']
        tm.assert_series_equal(ser.astype('category'), ser)
        tm.assert_series_equal(ser.astype(CategoricalDtype()), ser)
        with pytest.raises(ValueError):
            ser.astype('float64')

        letters = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']))
        plain_letters = Series(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
        tm.assert_series_equal(letters.astype('str'), plain_letters)
        digits = Series(Categorical(['1', '2', '3', '4']))
        plain_digits = Series([1, 2, 3, 4]).astype(int)
        tm.assert_series_equal(digits.astype('int'), plain_digits)

        # object don't sort correctly, so just compare that we have the same
        # values
        def assert_same_uniques(left, right):
            tm.assert_almost_equal(
                np.sort(np.unique(left)), np.sort(np.unique(right)))

        expected = Series(np.array(ser.values), name='value_group')
        assert_same_uniques(ser.astype('object'), expected)
        assert_same_uniques(ser.astype(np.object_), expected)

        # array conversion
        tm.assert_almost_equal(np.array(ser), np.array(ser.values))

        # valid conversion
        valid_conversions = [
            lambda x: x.astype('category'),
            lambda x: x.astype(CategoricalDtype()),
            lambda x: x.astype('object').astype('category'),
            lambda x: x.astype('object').astype(CategoricalDtype()),
        ]
        for convert in valid_conversions:
            converted = convert(ser)
            # compare series values
            # internal .categories can't be compared because it is sorted
            tm.assert_series_equal(converted, ser, check_categorical=False)

        # invalid conversion (these are NOT a dtype)
        invalid_conversions = [
            lambda x: x.astype(Categorical),
            lambda x: x.astype('object').astype(Categorical),
        ]
        for convert in invalid_conversions:
            with pytest.raises(TypeError):
                convert(ser)

    def test_numeric_like_ops(self):

        # numeric ops should not succeed
        for name in ['__add__', '__sub__', '__mul__', '__truediv__']:
            with pytest.raises(TypeError):
                getattr(self.cat, name)(self.cat)

        # reduction ops should not succeed (unless specifically defined, e.g.
        # min/max)
        ser = self.cat['value_group']
        for name in ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']:
            with pytest.raises(TypeError):
                getattr(ser, name)(numeric_only=False)

        # mad technically works because it takes always the numeric data

        # numpy ops
        ser = Series(Categorical([1, 2, 3, 4]))
        with pytest.raises(TypeError):
            np.sum(ser)

        # numeric ops on a Series
        for name in ['__add__', '__sub__', '__mul__', '__truediv__']:
            with pytest.raises(TypeError):
                getattr(ser, name)(2)

        # invalid ufunc
        with pytest.raises(TypeError):
            np.log(ser)

‎pandas/tests/categorical/test_indexing.py

Lines changed: 804 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import numpy as np
4+
5+
import pandas.util.testing as tm
6+
from pandas import (Categorical, Index, Series, CategoricalIndex, isna)
7+
from pandas.compat import lrange
8+
from pandas.core.dtypes.dtypes import CategoricalDtype
9+
10+
11+
class TestCategoricalMissing(object):
12+
13+
def test_na_flags_int_categories(self):
14+
# #1457
15+
16+
categories = lrange(10)
17+
labels = np.random.randint(0, 10, 20)
18+
labels[::5] = -1
19+
20+
cat = Categorical(labels, categories, fastpath=True)
21+
repr(cat)
22+
23+
tm.assert_numpy_array_equal(isna(cat), labels == -1)
24+
25+
def test_nan_handling(self):
26+
27+
# Nans are represented as -1 in codes
28+
c = Categorical(["a", "b", np.nan, "a"])
29+
tm.assert_index_equal(c.categories, Index(["a", "b"]))
30+
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0],
31+
dtype=np.int8))
32+
c[1] = np.nan
33+
tm.assert_index_equal(c.categories, Index(["a", "b"]))
34+
tm.assert_numpy_array_equal(c._codes, np.array([0, -1, -1, 0],
35+
dtype=np.int8))
36+
37+
# Adding nan to categories should make assigned nan point to the
38+
# category!
39+
c = Categorical(["a", "b", np.nan, "a"])
40+
tm.assert_index_equal(c.categories, Index(["a", "b"]))
41+
tm.assert_numpy_array_equal(c._codes, np.array([0, 1, -1, 0],
42+
dtype=np.int8))
43+
44+
def test_set_dtype_nans(self):
45+
c = Categorical(['a', 'b', np.nan])
46+
result = c._set_dtype(CategoricalDtype(['a', 'c']))
47+
tm.assert_numpy_array_equal(result.codes, np.array([0, -1, -1],
48+
dtype='int8'))
49+
50+
def test_isna(self):
51+
exp = np.array([False, False, True])
52+
c = Categorical(["a", "b", np.nan])
53+
res = c.isna()
54+
55+
tm.assert_numpy_array_equal(res, exp)
56+
57+
def test_set_item_nan(self):
58+
cat = Categorical([1, 2, 3])
59+
cat[1] = np.nan
60+
61+
exp = Categorical([1, np.nan, 3], categories=[1, 2, 3])
62+
tm.assert_categorical_equal(cat, exp)
63+
64+
65+
class TestCategoricalBlockMissing(object):
66+
67+
def test_value_counts_with_nan(self):
68+
# see gh-9443
69+
70+
# sanity check
71+
s = Series(["a", "b", "a"], dtype="category")
72+
exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
73+
74+
res = s.value_counts(dropna=True)
75+
tm.assert_series_equal(res, exp)
76+
77+
res = s.value_counts(dropna=True)
78+
tm.assert_series_equal(res, exp)
79+
80+
# same Series via two different constructions --> same behaviour
81+
series = [
82+
Series(["a", "b", None, "a", None, None], dtype="category"),
83+
Series(Categorical(["a", "b", None, "a", None, None],
84+
categories=["a", "b"]))
85+
]
86+
87+
for s in series:
88+
# None is a NaN value, so we exclude its count here
89+
exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
90+
res = s.value_counts(dropna=True)
91+
tm.assert_series_equal(res, exp)
92+
93+
# we don't exclude the count of None and sort by counts
94+
exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]))
95+
res = s.value_counts(dropna=False)
96+
tm.assert_series_equal(res, exp)
97+
98+
# When we aren't sorting by counts, and np.nan isn't a
99+
# category, it should be last.
100+
exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]))
101+
res = s.value_counts(dropna=False, sort=False)
102+
tm.assert_series_equal(res, exp)
103+
104+
def test_nan_handling(self):
105+
106+
# NaNs are represented as -1 in labels
107+
s = Series(Categorical(["a", "b", np.nan, "a"]))
108+
tm.assert_index_equal(s.cat.categories, Index(["a", "b"]))
109+
tm.assert_numpy_array_equal(s.values.codes,
110+
np.array([0, 1, -1, 0], dtype=np.int8))
Lines changed: 225 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,225 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import pytest
4+
5+
import numpy as np
6+
7+
import pandas.util.testing as tm
8+
from pandas import (Categorical, Series, date_range)
9+
10+
11+
class TestCategoricalOps(object):
    """Comparison operators between a ``Categorical`` and scalars."""

    def test_datetime_categorical_comparison(self):
        dt_cat = Categorical(date_range('2014-01-01', periods=3), ordered=True)
        first = dt_cat[0]
        after_first = np.array([False, True, True])
        tm.assert_numpy_array_equal(dt_cat > first, after_first)
        tm.assert_numpy_array_equal(first < dt_cat,
                                    np.array([False, True, True]))

    def test_reflected_comparison_with_scalars(self):
        # GH8658
        cat = Categorical([1, 2, 3], ordered=True)
        first = cat[0]
        after_first = np.array([False, True, True])
        tm.assert_numpy_array_equal(cat > first, after_first)
        tm.assert_numpy_array_equal(first < cat,
                                    np.array([False, True, True]))

    def test_comparison_with_unknown_scalars(self):
        # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
        # and following comparisons with scalars not in categories should raise
        # for unequal comps, but not for equal/not equal
        cat = Categorical([1, 2, 3], ordered=True)

        inequalities = [lambda: cat < 4,
                        lambda: cat > 4,
                        lambda: 4 < cat,
                        lambda: 4 > cat]
        for compare in inequalities:
            with pytest.raises(TypeError):
                compare()

        all_false = np.array([False, False, False])
        all_true = np.array([True, True, True])
        tm.assert_numpy_array_equal(cat == 4, all_false)
        tm.assert_numpy_array_equal(cat != 4, all_true)
43+
44+
45+
class TestCategoricalBlockOps(object):
46+
47+
def test_comparisons(self):
48+
tests_data = [(list("abc"), list("cba"), list("bbb")),
49+
([1, 2, 3], [3, 2, 1], [2, 2, 2])]
50+
for data, reverse, base in tests_data:
51+
cat_rev = Series(
52+
Categorical(data, categories=reverse, ordered=True))
53+
cat_rev_base = Series(
54+
Categorical(base, categories=reverse, ordered=True))
55+
cat = Series(Categorical(data, ordered=True))
56+
cat_base = Series(
57+
Categorical(base, categories=cat.cat.categories, ordered=True))
58+
s = Series(base)
59+
a = np.array(base)
60+
61+
# comparisons need to take categories ordering into account
62+
res_rev = cat_rev > cat_rev_base
63+
exp_rev = Series([True, False, False])
64+
tm.assert_series_equal(res_rev, exp_rev)
65+
66+
res_rev = cat_rev < cat_rev_base
67+
exp_rev = Series([False, False, True])
68+
tm.assert_series_equal(res_rev, exp_rev)
69+
70+
res = cat > cat_base
71+
exp = Series([False, False, True])
72+
tm.assert_series_equal(res, exp)
73+
74+
scalar = base[1]
75+
res = cat > scalar
76+
exp = Series([False, False, True])
77+
exp2 = cat.values > scalar
78+
tm.assert_series_equal(res, exp)
79+
tm.assert_numpy_array_equal(res.values, exp2)
80+
res_rev = cat_rev > scalar
81+
exp_rev = Series([True, False, False])
82+
exp_rev2 = cat_rev.values > scalar
83+
tm.assert_series_equal(res_rev, exp_rev)
84+
tm.assert_numpy_array_equal(res_rev.values, exp_rev2)
85+
86+
# Only categories with same categories can be compared
87+
def f():
88+
cat > cat_rev
89+
90+
pytest.raises(TypeError, f)
91+
92+
# categorical cannot be compared to Series or numpy array, and also
93+
# not the other way around
94+
pytest.raises(TypeError, lambda: cat > s)
95+
pytest.raises(TypeError, lambda: cat_rev > s)
96+
pytest.raises(TypeError, lambda: cat > a)
97+
pytest.raises(TypeError, lambda: cat_rev > a)
98+
99+
pytest.raises(TypeError, lambda: s < cat)
100+
pytest.raises(TypeError, lambda: s < cat_rev)
101+
102+
pytest.raises(TypeError, lambda: a < cat)
103+
pytest.raises(TypeError, lambda: a < cat_rev)
104+
105+
# unequal comparison should raise for unordered cats
106+
cat = Series(Categorical(list("abc")))
107+
108+
def f():
109+
cat > "b"
110+
111+
pytest.raises(TypeError, f)
112+
cat = Series(Categorical(list("abc"), ordered=False))
113+
114+
def f():
115+
cat > "b"
116+
117+
pytest.raises(TypeError, f)
118+
119+
# https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
120+
# and following comparisons with scalars not in categories should raise
121+
# for unequal comps, but not for equal/not equal
122+
cat = Series(Categorical(list("abc"), ordered=True))
123+
124+
pytest.raises(TypeError, lambda: cat < "d")
125+
pytest.raises(TypeError, lambda: cat > "d")
126+
pytest.raises(TypeError, lambda: "d" < cat)
127+
pytest.raises(TypeError, lambda: "d" > cat)
128+
129+
tm.assert_series_equal(cat == "d", Series([False, False, False]))
130+
tm.assert_series_equal(cat != "d", Series([True, True, True]))
131+
132+
# And test NaN handling...
133+
cat = Series(Categorical(["a", "b", "c", np.nan]))
134+
exp = Series([True, True, True, False])
135+
res = (cat == cat)
136+
tm.assert_series_equal(res, exp)
137+
138+
def test_cat_equality(self):
139+
140+
# GH 8938
141+
# allow equality comparisons
142+
a = Series(list('abc'), dtype="category")
143+
b = Series(list('abc'), dtype="object")
144+
c = Series(['a', 'b', 'cc'], dtype="object")
145+
d = Series(list('acb'), dtype="object")
146+
e = Categorical(list('abc'))
147+
f = Categorical(list('acb'))
148+
149+
# vs scalar
150+
assert not (a == 'a').all()
151+
assert ((a != 'a') == ~(a == 'a')).all()
152+
153+
assert not ('a' == a).all()
154+
assert (a == 'a')[0]
155+
assert ('a' == a)[0]
156+
assert not ('a' != a)[0]
157+
158+
# vs list-like
159+
assert (a == a).all()
160+
assert not (a != a).all()
161+
162+
assert (a == list(a)).all()
163+
assert (a == b).all()
164+
assert (b == a).all()
165+
assert ((~(a == b)) == (a != b)).all()
166+
assert ((~(b == a)) == (b != a)).all()
167+
168+
assert not (a == c).all()
169+
assert not (c == a).all()
170+
assert not (a == d).all()
171+
assert not (d == a).all()
172+
173+
# vs a cat-like
174+
assert (a == e).all()
175+
assert (e == a).all()
176+
assert not (a == f).all()
177+
assert not (f == a).all()
178+
179+
assert ((~(a == e) == (a != e)).all())
180+
assert ((~(e == a) == (e != a)).all())
181+
assert ((~(a == f) == (a != f)).all())
182+
assert ((~(f == a) == (f != a)).all())
183+
184+
# non-equality is not comparable
185+
pytest.raises(TypeError, lambda: a < b)
186+
pytest.raises(TypeError, lambda: b < a)
187+
pytest.raises(TypeError, lambda: a > b)
188+
pytest.raises(TypeError, lambda: b > a)
189+
190+
@pytest.mark.parametrize('ctor', [
191+
lambda *args, **kwargs: Categorical(*args, **kwargs),
192+
lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
193+
])
194+
def test_unordered_different_order_equal(self, ctor):
195+
# https://github.com/pandas-dev/pandas/issues/16014
196+
c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
197+
c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
198+
assert (c1 == c2).all()
199+
200+
c1 = ctor(['a', 'b'], categories=['a', 'b'], ordered=False)
201+
c2 = ctor(['b', 'a'], categories=['b', 'a'], ordered=False)
202+
assert (c1 != c2).all()
203+
204+
c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
205+
c2 = ctor(['b', 'b'], categories=['b', 'a'], ordered=False)
206+
assert (c1 != c2).all()
207+
208+
c1 = ctor(['a', 'a'], categories=['a', 'b'], ordered=False)
209+
c2 = ctor(['a', 'b'], categories=['b', 'a'], ordered=False)
210+
result = c1 == c2
211+
tm.assert_numpy_array_equal(np.array(result), np.array([True, False]))
212+
213+
def test_unordered_different_categories_raises(self):
214+
c1 = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
215+
c2 = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False)
216+
with tm.assert_raises_regex(TypeError,
217+
"Categoricals can only be compared"):
218+
c1 == c2
219+
220+
def test_compare_different_lengths(self):
221+
c1 = Categorical([], categories=['a', 'b'])
222+
c2 = Categorical([], categories=['a'])
223+
msg = "Categories are different lengths"
224+
with tm.assert_raises_regex(TypeError, msg):
225+
c1 == c2

‎pandas/tests/categorical/test_repr.py

Lines changed: 754 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import numpy as np
4+
5+
import pandas.util.testing as tm
6+
from pandas import (Categorical, Index, Series, DataFrame)
7+
8+
9+
class TestCategoricalSort(object):
10+
11+
def test_argsort(self):
12+
c = Categorical([5, 3, 1, 4, 2], ordered=True)
13+
14+
expected = np.array([2, 4, 1, 3, 0])
15+
tm.assert_numpy_array_equal(c.argsort(ascending=True), expected,
16+
check_dtype=False)
17+
18+
expected = expected[::-1]
19+
tm.assert_numpy_array_equal(c.argsort(ascending=False), expected,
20+
check_dtype=False)
21+
22+
def test_numpy_argsort(self):
23+
c = Categorical([5, 3, 1, 4, 2], ordered=True)
24+
25+
expected = np.array([2, 4, 1, 3, 0])
26+
tm.assert_numpy_array_equal(np.argsort(c), expected,
27+
check_dtype=False)
28+
29+
tm.assert_numpy_array_equal(np.argsort(c, kind='mergesort'), expected,
30+
check_dtype=False)
31+
32+
msg = "the 'axis' parameter is not supported"
33+
tm.assert_raises_regex(ValueError, msg, np.argsort,
34+
c, axis=0)
35+
36+
msg = "the 'order' parameter is not supported"
37+
tm.assert_raises_regex(ValueError, msg, np.argsort,
38+
c, order='C')
39+
40+
def test_sort_values(self):
41+
42+
# unordered cats are sortable
43+
cat = Categorical(["a", "b", "b", "a"], ordered=False)
44+
cat.sort_values()
45+
46+
cat = Categorical(["a", "c", "b", "d"], ordered=True)
47+
48+
# sort_values
49+
res = cat.sort_values()
50+
exp = np.array(["a", "b", "c", "d"], dtype=object)
51+
tm.assert_numpy_array_equal(res.__array__(), exp)
52+
tm.assert_index_equal(res.categories, cat.categories)
53+
54+
cat = Categorical(["a", "c", "b", "d"],
55+
categories=["a", "b", "c", "d"], ordered=True)
56+
res = cat.sort_values()
57+
exp = np.array(["a", "b", "c", "d"], dtype=object)
58+
tm.assert_numpy_array_equal(res.__array__(), exp)
59+
tm.assert_index_equal(res.categories, cat.categories)
60+
61+
res = cat.sort_values(ascending=False)
62+
exp = np.array(["d", "c", "b", "a"], dtype=object)
63+
tm.assert_numpy_array_equal(res.__array__(), exp)
64+
tm.assert_index_equal(res.categories, cat.categories)
65+
66+
# sort (inplace order)
67+
cat1 = cat.copy()
68+
cat1.sort_values(inplace=True)
69+
exp = np.array(["a", "b", "c", "d"], dtype=object)
70+
tm.assert_numpy_array_equal(cat1.__array__(), exp)
71+
tm.assert_index_equal(res.categories, cat.categories)
72+
73+
# reverse
74+
cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
75+
res = cat.sort_values(ascending=False)
76+
exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
77+
exp_categories = Index(["a", "b", "c", "d"])
78+
tm.assert_numpy_array_equal(res.__array__(), exp_val)
79+
tm.assert_index_equal(res.categories, exp_categories)
80+
81+
def test_sort_values_na_position(self):
82+
# see gh-12882
83+
cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
84+
exp_categories = Index([2, 5])
85+
86+
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
87+
res = cat.sort_values() # default arguments
88+
tm.assert_numpy_array_equal(res.__array__(), exp)
89+
tm.assert_index_equal(res.categories, exp_categories)
90+
91+
exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
92+
res = cat.sort_values(ascending=True, na_position='first')
93+
tm.assert_numpy_array_equal(res.__array__(), exp)
94+
tm.assert_index_equal(res.categories, exp_categories)
95+
96+
exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
97+
res = cat.sort_values(ascending=False, na_position='first')
98+
tm.assert_numpy_array_equal(res.__array__(), exp)
99+
tm.assert_index_equal(res.categories, exp_categories)
100+
101+
exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
102+
res = cat.sort_values(ascending=True, na_position='last')
103+
tm.assert_numpy_array_equal(res.__array__(), exp)
104+
tm.assert_index_equal(res.categories, exp_categories)
105+
106+
exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
107+
res = cat.sort_values(ascending=False, na_position='last')
108+
tm.assert_numpy_array_equal(res.__array__(), exp)
109+
tm.assert_index_equal(res.categories, exp_categories)
110+
111+
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
112+
res = cat.sort_values(ascending=False, na_position='last')
113+
exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
114+
exp_categories = Index(["a", "b", "c", "d"])
115+
tm.assert_numpy_array_equal(res.__array__(), exp_val)
116+
tm.assert_index_equal(res.categories, exp_categories)
117+
118+
cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
119+
res = cat.sort_values(ascending=False, na_position='first')
120+
exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
121+
exp_categories = Index(["a", "b", "c", "d"])
122+
tm.assert_numpy_array_equal(res.__array__(), exp_val)
123+
tm.assert_index_equal(res.categories, exp_categories)
124+
125+
126+
class TestCategoricalBlockSort(object):
    """Sorting tests for categorical data held in a Series or DataFrame."""

    def test_sort_values(self):
        """Exercise ``sort_values`` on categorical Series and frame columns."""
        ascending_vals = np.array(["a", "b", "c", "d"], dtype=np.object_)
        descending_vals = np.array(["d", "c", "b", "a"], dtype=np.object_)

        # sort in the categories order
        base = Categorical(["a", "b", "b", "a"], ordered=False)
        ser = Series(base.copy())
        expected_ser = Series(Categorical(["a", "a", "b", "b"],
                                          ordered=False),
                              index=[0, 3, 1, 2])
        tm.assert_series_equal(ser.sort_values(), expected_ser)

        # ordered, categories inferred from the values
        ser = Series(Categorical(["a", "c", "b", "d"], ordered=True))
        tm.assert_numpy_array_equal(ser.sort_values().__array__(),
                                    ascending_vals)

        # ordered, categories given explicitly
        explicit = Categorical(["a", "c", "b", "d"],
                               categories=["a", "b", "c", "d"],
                               ordered=True)
        ser = Series(explicit)
        tm.assert_numpy_array_equal(ser.sort_values().__array__(),
                                    ascending_vals)
        tm.assert_numpy_array_equal(
            ser.sort_values(ascending=False).__array__(), descending_vals)

        unordered = Categorical(["a", "b", "c", "d"],
                                categories=["a", "b", "c", "d"],
                                ordered=False)
        reverse_ordered = Categorical(["a", "b", "c", "d"],
                                      categories=["d", "c", "b", "a"],
                                      ordered=True)
        frame = DataFrame({"unsort": unordered,
                           "sort": reverse_ordered,
                           "string": ["a", "b", "c", "d"],
                           "values": [1, 2, 3, 4]})

        # Cats must be sorted in a dataframe
        by_string_desc = frame.sort_values(by=["string"], ascending=False)
        tm.assert_numpy_array_equal(
            by_string_desc["sort"].values.__array__(), descending_vals)
        assert by_string_desc["sort"].dtype == "category"

        by_sort_desc = frame.sort_values(by=["sort"], ascending=False)
        by_string_asc = frame.sort_values(by=["string"], ascending=True)
        tm.assert_series_equal(by_sort_desc["values"],
                               by_string_asc["values"])
        assert by_sort_desc["sort"].dtype == "category"
        assert by_sort_desc["unsort"].dtype == "category"

        # unordered cat, but we allow this
        frame.sort_values(by=["unsort"], ascending=False)

        # multi-columns sort
        # GH 7848
        grades = DataFrame({"id": [6, 5, 4, 3, 2, 1],
                            "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
        grades["grade"] = Categorical(grades["raw_grade"], ordered=True)
        grades['grade'] = grades['grade'].cat.set_categories(['b', 'e', 'a'])

        sort_cases = (
            # sorts 'grade' according to the order of the categories
            (['grade'], [1, 2, 5, 0, 3, 4]),
            # multi
            (['grade', 'id'], [2, 1, 5, 4, 3, 0]),
        )
        for keys, positions in sort_cases:
            tm.assert_frame_equal(grades.sort_values(by=keys),
                                  grades.iloc[positions])
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# -*- coding: utf-8 -*-
2+
3+
from pandas import Categorical
4+
5+
import pandas.util.testing as tm
6+
7+
8+
class TestCategoricalSubclassing(object):
    """A ``Categorical`` subclass keeps its type through common operations."""

    def test_constructor(self):
        """Direct construction yields the subclass with equal contents."""
        subclassed = tm.SubclassedCategorical(['a', 'b', 'c'])
        plain = Categorical(['a', 'b', 'c'])
        assert isinstance(subclassed, tm.SubclassedCategorical)
        tm.assert_categorical_equal(subclassed, plain)

    def test_from_array(self):
        """``from_codes`` called on the subclass returns the subclass."""
        subclassed = tm.SubclassedCategorical.from_codes(
            [1, 0, 2], ['a', 'b', 'c'])
        assert isinstance(subclassed, tm.SubclassedCategorical)
        plain = Categorical.from_codes([1, 0, 2], ['a', 'b', 'c'])
        tm.assert_categorical_equal(subclassed, plain)

    def test_map(self):
        """``map`` on a subclass instance returns the subclass."""
        def to_upper(value):
            return value.upper()

        subclassed = tm.SubclassedCategorical(['a', 'b', 'c'])
        mapped = subclassed.map(to_upper)
        assert isinstance(mapped, tm.SubclassedCategorical)
        tm.assert_categorical_equal(mapped, Categorical(['A', 'B', 'C']))
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# -*- coding: utf-8 -*-
2+
3+
import pytest
4+
5+
import pandas.util.testing as tm
6+
7+
8+
class TestCategoricalWarnings(object):
    """Warning-related regression tests for ``Categorical``."""

    def test_tab_complete_warning(self, ip):
        """Tab-completing on a ``Categorical`` must not emit a warning.

        ``ip`` is supplied from outside this block; presumably it is an
        IPython shell fixture, since it is used through ``run_code`` and
        ``Completer.completions`` (TODO confirm against conftest).
        """
        # https://github.com/pandas-dev/pandas/issues/16409
        pytest.importorskip('IPython', minversion="6.0.0")
        from IPython.core.completer import provisionalcompleter

        # Only ``pd`` is imported by this snippet, so the constructor has to
        # be reached as ``pd.Categorical``; a bare ``Categorical([])`` is not
        # a bound name there and ``c`` would never be created, making the
        # completion check below vacuous.
        code = "import pandas as pd; c = pd.Categorical([])"
        ip.run_code(code)
        with tm.assert_produces_warning(None):
            with provisionalcompleter('ignore'):
                list(ip.Completer.completions('c.', 1))

‎pandas/tests/test_categorical.py

Lines changed: 0 additions & 4831 deletions
This file was deleted.

0 commit comments

Comments
 (0)
Please sign in to comment.