pandas-dev · Nov 26, 2017
diff --git a/‎pandas/tests/categorical/__init__.py b/‎pandas/tests/categorical/__init__.py
diff --git a/‎pandas/tests/categorical/test_api.py
Lines changed: 1679 additions & 0 deletions b/‎pandas/tests/categorical/test_api.py
Lines changed: 1679 additions & 0 deletions
diff --git a/‎pandas/tests/categorical/test_constructors.py
Lines changed: 625 additions & 0 deletions b/‎pandas/tests/categorical/test_constructors.py
Lines changed: 625 additions & 0 deletions
diff --git a/‎pandas/tests/categorical/test_dtypes.py
Lines changed: 130 additions & 0 deletions b/‎pandas/tests/categorical/test_dtypes.py
Lines changed: 130 additions & 0 deletions
diff --git a/‎pandas/tests/categorical/test_generic.py
Lines changed: 396 additions & 0 deletions b/‎pandas/tests/categorical/test_generic.py
Lines changed: 396 additions & 0 deletions
diff --git a/‎pandas/tests/categorical/test_indexing.py
Lines changed: 804 additions & 0 deletions b/‎pandas/tests/categorical/test_indexing.py
Lines changed: 804 additions & 0 deletions
diff --git a/‎pandas/tests/categorical/test_missing.py
Lines changed: 110 additions & 0 deletions b/‎pandas/tests/categorical/test_missing.py
Lines changed: 110 additions & 0 deletions
diff --git a/‎pandas/tests/categorical/test_operators.py
Lines changed: 225 additions & 0 deletions b/‎pandas/tests/categorical/test_operators.py
Lines changed: 225 additions & 0 deletions
diff --git a/‎pandas/tests/categorical/test_repr.py
Lines changed: 754 additions & 0 deletions b/‎pandas/tests/categorical/test_repr.py
Lines changed: 754 additions & 0 deletions
diff --git a/‎pandas/tests/categorical/test_sorting.py
Lines changed: 195 additions & 0 deletions b/‎pandas/tests/categorical/test_sorting.py
Lines changed: 195 additions & 0 deletions
diff --git a/‎pandas/tests/categorical/test_subclass.py
Lines changed: 26 additions & 0 deletions b/‎pandas/tests/categorical/test_subclass.py
Lines changed: 26 additions & 0 deletions
diff --git a/‎pandas/tests/categorical/test_warnings.py
Lines changed: 18 additions & 0 deletions b/‎pandas/tests/categorical/test_warnings.py
Lines changed: 18 additions & 0 deletions
diff --git a/‎pandas/tests/test_categorical.py
Lines changed: 0 additions & 4831 deletions b/‎pandas/tests/test_categorical.py
Lines changed: 0 additions & 4831 deletions
@@ -0,0 +1,130 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import pandas as pd
+import pandas.util.testing as tm
+from pandas.core.dtypes.dtypes import CategoricalDtype
+from pandas import (Categorical, Index, Series, DataFrame, CategoricalIndex)
+
+
+class TestCategoricalDtypes(object):
+    """Dtype equality and ``_set_dtype`` recoding for ``Categorical``."""
+
+    def test_is_equal_dtype(self):
+        """Check ``is_dtype_equal`` against categoricals and indexes."""
+
+        # test dtype comparisons between cats
+
+        c1 = Categorical(list('aabca'), categories=list('abc'), ordered=False)
+        c2 = Categorical(list('aabca'), categories=list('cab'), ordered=False)
+        c3 = Categorical(list('aabca'), categories=list('cab'), ordered=True)
+        assert c1.is_dtype_equal(c1)
+        assert c2.is_dtype_equal(c2)
+        assert c3.is_dtype_equal(c3)
+        # unordered: the same categories in a different order still match
+        assert c1.is_dtype_equal(c2)
+        # a differing ``ordered`` flag does not match
+        assert not c1.is_dtype_equal(c3)
+        assert not c1.is_dtype_equal(Index(list('aabca')))
+        assert not c1.is_dtype_equal(c1.astype(object))
+        assert c1.is_dtype_equal(CategoricalIndex(c1))
+        assert (c1.is_dtype_equal(
+            CategoricalIndex(c1, categories=list('cab'))))
+        assert not c1.is_dtype_equal(CategoricalIndex(c1, ordered=True))
+
+    def test_set_dtype_same(self):
+        """Setting an equal dtype leaves the categorical unchanged."""
+        c = Categorical(['a', 'b', 'c'])
+        result = c._set_dtype(CategoricalDtype(['a', 'b', 'c']))
+        tm.assert_categorical_equal(result, c)
+
+    def test_set_dtype_new_categories(self):
+        """Appending a category keeps the codes and extends the dtype."""
+        c = Categorical(['a', 'b', 'c'])
+        result = c._set_dtype(CategoricalDtype(list('abcd')))
+        tm.assert_numpy_array_equal(result.codes, c.codes)
+        tm.assert_index_equal(result.dtype.categories, Index(list('abcd')))
+
+    @pytest.mark.parametrize('values, categories, new_categories', [
+        # No NaNs, same cats, same order
+        (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
+        # No NaNs, same cats, different order
+        (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
+        # Same, unsorted
+        (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
+        # No NaNs, same cats, different order
+        (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
+        # NaNs
+        (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
+        (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
+        (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
+        # was an exact duplicate of the previous case; cover the reordered
+        # categories for the unsorted values as the other groups do
+        (['b', 'a', 'c'], ['a', 'b'], ['b', 'a']),
+        # Introduce NaNs
+        (['a', 'b', 'c'], ['a', 'b'], ['a']),
+        (['a', 'b', 'c'], ['a', 'b'], ['b']),
+        (['b', 'a', 'c'], ['a', 'b'], ['a']),
+        # was an exact duplicate of the previous case; cover dropping 'a'
+        # for the unsorted values as well
+        (['b', 'a', 'c'], ['a', 'b'], ['b']),
+        # No overlap
+        (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
+    ])
+    @pytest.mark.parametrize('ordered', [True, False])
+    def test_set_dtype_many(self, values, categories, new_categories,
+                            ordered):
+        """``_set_dtype`` must match building with the new dtype directly."""
+        c = Categorical(values, categories)
+        expected = Categorical(values, new_categories, ordered)
+        result = c._set_dtype(expected.dtype)
+        tm.assert_categorical_equal(result, expected)
+
+    def test_set_dtype_no_overlap(self):
+        """With no shared categories every value is expected to be missing."""
+        c = Categorical(['a', 'b', 'c'], ['d', 'e'])
+        result = c._set_dtype(CategoricalDtype(['a', 'b']))
+        expected = Categorical([None, None, None], categories=['a', 'b'])
+        tm.assert_categorical_equal(result, expected)
+
+
+class TestCategoricalBlockDtypes(object):
+    """Dtype checks for categoricals stored in frames, plus code widths."""
+
+    def test_dtypes(self):
+        """Comparing ``df.dtypes`` to a dtype name flags matching columns."""
+
+        # GH8143
+        labels = ['cat', 'obj', 'num']
+        columns = [Series(Categorical(['a', 'b', 'c'])),
+                   Series(['a', 'b', 'c']),
+                   Series([1, 2, 3])]
+        frame = pd.concat(columns, axis=1, keys=labels)
+
+        cases = [('object', [False, True, False]),
+                 ('int64', [False, False, True]),
+                 ('category', [True, False, False])]
+        for dtype_name, mask in cases:
+            tm.assert_series_equal(frame.dtypes == dtype_name,
+                                   Series(mask, index=labels))
+
+    def test_codes_dtypes(self):
+        """The codes dtype tracks the number of categories."""
+
+        def numbered(count):
+            # 'foo00000', 'foo00001', ... ``count`` distinct labels
+            return ['foo%05d' % i for i in range(count)]
+
+        # GH 8453
+        assert Categorical(['foo', 'bar', 'baz']).codes.dtype == 'int8'
+        assert Categorical(numbered(400)).codes.dtype == 'int16'
+        assert Categorical(numbered(40000)).codes.dtype == 'int32'
+
+        # adding cats
+        small = Categorical(['foo', 'bar', 'baz'])
+        assert small.codes.dtype == 'int8'
+        grown = small.add_categories(numbered(400))
+        assert grown.codes.dtype == 'int16'
+
+        # removing cats
+        shrunk = grown.remove_categories(numbered(300))
+        assert shrunk.codes.dtype == 'int8'
+
+    @pytest.mark.parametrize('columns', [['x'], ['x', 'y'], ['x', 'y', 'z']])
+    def test_empty_astype(self, columns):
+        """Casting an empty frame to category raises NotImplementedError."""
+        # GH 18004
+        empty = DataFrame(columns=columns)
+        msg = '> 1 ndim Categorical are not supported at this time'
+        with tm.assert_raises_regex(NotImplementedError, msg):
+            empty.astype('category')
@@ -0,0 +1,396 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+from distutils.version import LooseVersion
+
+import numpy as np
+
+import pandas as pd
+import pandas.util.testing as tm
+from pandas import (Categorical, Index, Series, DataFrame, CategoricalIndex)
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+
+class TestCategoricalGeneric(object):
+    """General ``Categorical`` behaviour checked on an ordered fixture."""
+
+    def setup_method(self, method):
+        """Build the ordered categorical shared by the tests below."""
+        values = ['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']
+        self.factor = Categorical(values, ordered=True)
+
+    def test_categories_none(self):
+        """An identically built categorical equals the fixture."""
+        rebuilt = Categorical(list('abbaaccc'), ordered=True)
+        tm.assert_categorical_equal(rebuilt, self.factor)
+
+    def test_describe(self):
+        """``describe`` reports counts and frequencies per category."""
+
+        def summary(counts, freqs, index):
+            # expected ``describe`` output for the given index
+            return DataFrame({'counts': counts, 'freqs': freqs}, index=index)
+
+        # string type
+        described = self.factor.describe()
+        assert self.factor.ordered
+        index = CategoricalIndex(['a', 'b', 'c'], name='categories',
+                                 ordered=self.factor.ordered)
+        tm.assert_frame_equal(
+            described, summary([3, 2, 3], [3 / 8., 2 / 8., 3 / 8.], index))
+
+        # check unused categories
+        widened = self.factor.copy()
+        widened.set_categories(["a", "b", "c", "d"], inplace=True)
+        described = widened.describe()
+        index = CategoricalIndex(
+            list('abcd'), ordered=self.factor.ordered, name='categories')
+        tm.assert_frame_equal(
+            described,
+            summary([3, 2, 3, 0], [3 / 8., 2 / 8., 3 / 8., 0], index))
+
+        # check an integer one
+        ints = Categorical([1, 2, 3, 1, 2, 3, 3, 2, 1, 1, 1])
+        described = ints.describe()
+        index = CategoricalIndex([1, 2, 3], ordered=ints.ordered,
+                                 name='categories')
+        tm.assert_frame_equal(
+            described,
+            summary([5, 3, 3], [5 / 11., 3 / 11., 3 / 11.], index))
+
+        # https://github.com/pandas-dev/pandas/issues/3678
+        # describe should work with NaN
+        with_nan = Categorical([np.nan, 1, 2, 2])
+        described = with_nan.describe()
+        index = CategoricalIndex([1, 2, np.nan], categories=[1, 2],
+                                 name='categories')
+        tm.assert_frame_equal(
+            described, summary([1, 2, 1], [1 / 4., 2 / 4., 1 / 4.], index))
+
+    def test_getitem(self):
+        """Scalar, list and boolean-mask indexing."""
+        assert self.factor[0] == 'a'
+        assert self.factor[-1] == 'c'
+
+        head = self.factor[[0, 1, 2]]
+        tm.assert_numpy_array_equal(head._codes,
+                                    np.array([0, 1, 1], dtype=np.int8))
+
+        only_c = self.factor[np.asarray(self.factor) == 'c']
+        tm.assert_numpy_array_equal(only_c._codes,
+                                    np.array([2, 2, 2], dtype=np.int8))
+
+    def test_setitem(self):
+        """Positional and boolean-mask assignment of existing categories."""
+
+        # int/positional
+        by_position = self.factor.copy()
+        by_position[0] = 'b'
+        assert by_position[0] == 'b'
+        by_position[-1] = 'a'
+        assert by_position[-1] == 'a'
+
+        # boolean
+        by_mask = self.factor.copy()
+        mask = np.zeros(len(by_mask), dtype='bool')
+        mask[[0, -1]] = True
+        by_mask[mask] = 'c'
+
+        expected = Categorical(['c', 'b', 'b', 'a', 'a', 'c', 'c', 'c'],
+                               ordered=True)
+        tm.assert_categorical_equal(by_mask, expected)
+
+    def test_set_categories_inplace(self):
+        """``set_categories(..., inplace=True)`` mutates the categorical."""
+        target = self.factor.copy()
+        target.set_categories(['a', 'b', 'c', 'd'], inplace=True)
+        tm.assert_index_equal(target.categories,
+                              Index(['a', 'b', 'c', 'd']))
+
+    def test_comparisons(self):
+        """Comparison operators against scalars, categoricals and arrays."""
+
+        # each scalar comparison must select the same elements as the
+        # equivalent comparison on the plain object array
+        scalar_cases = [(lambda x, y: x == y, 'a'),
+                        (lambda x, y: x != y, 'a'),
+                        (lambda x, y: x < y, 'c'),
+                        (lambda x, y: x > y, 'a'),
+                        (lambda x, y: x >= y, 'b'),
+                        (lambda x, y: x <= y, 'b')]
+        for compare, scalar in scalar_cases:
+            result = self.factor[compare(self.factor, scalar)]
+            expected = self.factor[compare(np.asarray(self.factor), scalar)]
+            tm.assert_categorical_equal(result, expected)
+
+        n = len(self.factor)
+
+        shuffled = self.factor[np.random.permutation(n)]
+        tm.assert_numpy_array_equal(
+            self.factor == shuffled,
+            np.asarray(self.factor) == np.asarray(shuffled))
+
+        tm.assert_numpy_array_equal(self.factor == 'd',
+                                    np.repeat(False, len(self.factor)))
+
+        # comparisons with categoricals
+        reversed_cats = ["c", "b", "a"]
+        cat_rev = Categorical(
+            ["a", "b", "c"], categories=reversed_cats, ordered=True)
+        cat_rev_base = Categorical(
+            ["b", "b", "b"], categories=reversed_cats, ordered=True)
+        cat = Categorical(["a", "b", "c"], ordered=True)
+        cat_base = Categorical(
+            ["b", "b", "b"], categories=cat.categories, ordered=True)
+
+        # comparisons need to take categories ordering into account
+        tm.assert_numpy_array_equal(cat_rev > cat_rev_base,
+                                    np.array([True, False, False]))
+        tm.assert_numpy_array_equal(cat_rev < cat_rev_base,
+                                    np.array([False, False, True]))
+        tm.assert_numpy_array_equal(cat > cat_base,
+                                    np.array([False, False, True]))
+
+        # Only categories with same categories can be compared
+        with pytest.raises(TypeError):
+            cat > cat_rev
+
+        cat_rev_base2 = Categorical(
+            ["b", "b", "b"], categories=["c", "b", "a", "d"])
+        with pytest.raises(TypeError):
+            cat_rev > cat_rev_base2
+
+        # Only categories with same ordering information can be compared
+        cat_unordered = cat.set_ordered(False)
+        assert not (cat > cat).any()
+        with pytest.raises(TypeError):
+            cat > cat_unordered
+
+        # comparison (in both directions) with Series will raise
+        s = Series(["b", "b", "b"])
+        for left in (cat, cat_rev):
+            with pytest.raises(TypeError):
+                left > s
+        for right in (cat, cat_rev):
+            with pytest.raises(TypeError):
+                s < right
+
+        # comparison with numpy.array will raise in both direction, but only on
+        # newer numpy versions
+        a = np.array(["b", "b", "b"])
+        for left in (cat, cat_rev):
+            with pytest.raises(TypeError):
+                left > a
+
+        # The following work via '__array_priority__ = 1000'
+        # works only on numpy >= 1.7.1
+        if LooseVersion(np.__version__) > "1.7.1":
+            for right in (cat, cat_rev):
+                with pytest.raises(TypeError):
+                    a < right
+
+        # Make sure that unequal comparison take the categories order in
+        # account
+        cat_rev = Categorical(
+            list("abc"), categories=list("cba"), ordered=True)
+        tm.assert_numpy_array_equal(cat_rev > "b",
+                                    np.array([True, False, False]))
+
+    def test_print(self):
+        """``repr`` shows the values and the ordered category list."""
+        expected = ("[a, b, b, a, a, c, c, c]\n"
+                    "Categories (3, object): [a < b < c]")
+        assert repr(self.factor) == expected
+
+
+class TestCategoricalGenericBlock(object):
+    """``Categorical`` data held inside ``Series`` and ``DataFrame``."""
+
+    def setup_method(self, method):
+        """Create ``self.factor`` and the binned frame ``self.cat``."""
+        self.factor = Categorical(['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c'])
+
+        frame = DataFrame({'value': np.random.randint(0, 10000, 100)})
+        bin_names = ["{0} - {1}".format(lo, lo + 499)
+                     for lo in range(0, 10000, 500)]
+        bin_labels = Categorical(bin_names, bin_names)
+
+        frame = frame.sort_values(by=['value'], ascending=True)
+        frame['value_group'] = pd.cut(frame.value, range(0, 10500, 500),
+                                      right=False, labels=bin_labels)
+        self.cat = frame
+
+    def test_basic(self):
+        """Categoricals can be wrapped in a Series or a DataFrame."""
+        n_rows = len(self.factor)
+
+        # test basic creation / coercion of categoricals
+        ser = Series(self.factor, name='A')
+        assert ser.dtype == 'category'
+        assert len(ser) == n_rows
+        # smoke-test the string conversions
+        str(ser.values)
+        str(ser)
+
+        # in a frame
+        frame = DataFrame({'A': self.factor})
+        tm.assert_series_equal(frame['A'], ser)
+        tm.assert_series_equal(frame.iloc[:, 0], ser)
+        assert len(frame) == n_rows
+        str(frame.values)
+        str(frame)
+
+        frame = DataFrame({'A': ser})
+        tm.assert_series_equal(frame['A'], ser)
+        assert len(frame) == n_rows
+        str(frame.values)
+        str(frame)
+
+        # multiples
+        frame = DataFrame({'A': ser, 'B': ser, 'C': 1})
+        col_a, col_b = frame['A'], frame['B']
+        tm.assert_series_equal(col_a, ser)
+        tm.assert_series_equal(col_b, ser, check_names=False)
+        assert col_b.name == 'B'
+        assert len(frame) == n_rows
+        str(frame.values)
+        str(frame)
+
+        # GH8623
+        people = DataFrame([[1, 'John P. Doe'], [2, 'Jane Dove'],
+                            [1, 'John P. Doe']],
+                           columns=['person_id', 'person_name'])
+        # doing this breaks transform
+        people['person_name'] = Categorical(people.person_name)
+
+        expected = people.iloc[0].person_name
+        assert people.person_name.iloc[0] == expected
+        assert people.person_name[0] == expected
+        assert people.person_name.loc[0] == expected
+
+    def test_describe(self):
+        """``describe`` on frames and series that hold categoricals."""
+
+        # Categoricals should not show up together with numerical columns
+        summary = self.cat.describe()
+        assert len(summary.columns) == 1
+
+        # In a frame, describe() for the cat should be the same as for string
+        # arrays (count, unique, top, freq)
+
+        ordered_cat = Categorical(["a", "b", "b", "b"],
+                                  categories=['a', 'b', 'c'], ordered=True)
+        expected = Series([4, 2, "b", 3],
+                          index=['count', 'unique', 'top', 'freq'])
+        tm.assert_series_equal(Series(ordered_cat).describe(), expected)
+
+        as_cat = Series(Categorical(["a", "b", "c", "c"]))
+        mixed = DataFrame({"cat": as_cat, "s": ["a", "b", "c", "c"]})
+        described = mixed.describe()
+        tm.assert_numpy_array_equal(described["cat"].values,
+                                    described["s"].values)
+
+    def test_groupby_sort(self):
+        """Grouping by a categorical yields groups in category order."""
+
+        # http://stackoverflow.com/questions/23814368/sorting-pandas-categorical-labels-after-groupby
+        # This should result in a properly sorted Series so that the plot
+        # has a sorted x axis
+        # self.cat.groupby(['value_group'])['value_group'].count().plot(kind='bar')
+
+        counts = self.cat.groupby(['value_group'])['value_group'].count()
+        # order the labels by their numeric lower bound
+        by_lower_bound = sorted(counts.index,
+                                key=lambda label: float(label.split()[0]))
+        expected = counts[by_lower_bound]
+        expected.index = CategoricalIndex(expected.index,
+                                          name=expected.index.name)
+        tm.assert_series_equal(counts, expected)
+
+    def test_astype_to_other(self):
+        """``astype`` from a categorical Series to other dtypes."""
+
+        s = self.cat['value_group']
+        tm.assert_series_equal(s.astype('category'), s)
+        tm.assert_series_equal(s.astype(CategoricalDtype()), s)
+        with pytest.raises(ValueError):
+            s.astype('float64')
+
+        letters = ['a', 'b', 'b', 'a', 'a', 'c', 'c', 'c']
+        tm.assert_series_equal(Series(Categorical(letters)).astype('str'),
+                               Series(letters))
+        digits = Series(Categorical(['1', '2', '3', '4']))
+        tm.assert_series_equal(digits.astype('int'),
+                               Series([1, 2, 3, 4]).astype(int))
+
+        # object don't sort correctly, so just compare that we have the same
+        # values
+        def assert_same_uniques(left, right):
+            tm.assert_almost_equal(
+                np.sort(np.unique(left)), np.sort(np.unique(right)))
+
+        expected = Series(np.array(s.values), name='value_group')
+        for object_dtype in ('object', np.object_):
+            assert_same_uniques(s.astype(object_dtype), expected)
+
+        # array conversion
+        tm.assert_almost_equal(np.array(s), np.array(s.values))
+
+        # valid conversion
+        valid_casts = [
+            lambda x: x.astype('category'),
+            lambda x: x.astype(CategoricalDtype()),
+            lambda x: x.astype('object').astype('category'),
+            lambda x: x.astype('object').astype(CategoricalDtype()),
+        ]
+        for cast in valid_casts:
+            # compare series values
+            # internal .categories can't be compared because it is sorted
+            tm.assert_series_equal(cast(s), s, check_categorical=False)
+
+        # invalid conversion (these are NOT a dtype)
+        invalid_casts = [
+            lambda x: x.astype(Categorical),
+            lambda x: x.astype('object').astype(Categorical),
+        ]
+        for cast in invalid_casts:
+            with pytest.raises(TypeError):
+                cast(s)
+
+    def test_numeric_like_ops(self):
+        """Arithmetic and numeric reductions on categoricals raise."""
+        arithmetic = ['__add__', '__sub__', '__mul__', '__truediv__']
+
+        # numeric ops should not succeed
+        for name in arithmetic:
+            with pytest.raises(TypeError):
+                getattr(self.cat, name)(self.cat)
+
+        # reduction ops should not succeed (unless specifically defined, e.g.
+        # min/max)
+        grouped = self.cat['value_group']
+        reductions = ['kurt', 'skew', 'var', 'std', 'mean', 'sum', 'median']
+        for name in reductions:
+            with pytest.raises(TypeError):
+                getattr(grouped, name)(numeric_only=False)
+
+        # mad technically works because it takes always the numeric data
+
+        # numpy ops
+        numbers = Series(Categorical([1, 2, 3, 4]))
+        with pytest.raises(TypeError):
+            np.sum(numbers)
+
+        # numeric ops on a Series
+        for name in arithmetic:
+            with pytest.raises(TypeError):
+                getattr(numbers, name)(2)
+
+        # invalid ufunc
+        with pytest.raises(TypeError):
+            np.log(numbers)
@@ -0,0 +1,110 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+import pandas.util.testing as tm
+from pandas import (Categorical, Index, Series, CategoricalIndex, isna)
+from pandas.compat import lrange
+from pandas.core.dtypes.dtypes import CategoricalDtype
+
+
+class TestCategoricalMissing(object):
+    """Missing-value handling on a bare ``Categorical``."""
+
+    def test_na_flags_int_categories(self):
+        """Codes of -1 are reported as missing by ``isna``."""
+        # #1457
+
+        categories = lrange(10)
+        codes = np.random.randint(0, 10, 20)
+        # mark every fifth position as missing
+        codes[::5] = -1
+
+        cat = Categorical(codes, categories, fastpath=True)
+        # smoke-test that repr copes with the missing codes
+        repr(cat)
+
+        tm.assert_numpy_array_equal(isna(cat), codes == -1)
+
+    def test_nan_handling(self):
+        """NaN is stored as code -1 and is not added to the categories."""
+
+        def check(cat, codes):
+            tm.assert_index_equal(cat.categories, Index(["a", "b"]))
+            tm.assert_numpy_array_equal(cat._codes,
+                                        np.array(codes, dtype=np.int8))
+
+        # Nans are represented as -1 in codes
+        c = Categorical(["a", "b", np.nan, "a"])
+        check(c, [0, 1, -1, 0])
+        c[1] = np.nan
+        check(c, [0, -1, -1, 0])
+
+        # A freshly constructed categorical is unaffected by the
+        # assignment above.
+        c = Categorical(["a", "b", np.nan, "a"])
+        check(c, [0, 1, -1, 0])
+
+    def test_set_dtype_nans(self):
+        """Values absent from the new dtype are recoded to -1."""
+        original = Categorical(['a', 'b', np.nan])
+        recoded = original._set_dtype(CategoricalDtype(['a', 'c']))
+        expected_codes = np.array([0, -1, -1], dtype='int8')
+        tm.assert_numpy_array_equal(recoded.codes, expected_codes)
+
+    def test_isna(self):
+        """``Categorical.isna`` flags the NaN entry."""
+        cat = Categorical(["a", "b", np.nan])
+        expected = np.array([False, False, True])
+        tm.assert_numpy_array_equal(cat.isna(), expected)
+
+    def test_set_item_nan(self):
+        """Assigning NaN keeps the existing categories."""
+        cat = Categorical([1, 2, 3])
+        cat[1] = np.nan
+
+        expected = Categorical([1, np.nan, 3], categories=[1, 2, 3])
+        tm.assert_categorical_equal(cat, expected)
+
+
+class TestCategoricalBlockMissing(object):
+    """Missing-value handling for categorical data held in a ``Series``."""
+
+    def test_value_counts_with_nan(self):
+        """``value_counts`` with and without ``dropna`` on categoricals."""
+        # see gh-9443
+
+        # sanity check: without any NaN, ``dropna`` must not matter
+        s = Series(["a", "b", "a"], dtype="category")
+        exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
+
+        res = s.value_counts(dropna=True)
+        tm.assert_series_equal(res, exp)
+
+        # was a verbatim repeat of the ``dropna=True`` call above, which
+        # left ``dropna=False`` unchecked on NaN-free data
+        res = s.value_counts(dropna=False)
+        tm.assert_series_equal(res, exp)
+
+        # same Series via two different constructions --> same behaviour
+        series = [
+            Series(["a", "b", None, "a", None, None], dtype="category"),
+            Series(Categorical(["a", "b", None, "a", None, None],
+                               categories=["a", "b"]))
+        ]
+
+        for s in series:
+            # None is a NaN value, so we exclude its count here
+            exp = Series([2, 1], index=CategoricalIndex(["a", "b"]))
+            res = s.value_counts(dropna=True)
+            tm.assert_series_equal(res, exp)
+
+            # we don't exclude the count of None and sort by counts
+            exp = Series([3, 2, 1], index=CategoricalIndex([np.nan, "a", "b"]))
+            res = s.value_counts(dropna=False)
+            tm.assert_series_equal(res, exp)
+
+            # When we aren't sorting by counts, and np.nan isn't a
+            # category, it should be last.
+            exp = Series([2, 1, 3], index=CategoricalIndex(["a", "b", np.nan]))
+            res = s.value_counts(dropna=False, sort=False)
+            tm.assert_series_equal(res, exp)
+
+    def test_nan_handling(self):
+        """NaN in a categorical Series is stored as code -1."""
+
+        # NaNs are represented as -1 in labels
+        s = Series(Categorical(["a", "b", np.nan, "a"]))
+        tm.assert_index_equal(s.cat.categories, Index(["a", "b"]))
+        tm.assert_numpy_array_equal(s.values.codes,
+                                    np.array([0, 1, -1, 0], dtype=np.int8))
@@ -0,0 +1,225 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import numpy as np
+
+import pandas.util.testing as tm
+from pandas import (Categorical, Series, date_range)
+
+
+class TestCategoricalOps(object):
+    """Comparison operators between a ``Categorical`` and scalars."""
+
+    def test_datetime_categorical_comparison(self):
+        """Ordered datetime categoricals compare with their own elements."""
+        stamps = Categorical(date_range('2014-01-01', periods=3),
+                             ordered=True)
+        first = stamps[0]
+        after_first = np.array([False, True, True])
+        tm.assert_numpy_array_equal(stamps > first, after_first)
+        tm.assert_numpy_array_equal(first < stamps, after_first)
+
+    def test_reflected_comparison_with_scalars(self):
+        """A scalar on the left-hand side gives the mirrored result."""
+        # GH8658
+        numbers = Categorical([1, 2, 3], ordered=True)
+        first = numbers[0]
+        after_first = np.array([False, True, True])
+        tm.assert_numpy_array_equal(numbers > first, after_first)
+        tm.assert_numpy_array_equal(first < numbers, after_first)
+
+    def test_comparison_with_unknown_scalars(self):
+        """Ordering against a scalar outside the categories raises."""
+        # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
+        # and following comparisons with scalars not in categories should raise
+        # for unequal comps, but not for equal/not equal
+        cat = Categorical([1, 2, 3], ordered=True)
+
+        with pytest.raises(TypeError):
+            cat < 4
+        with pytest.raises(TypeError):
+            cat > 4
+        with pytest.raises(TypeError):
+            4 < cat
+        with pytest.raises(TypeError):
+            4 > cat
+
+        none_equal = np.array([False, False, False])
+        all_differ = np.array([True, True, True])
+        tm.assert_numpy_array_equal(cat == 4, none_equal)
+        tm.assert_numpy_array_equal(cat != 4, all_differ)
+
+
+class TestCategoricalBlockOps(object):
+    """Comparison operators on categorical ``Series`` and ``Categorical``."""
+
+    def test_comparisons(self):
+        """Ordering comparisons honour category order and reject mismatches."""
+        tests_data = [(list("abc"), list("cba"), list("bbb")),
+                      ([1, 2, 3], [3, 2, 1], [2, 2, 2])]
+        for data, reverse, base in tests_data:
+            cat_rev = Series(
+                Categorical(data, categories=reverse, ordered=True))
+            cat_rev_base = Series(
+                Categorical(base, categories=reverse, ordered=True))
+            cat = Series(Categorical(data, ordered=True))
+            cat_base = Series(
+                Categorical(base, categories=cat.cat.categories, ordered=True))
+            s = Series(base)
+            a = np.array(base)
+
+            # comparisons need to take categories ordering into account
+            tm.assert_series_equal(cat_rev > cat_rev_base,
+                                   Series([True, False, False]))
+            tm.assert_series_equal(cat_rev < cat_rev_base,
+                                   Series([False, False, True]))
+            tm.assert_series_equal(cat > cat_base,
+                                   Series([False, False, True]))
+
+            scalar = base[1]
+            above = cat > scalar
+            above_values = cat.values > scalar
+            tm.assert_series_equal(above, Series([False, False, True]))
+            tm.assert_numpy_array_equal(above.values, above_values)
+            above_rev = cat_rev > scalar
+            above_rev_values = cat_rev.values > scalar
+            tm.assert_series_equal(above_rev, Series([True, False, False]))
+            tm.assert_numpy_array_equal(above_rev.values, above_rev_values)
+
+            # Only categories with same categories can be compared
+            with pytest.raises(TypeError):
+                cat > cat_rev
+
+            # categorical cannot be compared to Series or numpy array, and also
+            # not the other way around
+            for left, right in [(cat, s), (cat_rev, s),
+                                (cat, a), (cat_rev, a)]:
+                with pytest.raises(TypeError):
+                    left > right
+
+            for left, right in [(s, cat), (s, cat_rev),
+                                (a, cat), (a, cat_rev)]:
+                with pytest.raises(TypeError):
+                    left < right
+
+        # unequal comparison should raise for unordered cats
+        cat = Series(Categorical(list("abc")))
+        with pytest.raises(TypeError):
+            cat > "b"
+
+        cat = Series(Categorical(list("abc"), ordered=False))
+        with pytest.raises(TypeError):
+            cat > "b"
+
+        # https://github.com/pandas-dev/pandas/issues/9836#issuecomment-92123057
+        # and following comparisons with scalars not in categories should raise
+        # for unequal comps, but not for equal/not equal
+        cat = Series(Categorical(list("abc"), ordered=True))
+
+        with pytest.raises(TypeError):
+            cat < "d"
+        with pytest.raises(TypeError):
+            cat > "d"
+        with pytest.raises(TypeError):
+            "d" < cat
+        with pytest.raises(TypeError):
+            "d" > cat
+
+        tm.assert_series_equal(cat == "d", Series([False, False, False]))
+        tm.assert_series_equal(cat != "d", Series([True, True, True]))
+
+        # And test NaN handling...
+        cat = Series(Categorical(["a", "b", "c", np.nan]))
+        tm.assert_series_equal(cat == cat,
+                               Series([True, True, True, False]))
+
+    def test_cat_equality(self):
+        """Equality works across categorical/object; ordering does not."""
+
+        # GH 8938
+        # allow equality comparisons
+        cat_ser = Series(list('abc'), dtype="category")
+        obj_same = Series(list('abc'), dtype="object")
+        obj_other = Series(['a', 'b', 'cc'], dtype="object")
+        obj_perm = Series(list('acb'), dtype="object")
+        cat_same = Categorical(list('abc'))
+        cat_perm = Categorical(list('acb'))
+
+        # vs scalar
+        assert not (cat_ser == 'a').all()
+        assert ((cat_ser != 'a') == ~(cat_ser == 'a')).all()
+
+        assert not ('a' == cat_ser).all()
+        assert (cat_ser == 'a')[0]
+        assert ('a' == cat_ser)[0]
+        assert not ('a' != cat_ser)[0]
+
+        # vs list-like
+        assert (cat_ser == cat_ser).all()
+        assert not (cat_ser != cat_ser).all()
+
+        assert (cat_ser == list(cat_ser)).all()
+        assert (cat_ser == obj_same).all()
+        assert (obj_same == cat_ser).all()
+        assert ((~(cat_ser == obj_same)) == (cat_ser != obj_same)).all()
+        assert ((~(obj_same == cat_ser)) == (obj_same != cat_ser)).all()
+
+        for different in (obj_other, obj_perm):
+            assert not (cat_ser == different).all()
+            assert not (different == cat_ser).all()
+
+        # vs a cat-like
+        assert (cat_ser == cat_same).all()
+        assert (cat_same == cat_ser).all()
+        assert not (cat_ser == cat_perm).all()
+        assert not (cat_perm == cat_ser).all()
+
+        # ``!=`` must be the elementwise negation of ``==``
+        for other in (cat_same, cat_perm):
+            assert ((~(cat_ser == other) == (cat_ser != other)).all())
+            assert ((~(other == cat_ser) == (other != cat_ser)).all())
+
+        # non-equality is not comparable
+        with pytest.raises(TypeError):
+            cat_ser < obj_same
+        with pytest.raises(TypeError):
+            obj_same < cat_ser
+        with pytest.raises(TypeError):
+            cat_ser > obj_same
+        with pytest.raises(TypeError):
+            obj_same > cat_ser
+
+    @pytest.mark.parametrize('ctor', [
+        lambda *args, **kwargs: Categorical(*args, **kwargs),
+        lambda *args, **kwargs: Series(Categorical(*args, **kwargs)),
+    ])
+    def test_unordered_different_order_equal(self, ctor):
+        """Unordered categoricals compare by value, not category order."""
+        # https://github.com/pandas-dev/pandas/issues/16014
+
+        def build(values, categories):
+            return ctor(values, categories=categories, ordered=False)
+
+        left = build(['a', 'b'], ['a', 'b'])
+        right = build(['a', 'b'], ['b', 'a'])
+        assert (left == right).all()
+
+        left = build(['a', 'b'], ['a', 'b'])
+        right = build(['b', 'a'], ['b', 'a'])
+        assert (left != right).all()
+
+        left = build(['a', 'a'], ['a', 'b'])
+        right = build(['b', 'b'], ['b', 'a'])
+        assert (left != right).all()
+
+        left = build(['a', 'a'], ['a', 'b'])
+        right = build(['a', 'b'], ['b', 'a'])
+        matches = left == right
+        tm.assert_numpy_array_equal(np.array(matches),
+                                    np.array([True, False]))
+
+    def test_unordered_different_categories_raises(self):
+        """Comparing categoricals with different category sets raises."""
+        left = Categorical(['a', 'b'], categories=['a', 'b'], ordered=False)
+        right = Categorical(['a', 'c'], categories=['c', 'a'], ordered=False)
+        msg = "Categoricals can only be compared"
+        with tm.assert_raises_regex(TypeError, msg):
+            left == right
+
+    def test_compare_different_lengths(self):
+        """Comparing categoricals with differing category counts raises."""
+        two_cats = Categorical([], categories=['a', 'b'])
+        one_cat = Categorical([], categories=['a'])
+        msg = "Categories are different lengths"
+        with tm.assert_raises_regex(TypeError, msg):
+            two_cats == one_cat
@@ -0,0 +1,195 @@
+# -*- coding: utf-8 -*-
+
+import numpy as np
+
+import pandas.util.testing as tm
+from pandas import (Categorical, Index, Series, DataFrame)
+
+
+class TestCategoricalSort(object):
+    """Tests for ``argsort`` and ``sort_values`` on bare ``Categorical``."""
+
+    def test_argsort(self):
+        """``Categorical.argsort`` gives the sorting positions for both
+        ascending and descending order."""
+        c = Categorical([5, 3, 1, 4, 2], ordered=True)
+
+        # check_dtype=False: only the positions are compared, not the dtype
+        expected = np.array([2, 4, 1, 3, 0])
+        tm.assert_numpy_array_equal(c.argsort(ascending=True), expected,
+                                    check_dtype=False)
+
+        expected = expected[::-1]
+        tm.assert_numpy_array_equal(c.argsort(ascending=False), expected,
+                                    check_dtype=False)
+
+    def test_numpy_argsort(self):
+        """``np.argsort`` works on a ``Categorical``; passing ``axis`` or
+        ``order`` raises ``ValueError``."""
+        c = Categorical([5, 3, 1, 4, 2], ordered=True)
+
+        expected = np.array([2, 4, 1, 3, 0])
+        tm.assert_numpy_array_equal(np.argsort(c), expected,
+                                    check_dtype=False)
+
+        tm.assert_numpy_array_equal(np.argsort(c, kind='mergesort'), expected,
+                                    check_dtype=False)
+
+        msg = "the 'axis' parameter is not supported"
+        tm.assert_raises_regex(ValueError, msg, np.argsort,
+                               c, axis=0)
+
+        msg = "the 'order' parameter is not supported"
+        tm.assert_raises_regex(ValueError, msg, np.argsort,
+                               c, order='C')
+
+    def test_sort_values(self):
+        """``sort_values`` orders the values (ascending, descending and
+        in place) and leaves the categories unchanged."""
+
+        # unordered cats are sortable (smoke test: must simply not raise)
+        cat = Categorical(["a", "b", "b", "a"], ordered=False)
+        cat.sort_values()
+
+        cat = Categorical(["a", "c", "b", "d"], ordered=True)
+
+        # sort_values
+        res = cat.sort_values()
+        exp = np.array(["a", "b", "c", "d"], dtype=object)
+        tm.assert_numpy_array_equal(res.__array__(), exp)
+        tm.assert_index_equal(res.categories, cat.categories)
+
+        cat = Categorical(["a", "c", "b", "d"],
+                          categories=["a", "b", "c", "d"], ordered=True)
+        res = cat.sort_values()
+        exp = np.array(["a", "b", "c", "d"], dtype=object)
+        tm.assert_numpy_array_equal(res.__array__(), exp)
+        tm.assert_index_equal(res.categories, cat.categories)
+
+        res = cat.sort_values(ascending=False)
+        exp = np.array(["d", "c", "b", "a"], dtype=object)
+        tm.assert_numpy_array_equal(res.__array__(), exp)
+        tm.assert_index_equal(res.categories, cat.categories)
+
+        # sort (inplace order)
+        cat1 = cat.copy()
+        cat1.sort_values(inplace=True)
+        exp = np.array(["a", "b", "c", "d"], dtype=object)
+        tm.assert_numpy_array_equal(cat1.__array__(), exp)
+        # the in-place result lives in cat1, so its categories are the ones
+        # to verify (previously this re-checked the stale ``res``)
+        tm.assert_index_equal(cat1.categories, cat.categories)
+
+        # reverse
+        cat = Categorical(["a", "c", "c", "b", "d"], ordered=True)
+        res = cat.sort_values(ascending=False)
+        exp_val = np.array(["d", "c", "c", "b", "a"], dtype=object)
+        exp_categories = Index(["a", "b", "c", "d"])
+        tm.assert_numpy_array_equal(res.__array__(), exp_val)
+        tm.assert_index_equal(res.categories, exp_categories)
+
+    def test_sort_values_na_position(self):
+        """``na_position`` places missing values first or last regardless
+        of the ``ascending`` direction."""
+        # see gh-12882
+        cat = Categorical([5, 2, np.nan, 2, np.nan], ordered=True)
+        exp_categories = Index([2, 5])
+
+        exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
+        res = cat.sort_values()  # default arguments
+        tm.assert_numpy_array_equal(res.__array__(), exp)
+        tm.assert_index_equal(res.categories, exp_categories)
+
+        exp = np.array([np.nan, np.nan, 2.0, 2.0, 5.0])
+        res = cat.sort_values(ascending=True, na_position='first')
+        tm.assert_numpy_array_equal(res.__array__(), exp)
+        tm.assert_index_equal(res.categories, exp_categories)
+
+        exp = np.array([np.nan, np.nan, 5.0, 2.0, 2.0])
+        res = cat.sort_values(ascending=False, na_position='first')
+        tm.assert_numpy_array_equal(res.__array__(), exp)
+        tm.assert_index_equal(res.categories, exp_categories)
+
+        exp = np.array([2.0, 2.0, 5.0, np.nan, np.nan])
+        res = cat.sort_values(ascending=True, na_position='last')
+        tm.assert_numpy_array_equal(res.__array__(), exp)
+        tm.assert_index_equal(res.categories, exp_categories)
+
+        exp = np.array([5.0, 2.0, 2.0, np.nan, np.nan])
+        res = cat.sort_values(ascending=False, na_position='last')
+        tm.assert_numpy_array_equal(res.__array__(), exp)
+        tm.assert_index_equal(res.categories, exp_categories)
+
+        # same checks with string categories
+        cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
+        res = cat.sort_values(ascending=False, na_position='last')
+        exp_val = np.array(["d", "c", "b", "a", np.nan], dtype=object)
+        exp_categories = Index(["a", "b", "c", "d"])
+        tm.assert_numpy_array_equal(res.__array__(), exp_val)
+        tm.assert_index_equal(res.categories, exp_categories)
+
+        cat = Categorical(["a", "c", "b", "d", np.nan], ordered=True)
+        res = cat.sort_values(ascending=False, na_position='first')
+        exp_val = np.array([np.nan, "d", "c", "b", "a"], dtype=object)
+        exp_categories = Index(["a", "b", "c", "d"])
+        tm.assert_numpy_array_equal(res.__array__(), exp_val)
+        tm.assert_index_equal(res.categories, exp_categories)
+
+
+class TestCategoricalBlockSort(object):
+    """Sorting of categorical data held in a ``Series`` or ``DataFrame``."""
+
+    def test_sort_values(self):
+        """Exercise ``sort_values`` on categorical Series and on frames
+        sorted by plain, categorical and multiple columns."""
+        base = Categorical(["a", "b", "b", "a"], ordered=False)
+        ser = Series(base.copy())
+
+        # sort in the categories order
+        result = ser.sort_values()
+        expected = Series(Categorical(["a", "a", "b", "b"], ordered=False),
+                          index=[0, 3, 1, 2])
+        tm.assert_series_equal(result, expected)
+
+        ascending_vals = np.array(["a", "b", "c", "d"], dtype=np.object_)
+        descending_vals = np.array(["d", "c", "b", "a"], dtype=np.object_)
+
+        # categories inferred from the data
+        ser = Series(Categorical(["a", "c", "b", "d"], ordered=True))
+        tm.assert_numpy_array_equal(ser.sort_values().__array__(),
+                                    ascending_vals)
+
+        # categories given explicitly
+        explicit = Categorical(["a", "c", "b", "d"],
+                               categories=["a", "b", "c", "d"], ordered=True)
+        ser = Series(explicit)
+        tm.assert_numpy_array_equal(ser.sort_values().__array__(),
+                                    ascending_vals)
+        tm.assert_numpy_array_equal(
+            ser.sort_values(ascending=False).__array__(), descending_vals)
+
+        letters = ["a", "b", "c", "d"]
+        unsorted_cat = Categorical(letters, categories=["a", "b", "c", "d"],
+                                   ordered=False)
+        sorted_cat = Categorical(letters, categories=["d", "c", "b", "a"],
+                                 ordered=True)
+        frame = DataFrame({"unsort": unsorted_cat,
+                           "sort": sorted_cat,
+                           "string": letters,
+                           "values": [1, 2, 3, 4]})
+
+        # Cats must be sorted in a dataframe
+        by_string_desc = frame.sort_values(by=["string"], ascending=False)
+        tm.assert_numpy_array_equal(
+            by_string_desc["sort"].values.__array__(), descending_vals)
+        assert by_string_desc["sort"].dtype == "category"
+
+        by_sort_desc = frame.sort_values(by=["sort"], ascending=False)
+        by_string_asc = frame.sort_values(by=["string"], ascending=True)
+        tm.assert_series_equal(by_sort_desc["values"],
+                               by_string_asc["values"])
+        for column in ("sort", "unsort"):
+            assert by_sort_desc[column].dtype == "category"
+
+        # unordered cat, but we allow this
+        frame.sort_values(by=["unsort"], ascending=False)
+
+        # multi-columns sort
+        # GH 7848
+        grades = DataFrame({"id": [6, 5, 4, 3, 2, 1],
+                            "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']})
+        grades["grade"] = Categorical(grades["raw_grade"], ordered=True)
+        grades['grade'] = grades['grade'].cat.set_categories(['b', 'e', 'a'])
+
+        # 'grade' sorts according to the order of the categories, first on
+        # its own and then with 'id' as the tie-breaker
+        for keys, positions in [(['grade'], [1, 2, 5, 0, 3, 4]),
+                                (['grade', 'id'], [2, 1, 5, 4, 3, 0])]:
+            result = grades.sort_values(by=keys)
+            expected = grades.iloc[positions]
+            tm.assert_frame_equal(result, expected)
@@ -0,0 +1,26 @@
+# -*- coding: utf-8 -*-
+
+from pandas import Categorical
+
+import pandas.util.testing as tm
+
+
+class TestCategoricalSubclassing(object):
+    """A ``Categorical`` subclass keeps its type through construction,
+    ``from_codes`` and ``map``."""
+
+    def test_constructor(self):
+        """Direct construction yields the subclass with equal contents."""
+        values = ['a', 'b', 'c']
+        subclassed = tm.SubclassedCategorical(values)
+        assert isinstance(subclassed, tm.SubclassedCategorical)
+        tm.assert_categorical_equal(subclassed, Categorical(values))
+
+    def test_from_array(self):
+        """``from_codes`` called on the subclass yields the subclass."""
+        codes, categories = [1, 0, 2], ['a', 'b', 'c']
+        subclassed = tm.SubclassedCategorical.from_codes(codes, categories)
+        assert isinstance(subclassed, tm.SubclassedCategorical)
+        expected = Categorical.from_codes(codes, categories)
+        tm.assert_categorical_equal(subclassed, expected)
+
+    def test_map(self):
+        """``map`` returns an instance of the subclass."""
+        mapped = tm.SubclassedCategorical(['a', 'b', 'c']).map(
+            lambda x: x.upper())
+        assert isinstance(mapped, tm.SubclassedCategorical)
+        tm.assert_categorical_equal(mapped, Categorical(['A', 'B', 'C']))
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+
+import pytest
+
+import pandas.util.testing as tm
+
+
+class TestCategoricalWarnings(object):
+    """Warning-related regression tests for ``Categorical``."""
+
+    def test_tab_complete_warning(self, ip):
+        """Tab-completing on a ``Categorical`` in IPython emits no warning.
+
+        ``ip`` is a fixture supplied from outside this module; it is used
+        here to run code and to request completions.
+        """
+        # https://github.com/pandas-dev/pandas/issues/16409
+        pytest.importorskip('IPython', minversion="6.0.0")
+        from IPython.core.completer import provisionalcompleter
+
+        # Only ``pd`` is imported in the snippet, so the class must be
+        # referenced as ``pd.Categorical``; a bare ``Categorical`` would be
+        # an unbound name there and ``c`` would not be created.
+        code = "import pandas as pd; c = pd.Categorical([])"
+        ip.run_code(code)
+        with tm.assert_produces_warning(None):
+            with provisionalcompleter('ignore'):
+                list(ip.Completer.completions('c.', 1))