Merge pull request #10729 from TomAugspurger/categorical-value_counts

Tom Augspurger · Tom Augspurger · commit 4309dac743c8 · 2015-08-04T12:38:52.000-05:00
API: CategoricalIndex for value_counts
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -457,6 +457,8 @@ Other API Changes
 ^^^^^^^^^^^^^^^^^
 
 - Line and kde plot with ``subplots=True`` now uses default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`)
+- Calling the ``.value_counts`` method on a Series with ``categorical`` dtype now returns a
+  Series with a ``CategoricalIndex`` (:issue:`10704`)
 - Enable writing Excel files in :ref:`memory <_io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
 - Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
 - Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -1027,6 +1027,7 @@ def value_counts(self, dropna=True):
         """
         import pandas.hashtable as htable
         from pandas.core.series import Series
+        from pandas.core.index import CategoricalIndex
 
         cat = self.dropna() if dropna else self
         keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes))
@@ -1036,10 +1037,12 @@ def value_counts(self, dropna=True):
         if not dropna and -1 in keys:
             ix = np.append(ix, -1)
         result = result.reindex(ix, fill_value=0)
-        result.index = (np.append(cat.categories, np.nan)
+        index = (np.append(cat.categories, np.nan)
             if not dropna and -1 in keys
             else cat.categories)
 
+        result.index = CategoricalIndex(index, self.categories, self.ordered)
+
         return result
 
     def get_values(self):
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -4,7 +4,7 @@
 import numpy as np
 from numpy.random import RandomState
 
-from pandas.core.api import Series, Categorical
+from pandas.core.api import Series, Categorical, CategoricalIndex
 import pandas as pd
 
 import pandas.core.algorithms as algos
@@ -290,9 +290,15 @@ def test_value_counts(self):
         factor = cut(arr, 4)
 
         tm.assertIsInstance(factor, Categorical)
-
         result = algos.value_counts(factor)
-        expected = algos.value_counts(np.asarray(factor))
+        cats = ['(-1.194, -0.535]',
+                '(-0.535, 0.121]',
+                '(0.121, 0.777]',
+                '(0.777, 1.433]'
+        ]
+        expected_index = CategoricalIndex(cats, cats, ordered=True)
+        expected = Series([1, 1, 1, 1],
+                          index=expected_index)
         tm.assert_series_equal(result.sort_index(), expected.sort_index())
 
     def test_value_counts_bins(self):
@@ -332,6 +338,57 @@ def test_value_counts_nat(self):
         tm.assert_series_equal(algos.value_counts(dt), exp_dt)
         # TODO same for (timedelta)
 
+    def test_categorical(self):
+        s = Series(pd.Categorical(list('aaabbc')))
+        result = s.value_counts()
+        expected = pd.Series([3, 2, 1], index=pd.CategoricalIndex(['a', 'b', 'c']))
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+        # the ``ordered`` flag must carry over to the resulting index
+        s = s.cat.as_ordered()
+        result = s.value_counts()
+        expected.index = expected.index.as_ordered()
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+    def test_categorical_nans(self):
+        s = Series(pd.Categorical(list('aaaaabbbcc'))) # 4,3,2,1 (nan)
+        s.iloc[1] = np.nan
+        result = s.value_counts()
+        expected = pd.Series([4, 3, 2],
+                             index=pd.CategoricalIndex(['a', 'b', 'c'],
+                                                       categories=['a', 'b', 'c']))
+        tm.assert_series_equal(result, expected, check_index_type=True)
+        result = s.value_counts(dropna=False)
+        expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex(
+            ['a', 'b',  'c', np.nan]))
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+        # out of order
+        s = Series(pd.Categorical(list('aaaaabbbcc'),
+                                  ordered=True, categories=['b', 'a', 'c']))
+        s.iloc[1] = np.nan
+        result = s.value_counts()
+        expected = pd.Series([4, 3, 2],
+                             index=pd.CategoricalIndex(['a', 'b', 'c'],
+                                                       categories=['b', 'a', 'c'],
+                                                       ordered=True))
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+        result = s.value_counts(dropna=False)
+        expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex(
+            ['a', 'b',  'c', np.nan], categories=['b', 'a', 'c'], ordered=True))
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+    def test_categorical_zeroes(self):
+        # keep the `d` category with 0
+        s = Series(pd.Categorical(list('bbbaac'), categories=list('abcd'),
+                                  ordered=True))
+        result = s.value_counts()
+        expected = Series([3, 2, 1, 0], index=pd.Categorical(
+            ['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True))
+        tm.assert_series_equal(result, expected, check_index_type=True)
+
+
     def test_dropna(self):
         # https://github.com/pydata/pandas/issues/9443#issuecomment-73719328
 
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -458,7 +458,8 @@ def test_describe(self):
         desc = cat.describe()
         expected = DataFrame.from_dict(dict(counts=[1, 2, 1],
                                             freqs=[1/4., 2/4., 1/4.],
-                                            categories=[1,2,np.nan]
+                                            categories=Categorical([1,2,np.nan],
+                                                                   [1, 2])
                                             )
                                             ).set_index('categories')
         tm.assert_frame_equal(desc, expected)