From 6bdcb164929cad21375db6b1de803f34edd52890 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Thu, 30 Jul 2015 18:07:19 -0500 Subject: [PATCH] API: CategoricalIndex for value_counts Changes ``Categorical.value_counts`` to return a Series with a CategoricalIndex. Previously the Series had a plain Index. --- doc/source/whatsnew/v0.17.0.txt | 2 + pandas/core/categorical.py | 5 ++- pandas/tests/test_algos.py | 63 ++++++++++++++++++++++++++++++-- pandas/tests/test_categorical.py | 3 +- 4 files changed, 68 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index 16c6c639a489e..87a3042061b16 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -457,6 +457,8 @@ Other API Changes ^^^^^^^^^^^^^^^^^ - Line and kde plot with ``subplots=True`` now uses default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`) +- Calling the ``.value_counts`` method on a Series with ``categorical`` dtype now returns a +  Series with a ``CategoricalIndex`` (:issue:`10704`) - Enable writing Excel files in :ref:`memory <_io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`) - Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`) - Allow passing `kwargs` to the interpolation methods (:issue:`10378`). 
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 1604705ff824a..b0d564caa5826 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1027,6 +1027,7 @@ def value_counts(self, dropna=True): """ import pandas.hashtable as htable from pandas.core.series import Series + from pandas.core.index import CategoricalIndex cat = self.dropna() if dropna else self keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes)) @@ -1036,10 +1037,12 @@ def value_counts(self, dropna=True): if not dropna and -1 in keys: ix = np.append(ix, -1) result = result.reindex(ix, fill_value=0) - result.index = (np.append(cat.categories, np.nan) + index = (np.append(cat.categories, np.nan) if not dropna and -1 in keys else cat.categories) + result.index = CategoricalIndex(index, self.categories, self.ordered) + return result def get_values(self): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index cf72f0e433634..6164b1b4906de 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -4,7 +4,7 @@ import numpy as np from numpy.random import RandomState -from pandas.core.api import Series, Categorical +from pandas.core.api import Series, Categorical, CategoricalIndex import pandas as pd import pandas.core.algorithms as algos @@ -290,9 +290,15 @@ def test_value_counts(self): factor = cut(arr, 4) tm.assertIsInstance(factor, Categorical) - result = algos.value_counts(factor) - expected = algos.value_counts(np.asarray(factor)) + cats = ['(-1.194, -0.535]', + '(-0.535, 0.121]', + '(0.121, 0.777]', + '(0.777, 1.433]' + ] + expected_index = CategoricalIndex(cats, cats, ordered=True) + expected = Series([1, 1, 1, 1], + index=expected_index) tm.assert_series_equal(result.sort_index(), expected.sort_index()) def test_value_counts_bins(self): @@ -332,6 +338,57 @@ def test_value_counts_nat(self): tm.assert_series_equal(algos.value_counts(dt), exp_dt) # TODO same for (timedelta) + def test_categorical(self): + s = 
Series(pd.Categorical(list('aaabbc'))) + result = s.value_counts() + expected = pd.Series([3, 2, 1], index=pd.CategoricalIndex(['a', 'b', 'c'])) + tm.assert_series_equal(result, expected, check_index_type=True) + + # preserve order? + s = s.cat.as_ordered() + result = s.value_counts() + expected.index = expected.index.as_ordered() + tm.assert_series_equal(result, expected, check_index_type=True) + + def test_categorical_nans(self): + s = Series(pd.Categorical(list('aaaaabbbcc'))) # 4,3,2,1 (nan) + s.iloc[1] = np.nan + result = s.value_counts() + expected = pd.Series([4, 3, 2], + index=pd.CategoricalIndex(['a', 'b', 'c'], + categories=['a', 'b', 'c'])) + tm.assert_series_equal(result, expected, check_index_type=True) + result = s.value_counts(dropna=False) + expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex( + ['a', 'b', 'c', np.nan])) + tm.assert_series_equal(result, expected, check_index_type=True) + + # out of order + s = Series(pd.Categorical(list('aaaaabbbcc'), + ordered=True, categories=['b', 'a', 'c'])) + s.iloc[1] = np.nan + result = s.value_counts() + expected = pd.Series([4, 3, 2], + index=pd.CategoricalIndex(['a', 'b', 'c'], + categories=['b', 'a', 'c'], + ordered=True)) + tm.assert_series_equal(result, expected, check_index_type=True) + + result = s.value_counts(dropna=False) + expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex( + ['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True)) + tm.assert_series_equal(result, expected, check_index_type=True) + + def test_categorical_zeroes(self): + # keep the `d` category with 0 + s = Series(pd.Categorical(list('bbbaac'), categories=list('abcd'), + ordered=True)) + result = s.value_counts() + expected = Series([3, 2, 1, 0], index=pd.Categorical( + ['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True)) + tm.assert_series_equal(result, expected, check_index_type=True) + + def test_dropna(self): # https://github.com/pydata/pandas/issues/9443#issuecomment-73719328 diff --git 
a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index a85fd52ed6eb3..a065d03d4ad72 100755 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -458,7 +458,8 @@ def test_describe(self): desc = cat.describe() expected = DataFrame.from_dict(dict(counts=[1, 2, 1], freqs=[1/4., 2/4., 1/4.], - categories=[1,2,np.nan] + categories=Categorical([1,2,np.nan], + [1, 2]) ) ).set_index('categories') tm.assert_frame_equal(desc, expected)