Skip to content

Commit e17fa6f

Browse files
TomAugspurger and jbuyl
authored and committed
API: CategoricalIndex for value_counts
Changes ``Categorical.value_counts`` to return a Series with a CategoricalIndex. Previously the Series had an Index.
1 parent 71bf222 commit e17fa6f

File tree

4 files changed

+69
-6
lines changed

4 files changed

+69
-6
lines changed

doc/source/whatsnew/v0.17.0.txt

+3-1
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,8 @@ Other API Changes
457457
^^^^^^^^^^^^^^^^^
458458

459459
- Line and kde plot with ``subplots=True`` now uses default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`)
460+
- Calling the ``.value_counts`` method on a Series with ``categorical`` dtype now returns a
461+
Series with a ``CategoricalIndex`` (:issue:`10704`)
460462
- Enable writing Excel files in :ref:`memory <_io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
461463
- Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
462464
- Allow passing `kwargs` to the interpolation methods (:issue:`10378`).
@@ -606,4 +608,4 @@ Bug Fixes
606608
- Bug in vectorised setting of timestamp columns with python ``datetime.date`` and numpy ``datetime64`` (:issue:`10408`, :issue:`10412`)
607609

608610
- Bug in ``pd.DataFrame`` when constructing an empty DataFrame with a string dtype (:issue:`9428`)
609-
- Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10739`)
611+
- Bug in ``read_stata`` when reading a file with a different order set in ``columns`` (:issue:`10739`)

pandas/core/categorical.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1027,6 +1027,7 @@ def value_counts(self, dropna=True):
10271027
"""
10281028
import pandas.hashtable as htable
10291029
from pandas.core.series import Series
1030+
from pandas.core.index import CategoricalIndex
10301031

10311032
cat = self.dropna() if dropna else self
10321033
keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes))
@@ -1036,10 +1037,12 @@ def value_counts(self, dropna=True):
10361037
if not dropna and -1 in keys:
10371038
ix = np.append(ix, -1)
10381039
result = result.reindex(ix, fill_value=0)
1039-
result.index = (np.append(cat.categories, np.nan)
1040+
index = (np.append(cat.categories, np.nan)
10401041
if not dropna and -1 in keys
10411042
else cat.categories)
10421043

1044+
result.index = CategoricalIndex(index, self.categories, self.ordered)
1045+
10431046
return result
10441047

10451048
def get_values(self):

pandas/tests/test_algos.py

+60-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import numpy as np
55
from numpy.random import RandomState
66

7-
from pandas.core.api import Series, Categorical
7+
from pandas.core.api import Series, Categorical, CategoricalIndex
88
import pandas as pd
99

1010
import pandas.core.algorithms as algos
@@ -246,9 +246,15 @@ def test_value_counts(self):
246246
factor = cut(arr, 4)
247247

248248
tm.assertIsInstance(factor, Categorical)
249-
250249
result = algos.value_counts(factor)
251-
expected = algos.value_counts(np.asarray(factor))
250+
cats = ['(-1.194, -0.535]',
251+
'(-0.535, 0.121]',
252+
'(0.121, 0.777]',
253+
'(0.777, 1.433]'
254+
]
255+
expected_index = CategoricalIndex(cats, cats, ordered=True)
256+
expected = Series([1, 1, 1, 1],
257+
index=expected_index)
252258
tm.assert_series_equal(result.sort_index(), expected.sort_index())
253259

254260
def test_value_counts_bins(self):
@@ -288,6 +294,57 @@ def test_value_counts_nat(self):
288294
tm.assert_series_equal(algos.value_counts(dt), exp_dt)
289295
# TODO same for (timedelta)
290296

297+
def test_categorical(self):
298+
s = Series(pd.Categorical(list('aaabbc')))
299+
result = s.value_counts()
300+
expected = pd.Series([3, 2, 1], index=pd.CategoricalIndex(['a', 'b', 'c']))
301+
tm.assert_series_equal(result, expected, check_index_type=True)
302+
303+
# preserve order?
304+
s = s.cat.as_ordered()
305+
result = s.value_counts()
306+
expected.index = expected.index.as_ordered()
307+
tm.assert_series_equal(result, expected, check_index_type=True)
308+
309+
def test_categorical_nans(self):
310+
s = Series(pd.Categorical(list('aaaaabbbcc'))) # 4,3,2,1 (nan)
311+
s.iloc[1] = np.nan
312+
result = s.value_counts()
313+
expected = pd.Series([4, 3, 2],
314+
index=pd.CategoricalIndex(['a', 'b', 'c'],
315+
categories=['a', 'b', 'c']))
316+
tm.assert_series_equal(result, expected, check_index_type=True)
317+
result = s.value_counts(dropna=False)
318+
expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex(
319+
['a', 'b', 'c', np.nan]))
320+
tm.assert_series_equal(result, expected, check_index_type=True)
321+
322+
# out of order
323+
s = Series(pd.Categorical(list('aaaaabbbcc'),
324+
ordered=True, categories=['b', 'a', 'c']))
325+
s.iloc[1] = np.nan
326+
result = s.value_counts()
327+
expected = pd.Series([4, 3, 2],
328+
index=pd.CategoricalIndex(['a', 'b', 'c'],
329+
categories=['b', 'a', 'c'],
330+
ordered=True))
331+
tm.assert_series_equal(result, expected, check_index_type=True)
332+
333+
result = s.value_counts(dropna=False)
334+
expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex(
335+
['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True))
336+
tm.assert_series_equal(result, expected, check_index_type=True)
337+
338+
def test_categorical_zeroes(self):
339+
# keep the `d` category with 0
340+
s = Series(pd.Categorical(list('bbbaac'), categories=list('abcd'),
341+
ordered=True))
342+
result = s.value_counts()
343+
expected = Series([3, 2, 1, 0], index=pd.Categorical(
344+
['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True))
345+
tm.assert_series_equal(result, expected, check_index_type=True)
346+
347+
291348
def test_dropna(self):
292349
# https://github.com/pydata/pandas/issues/9443#issuecomment-73719328
293350

pandas/tests/test_categorical.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,8 @@ def test_describe(self):
458458
desc = cat.describe()
459459
expected = DataFrame.from_dict(dict(counts=[1, 2, 1],
460460
freqs=[1/4., 2/4., 1/4.],
461-
categories=[1,2,np.nan]
461+
categories=Categorical([1,2,np.nan],
462+
[1, 2])
462463
)
463464
).set_index('categories')
464465
tm.assert_frame_equal(desc, expected)

0 commit comments

Comments
 (0)