Skip to content

Commit 4309dac

Browse files
author
Tom Augspurger
committed
Merge pull request #10729 from TomAugspurger/categorical-value_counts
API: CategoricalIndex for value_counts
2 parents 94e394a + 6bdcb16 commit 4309dac

File tree

4 files changed

+68
-5
lines changed

4 files changed

+68
-5
lines changed

doc/source/whatsnew/v0.17.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -457,6 +457,8 @@ Other API Changes
457457
^^^^^^^^^^^^^^^^^
458458

459459
- Line and kde plot with ``subplots=True`` now uses default colors, not all black. Specify ``color='k'`` to draw all lines in black (:issue:`9894`)
460+
- Calling the ``.value_counts`` method on a Series with ``categorical`` dtype now returns a
461+
Series with a ``CategoricalIndex`` (:issue:`10704`)
460462
- Enable writing Excel files in :ref:`memory <io.excel_writing_buffer>` using StringIO/BytesIO (:issue:`7074`)
461463
- Enable serialization of lists and dicts to strings in ExcelWriter (:issue:`8188`)
462464
- Allow passing ``kwargs`` to the interpolation methods (:issue:`10378`).

pandas/core/categorical.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1027,6 +1027,7 @@ def value_counts(self, dropna=True):
10271027
"""
10281028
import pandas.hashtable as htable
10291029
from pandas.core.series import Series
1030+
from pandas.core.index import CategoricalIndex
10301031

10311032
cat = self.dropna() if dropna else self
10321033
keys, counts = htable.value_count_int64(com._ensure_int64(cat._codes))
@@ -1036,10 +1037,12 @@ def value_counts(self, dropna=True):
10361037
if not dropna and -1 in keys:
10371038
ix = np.append(ix, -1)
10381039
result = result.reindex(ix, fill_value=0)
1039-
result.index = (np.append(cat.categories, np.nan)
1040+
index = (np.append(cat.categories, np.nan)
10401041
if not dropna and -1 in keys
10411042
else cat.categories)
10421043

1044+
result.index = CategoricalIndex(index, self.categories, self.ordered)
1045+
10431046
return result
10441047

10451048
def get_values(self):

pandas/tests/test_algos.py

+60-3
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
import numpy as np
55
from numpy.random import RandomState
66

7-
from pandas.core.api import Series, Categorical
7+
from pandas.core.api import Series, Categorical, CategoricalIndex
88
import pandas as pd
99

1010
import pandas.core.algorithms as algos
@@ -290,9 +290,15 @@ def test_value_counts(self):
290290
factor = cut(arr, 4)
291291

292292
tm.assertIsInstance(factor, Categorical)
293-
294293
result = algos.value_counts(factor)
295-
expected = algos.value_counts(np.asarray(factor))
294+
cats = ['(-1.194, -0.535]',
295+
'(-0.535, 0.121]',
296+
'(0.121, 0.777]',
297+
'(0.777, 1.433]'
298+
]
299+
expected_index = CategoricalIndex(cats, cats, ordered=True)
300+
expected = Series([1, 1, 1, 1],
301+
index=expected_index)
296302
tm.assert_series_equal(result.sort_index(), expected.sort_index())
297303

298304
def test_value_counts_bins(self):
@@ -332,6 +338,57 @@ def test_value_counts_nat(self):
332338
tm.assert_series_equal(algos.value_counts(dt), exp_dt)
333339
# TODO same for (timedelta)
334340

341+
def test_categorical(self):
342+
s = Series(pd.Categorical(list('aaabbc')))
343+
result = s.value_counts()
344+
expected = pd.Series([3, 2, 1], index=pd.CategoricalIndex(['a', 'b', 'c']))
345+
tm.assert_series_equal(result, expected, check_index_type=True)
346+
347+
# the ordered flag of the categorical must carry over to the resulting index
348+
s = s.cat.as_ordered()
349+
result = s.value_counts()
350+
expected.index = expected.index.as_ordered()
351+
tm.assert_series_equal(result, expected, check_index_type=True)
352+
353+
def test_categorical_nans(self):
354+
s = Series(pd.Categorical(list('aaaaabbbcc'))) # counts after the NaN assignment below: a=4, b=3, c=2, NaN=1
355+
s.iloc[1] = np.nan
356+
result = s.value_counts()
357+
expected = pd.Series([4, 3, 2],
358+
index=pd.CategoricalIndex(['a', 'b', 'c'],
359+
categories=['a', 'b', 'c']))
360+
tm.assert_series_equal(result, expected, check_index_type=True)
361+
result = s.value_counts(dropna=False)
362+
expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex(
363+
['a', 'b', 'c', np.nan]))
364+
tm.assert_series_equal(result, expected, check_index_type=True)
365+
366+
# out of order
367+
s = Series(pd.Categorical(list('aaaaabbbcc'),
368+
ordered=True, categories=['b', 'a', 'c']))
369+
s.iloc[1] = np.nan
370+
result = s.value_counts()
371+
expected = pd.Series([4, 3, 2],
372+
index=pd.CategoricalIndex(['a', 'b', 'c'],
373+
categories=['b', 'a', 'c'],
374+
ordered=True))
375+
tm.assert_series_equal(result, expected, check_index_type=True)
376+
377+
result = s.value_counts(dropna=False)
378+
expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex(
379+
['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True))
380+
tm.assert_series_equal(result, expected, check_index_type=True)
381+
382+
def test_categorical_zeroes(self):
383+
# keep the `d` category with 0
384+
s = Series(pd.Categorical(list('bbbaac'), categories=list('abcd'),
385+
ordered=True))
386+
result = s.value_counts()
387+
expected = Series([3, 2, 1, 0], index=pd.Categorical(
388+
['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True))
389+
tm.assert_series_equal(result, expected, check_index_type=True)
390+
391+
335392
def test_dropna(self):
336393
# https://github.com/pydata/pandas/issues/9443#issuecomment-73719328
337394

pandas/tests/test_categorical.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,8 @@ def test_describe(self):
458458
desc = cat.describe()
459459
expected = DataFrame.from_dict(dict(counts=[1, 2, 1],
460460
freqs=[1/4., 2/4., 1/4.],
461-
categories=[1,2,np.nan]
461+
categories=Categorical([1,2,np.nan],
462+
[1, 2])
462463
)
463464
).set_index('categories')
464465
tm.assert_frame_equal(desc, expected)

0 commit comments

Comments
 (0)