Skip to content

Commit 4226afe

Browse files
committed
Merge pull request #4502 from hayd/value_count_bins
ENH add bins argument to value_counts
2 parents c467051 + 85f191c commit 4226afe

File tree

7 files changed

+119
-22
lines changed

7 files changed

+119
-22
lines changed

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@ pandas 0.13
4646
the index of the sheet to read in (:issue:`4301`).
4747
- ``get_dummies`` works with NaN (:issue:`4446`)
4848
- Added a test for ``read_clipboard()`` and ``to_clipboard()`` (:issue:`4282`)
49+
- Added ``bins`` argument to ``value_counts`` (:issue:`3945`); ``sort`` and
50+
``ascending`` are now available in the Series method as well as the top-level function.
4951
- Text parser now treats anything that reads like inf ("inf", "Inf", "-Inf",
5052
"iNf", etc.) as infinity (:issue:`4220`, :issue:`4219`), affecting
5153
``read_table``, ``read_csv``, etc.

pandas/core/algorithms.py

+22-4
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
148148
return labels, uniques
149149

150150

151-
def value_counts(values, sort=True, ascending=False, normalize=False):
151+
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None):
152152
"""
153153
Compute a histogram of the counts of non-null values
154154
@@ -161,33 +161,51 @@ def value_counts(values, sort=True, ascending=False, normalize=False):
161161
Sort in ascending order
162162
normalize: boolean, default False
163163
If True then compute a relative histogram
164+
bins : integer, optional
165+
Rather than count values, group them into half-open bins,
166+
a convenience for pd.cut, only works with numeric data
164167
165168
Returns
166169
-------
167170
value_counts : Series
171+
168172
"""
169173
from pandas.core.series import Series
174+
from pandas.tools.tile import cut
175+
176+
values = Series(values).values
170177

171-
values = np.asarray(values)
178+
if bins is not None:
179+
try:
180+
cat, bins = cut(values, bins, retbins=True)
181+
except TypeError:
182+
raise TypeError("bins argument only works with numeric data.")
183+
values = cat.labels
172184

173185
if com.is_integer_dtype(values.dtype):
174186
values = com._ensure_int64(values)
175187
keys, counts = htable.value_count_int64(values)
176-
elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):
177188

189+
elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):
178190
dtype = values.dtype
179191
values = values.view(np.int64)
180192
keys, counts = htable.value_count_int64(values)
181193

182194
# convert the keys back to the dtype we came in
183-
keys = Series(keys,dtype=dtype)
195+
keys = Series(keys, dtype=dtype)
196+
184197
else:
185198
mask = com.isnull(values)
186199
values = com._ensure_object(values)
187200
keys, counts = htable.value_count_object(values, mask)
188201

189202
result = Series(counts, index=com._values_from_object(keys))
190203

204+
if bins is not None:
205+
# TODO: This next line should be more efficient
206+
result = result.reindex(np.arange(len(cat.levels)), fill_value=0)
207+
result.index = bins[:-1]
208+
191209
if sort:
192210
result.sort()
193211
if not ascending:

pandas/core/categorical.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
from pandas.core.base import PandasObject
77
from pandas.core.index import Index
88
import pandas.core.common as com
9-
from pandas.core.frame import DataFrame
109

1110

1211
def _cat_compare_op(op):
@@ -182,6 +181,7 @@ def describe(self):
182181
Returns a dataframe with frequency and counts by level.
183182
"""
184183
#Hack?
184+
from pandas.core.frame import DataFrame
185185
grouped = DataFrame(self.labels).groupby(0)
186186
counts = grouped.count().values.squeeze()
187187
freqs = counts/float(counts.sum())

pandas/core/series.py

+15-3
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,7 @@
2727
_is_index_slice, _maybe_convert_indices)
2828
from pandas.core import generic
2929
from pandas.core.internals import SingleBlockManager
30+
from pandas.core.categorical import Categorical
3031
import pandas.core.expressions as expressions
3132
from pandas.tseries.index import DatetimeIndex
3233
from pandas.tseries.period import PeriodIndex, Period
@@ -579,6 +580,10 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
579580
index = data.index
580581
else:
581582
data = data.reindex(index, copy=copy)
583+
elif isinstance(data, Categorical):
584+
if name is None:
585+
name = data.name
586+
data = np.asarray(data)
582587
elif isinstance(data, types.GeneratorType):
583588
data = list(data)
584589
elif isinstance(data, (set, frozenset)):
@@ -1525,7 +1530,7 @@ def count(self, level=None):
15251530

15261531
return notnull(_values_from_object(self)).sum()
15271532

1528-
def value_counts(self, normalize=False):
1533+
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None):
15291534
"""
15301535
Returns Series containing counts of unique values. The resulting Series
15311536
will be in descending order so that the first element is the most
@@ -1536,14 +1541,21 @@ def value_counts(self, normalize=False):
15361541
normalize: boolean, default False
15371542
If True then the Series returned will contain the relative
15381543
frequencies of the unique values.
1544+
sort : boolean, default True
1545+
Sort by values
1546+
ascending : boolean, default False
1547+
Sort in ascending order
1548+
bins : integer, optional
1549+
Rather than count values, group them into half-open bins,
1550+
a convenience for pd.cut, only works with numeric data
15391551
15401552
Returns
15411553
-------
15421554
counts : Series
15431555
"""
15441556
from pandas.core.algorithms import value_counts
1545-
return value_counts(self.values, sort=True, ascending=False,
1546-
normalize=normalize)
1557+
return value_counts(self.values, sort=sort, ascending=ascending,
1558+
normalize=normalize, bins=bins)
15471559

15481560
def unique(self):
15491561
"""

pandas/tests/test_algos.py

+39-1
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
import numpy as np
55

6-
from pandas.core.api import Series
6+
from pandas.core.api import Series, Categorical
77
import pandas as pd
88

99
import pandas.core.algorithms as algos
@@ -63,6 +63,44 @@ def test_on_index_object(self):
6363

6464
tm.assert_almost_equal(result, expected)
6565

66+
class TestValueCounts(unittest.TestCase):
67+
_multiprocess_can_split_ = True
68+
69+
def test_value_counts(self):
70+
from pandas.tools.tile import cut
71+
72+
arr = np.random.randn(4)
73+
factor = cut(arr, 4)
74+
75+
tm.assert_isinstance(factor, Categorical)
76+
77+
result = algos.value_counts(factor)
78+
expected = algos.value_counts(np.asarray(factor))
79+
tm.assert_series_equal(result, expected)
80+
81+
def test_value_counts_bins(self):
82+
s = [1, 2, 3, 4]
83+
result = algos.value_counts(s, bins=1)
84+
self.assertEqual(result.tolist(), [4])
85+
self.assertEqual(result.index[0], 0.997)
86+
87+
result = algos.value_counts(s, bins=2, sort=False)
88+
self.assertEqual(result.tolist(), [2, 2])
89+
self.assertEqual(result.index[0], 0.997)
90+
self.assertEqual(result.index[1], 2.5)
91+
92+
def test_value_counts_dtypes(self):
93+
result = algos.value_counts([1, 1.])
94+
self.assertEqual(len(result), 1)
95+
96+
result = algos.value_counts([1, 1.], bins=1)
97+
self.assertEqual(len(result), 1)
98+
99+
result = algos.value_counts(Series([1, 1., '1'])) # object
100+
self.assertEqual(len(result), 2)
101+
102+
self.assertRaises(TypeError, lambda s: algos.value_counts(s, bins=1), ['1', 1])
103+
66104

67105
def test_quantile():
68106
s = Series(np.random.randn(100))

pandas/tests/test_categorical.py

-13
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77

88
import numpy as np
99

10-
from pandas.core.api import value_counts
1110
from pandas.core.categorical import Categorical
1211
from pandas.core.index import Index, Int64Index, MultiIndex
1312
from pandas.core.frame import DataFrame
@@ -89,18 +88,6 @@ def test_comparisons(self):
8988
expected = np.repeat(False, len(self.factor))
9089
self.assert_(np.array_equal(result, expected))
9190

92-
def test_value_counts(self):
93-
from pandas.tools.tile import cut
94-
95-
arr = np.random.randn(4)
96-
factor = cut(arr, 4)
97-
98-
tm.assert_isinstance(factor, Categorical)
99-
100-
result = value_counts(factor)
101-
expected = value_counts(np.asarray(factor))
102-
tm.assert_series_equal(result, expected)
103-
10491
def test_na_flags_int_levels(self):
10592
# #1457
10693

pandas/tests/test_series.py

+40
Original file line numberDiff line numberDiff line change
@@ -371,6 +371,16 @@ def test_constructor_generator(self):
371371
exp.index = lrange(10, 20)
372372
assert_series_equal(result, exp)
373373

374+
def test_constructor_categorical(self):
375+
cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'])
376+
res = Series(cat)
377+
exp = Series({0: 'a', 1: 'b', 2: 'c', 3: 'a', 4: 'b', 5: 'c'})
378+
assert_series_equal(res, exp)
379+
380+
cat.name = 'foo'
381+
res = Series(cat)
382+
self.assertEqual(res.name, cat.name)
383+
374384
def test_constructor_maskedarray(self):
375385
data = ma.masked_all((3,), dtype=float)
376386
result = Series(data)
@@ -2979,13 +2989,43 @@ def test_value_counts_nunique(self):
29792989
expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
29802990
assert_series_equal(hist, expected)
29812991

2992+
# don't sort, have to sort after the fact as not sorting is platform-dep
2993+
hist = s.value_counts(sort=False)
2994+
hist.sort()
2995+
expected = Series([3, 1, 4, 2], index=list('acbd'))
2996+
expected.sort()
2997+
assert_series_equal(hist, expected)
2998+
2999+
# sort ascending
3000+
hist = s.value_counts(ascending=True)
3001+
expected = Series([1, 2, 3, 4], index=list('cdab'))
3002+
assert_series_equal(hist, expected)
3003+
29823004
# relative histogram.
29833005
hist = s.value_counts(normalize=True)
29843006
expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
29853007
assert_series_equal(hist, expected)
29863008

29873009
self.assertEquals(s.nunique(), 4)
29883010

3011+
# bins
3012+
self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)
3013+
3014+
s1 = Series([1, 1, 2, 3])
3015+
res1 = s1.value_counts(bins=1)
3016+
exp1 = Series({0.998: 4})
3017+
assert_series_equal(res1, exp1)
3018+
res1n = s1.value_counts(bins=1, normalize=True)
3019+
exp1n = Series({0.998: 1.0})
3020+
assert_series_equal(res1n, exp1n)
3021+
3022+
res4 = s1.value_counts(bins=4)
3023+
exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
3024+
assert_series_equal(res4, exp4)
3025+
res4n = s1.value_counts(bins=4, normalize=True)
3026+
exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
3027+
assert_series_equal(res4n, exp4n)
3028+
29893029
# handle NA's properly
29903030
s[5:7] = np.nan
29913031
hist = s.value_counts()

0 commit comments

Comments
 (0)