Skip to content

ENH add bins argument to value_counts #4502

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 27, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ pandas 0.13
- ``read_excel`` now supports an integer in its ``sheetname`` argument giving
the index of the sheet to read in (:issue:`4301`).
- Added a test for ``read_clipboard()`` and ``to_clipboard()`` (:issue:`4282`)
- Added ``bins`` argument to ``value_counts`` (:issue:`3945`); the ``sort`` and
  ``ascending`` arguments are now available in the Series method as well as
  the top-level function.
- Text parser now treats anything that reads like inf ("inf", "Inf", "-Inf",
  "iNf", etc.) as infinity. (:issue:`4220`, :issue:`4219`), affecting
``read_table``, ``read_csv``, etc.
Expand Down
26 changes: 22 additions & 4 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
return labels, uniques


def value_counts(values, sort=True, ascending=False, normalize=False):
def value_counts(values, sort=True, ascending=False, normalize=False, bins=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

maybe? add raise_on_error=False, which you could automatically just exclude non-numeric

or just do it by definition; I think we do this for example in describe (it should just work);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think Categorical was the thing that throws.

Is there a neat way to just include only numerics from a Series?

Maybe call numeric_only like groupby: dda2363. Not sure it should be default though.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

going to push something to enable _get_numeric_data/_get_bool_data in all NDFrame shortly

"""
Compute a histogram of the counts of non-null values

Expand All @@ -161,33 +161,51 @@ def value_counts(values, sort=True, ascending=False, normalize=False):
Sort in ascending order
normalize: boolean, default False
If True then compute a relative histogram
bins : integer, optional
Rather than count values, group them into half-open bins,
convenience for pd.cut, only works with numeric data

Returns
-------
value_counts : Series

"""
from pandas.core.series import Series
from pandas.tools.tile import cut

values = Series(values).values

values = np.asarray(values)
if bins is not None:
try:
cat, bins = cut(values, bins, retbins=True)
except TypeError:
raise TypeError("bins argument only works with numeric data.")
values = cat.labels

if com.is_integer_dtype(values.dtype):
values = com._ensure_int64(values)
keys, counts = htable.value_count_int64(values)
elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):

elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):
dtype = values.dtype
values = values.view(np.int64)
keys, counts = htable.value_count_int64(values)

# convert the keys back to the dtype we came in
keys = Series(keys,dtype=dtype)
keys = Series(keys, dtype=dtype)

else:
mask = com.isnull(values)
values = com._ensure_object(values)
keys, counts = htable.value_count_object(values, mask)

result = Series(counts, index=com._values_from_object(keys))

if bins is not None:
# TODO: This next line should be more efficient
result = result.reindex(np.arange(len(cat.levels)), fill_value=0)
result.index = bins[:-1]

if sort:
result.sort()
if not ascending:
Expand Down
2 changes: 1 addition & 1 deletion pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
from pandas.core.base import PandasObject
from pandas.core.index import Index
import pandas.core.common as com
from pandas.core.frame import DataFrame


def _cat_compare_op(op):
Expand Down Expand Up @@ -182,6 +181,7 @@ def describe(self):
Returns a dataframe with frequency and counts by level.
"""
#Hack?
from pandas.core.frame import DataFrame
grouped = DataFrame(self.labels).groupby(0)
counts = grouped.count().values.squeeze()
freqs = counts/float(counts.sum())
Expand Down
18 changes: 15 additions & 3 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
_is_index_slice, _maybe_convert_indices)
from pandas.core import generic
from pandas.core.internals import SingleBlockManager
from pandas.core.categorical import Categorical
import pandas.core.expressions as expressions
from pandas.tseries.index import DatetimeIndex
from pandas.tseries.period import PeriodIndex, Period
Expand Down Expand Up @@ -579,6 +580,10 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
index = data.index
else:
data = data.reindex(index, copy=copy)
elif isinstance(data, Categorical):
if name is None:
name = data.name
data = np.asarray(data)
elif isinstance(data, types.GeneratorType):
data = list(data)
elif isinstance(data, (set, frozenset)):
Expand Down Expand Up @@ -1525,7 +1530,7 @@ def count(self, level=None):

return notnull(_values_from_object(self)).sum()

def value_counts(self, normalize=False):
def value_counts(self, normalize=False, sort=True, ascending=False, bins=None):
"""
Returns Series containing counts of unique values. The resulting Series
will be in descending order so that the first element is the most
Expand All @@ -1536,14 +1541,21 @@ def value_counts(self, normalize=False):
normalize: boolean, default False
If True then the Series returned will contain the relative
frequencies of the unique values.
sort : boolean, default True
Sort by values
ascending : boolean, default False
Sort in ascending order
bins : integer, optional
Rather than count values, group them into half-open bins,
a convenience for pd.cut, only works with numeric data

Returns
-------
counts : Series
"""
from pandas.core.algorithms import value_counts
return value_counts(self.values, sort=True, ascending=False,
normalize=normalize)
return value_counts(self.values, sort=sort, ascending=ascending,
normalize=normalize, bins=bins)

def unique(self):
"""
Expand Down
40 changes: 39 additions & 1 deletion pandas/tests/test_algos.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import numpy as np

from pandas.core.api import Series
from pandas.core.api import Series, Categorical
import pandas as pd

import pandas.core.algorithms as algos
Expand Down Expand Up @@ -63,6 +63,44 @@ def test_on_index_object(self):

tm.assert_almost_equal(result, expected)

class TestValueCounts(unittest.TestCase):
    """Tests for the top-level ``algos.value_counts`` function."""

    _multiprocess_can_split_ = True

    def test_value_counts(self):
        # Counting a Categorical must give the same result as counting the
        # equivalent plain ndarray of its values.
        from pandas.tools.tile import cut

        data = np.random.randn(4)
        factor = cut(data, 4)

        tm.assert_isinstance(factor, Categorical)

        observed = algos.value_counts(factor)
        expected = algos.value_counts(np.asarray(factor))
        tm.assert_series_equal(observed, expected)

    def test_value_counts_bins(self):
        # bins=N groups the values through pd.cut before counting.
        data = [1, 2, 3, 4]

        counted = algos.value_counts(data, bins=1)
        self.assertEqual(counted.tolist(), [4])
        self.assertEqual(counted.index[0], 0.997)

        counted = algos.value_counts(data, bins=2, sort=False)
        self.assertEqual(counted.tolist(), [2, 2])
        self.assertEqual(counted.index[0], 0.997)
        self.assertEqual(counted.index[1], 2.5)

    def test_value_counts_dtypes(self):
        # An int and a float that compare equal collapse into one entry.
        self.assertEqual(len(algos.value_counts([1, 1.])), 1)
        self.assertEqual(len(algos.value_counts([1, 1.], bins=1)), 1)

        # With object dtype the string stays distinct from the numbers.
        self.assertEqual(len(algos.value_counts(Series([1, 1., '1']))), 2)

        # Binning non-numeric data must raise TypeError.
        self.assertRaises(TypeError,
                          lambda vals: algos.value_counts(vals, bins=1),
                          ['1', 1])


def test_quantile():
s = Series(np.random.randn(100))
Expand Down
13 changes: 0 additions & 13 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@

import numpy as np

from pandas.core.api import value_counts
from pandas.core.categorical import Categorical
from pandas.core.index import Index, Int64Index, MultiIndex
from pandas.core.frame import DataFrame
Expand Down Expand Up @@ -89,18 +88,6 @@ def test_comparisons(self):
expected = np.repeat(False, len(self.factor))
self.assert_(np.array_equal(result, expected))

def test_value_counts(self):
from pandas.tools.tile import cut

arr = np.random.randn(4)
factor = cut(arr, 4)

tm.assert_isinstance(factor, Categorical)

result = value_counts(factor)
expected = value_counts(np.asarray(factor))
tm.assert_series_equal(result, expected)

def test_na_flags_int_levels(self):
# #1457

Expand Down
40 changes: 40 additions & 0 deletions pandas/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -371,6 +371,16 @@ def test_constructor_generator(self):
exp.index = lrange(10, 20)
assert_series_equal(result, exp)

def test_constructor_categorical(self):
    # A Categorical handed to the Series constructor is expanded to its
    # level values, and the Categorical's name carries over to the Series.
    factor = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'])

    expected = Series({0: 'a', 1: 'b', 2: 'c', 3: 'a', 4: 'b', 5: 'c'})
    assert_series_equal(Series(factor), expected)

    factor.name = 'foo'
    self.assertEqual(Series(factor).name, factor.name)

def test_constructor_maskedarray(self):
data = ma.masked_all((3,), dtype=float)
result = Series(data)
Expand Down Expand Up @@ -2966,13 +2976,43 @@ def test_value_counts_nunique(self):
expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
assert_series_equal(hist, expected)

# don't sort, have to sort after the fact as not sorting is platform-dep
hist = s.value_counts(sort=False)
hist.sort()
expected = Series([3, 1, 4, 2], index=list('acbd'))
expected.sort()
assert_series_equal(hist, expected)

# sort ascending
hist = s.value_counts(ascending=True)
expected = Series([1, 2, 3, 4], index=list('cdab'))
assert_series_equal(hist, expected)

# relative histogram.
hist = s.value_counts(normalize=True)
expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
assert_series_equal(hist, expected)

self.assertEquals(s.nunique(), 4)

# bins
self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

s1 = Series([1, 1, 2, 3])
res1 = s1.value_counts(bins=1)
exp1 = Series({0.998: 4})
assert_series_equal(res1, exp1)
res1n = s1.value_counts(bins=1, normalize=True)
exp1n = Series({0.998: 1.0})
assert_series_equal(res1n, exp1n)

res4 = s1.value_counts(bins=4)
exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
assert_series_equal(res4, exp4)
res4n = s1.value_counts(bins=4, normalize=True)
exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
assert_series_equal(res4n, exp4n)

# handle NA's properly
s[5:7] = np.nan
hist = s.value_counts()
Expand Down