From 85f191cbdca7f1d83aab05e226cb921b03e9b638 Mon Sep 17 00:00:00 2001
From: Andy Hayden
Date: Wed, 7 Aug 2013 13:47:39 +0100
Subject: [PATCH] ENH add bins argument to value_counts

ENH Series method accepts Categorical

Also adds the arguments present in the top-level function (sort and
ascending) to the Series method.
---
 doc/source/release.rst           |  2 ++
 pandas/core/algorithms.py        | 26 +++++++++++++++++----
 pandas/core/categorical.py       |  2 +-
 pandas/core/series.py            | 18 +++++++++++---
 pandas/tests/test_algos.py       | 40 +++++++++++++++++++++++++++++++-
 pandas/tests/test_categorical.py | 13 -----------
 pandas/tests/test_series.py      | 40 ++++++++++++++++++++++++++++++++
 7 files changed, 119 insertions(+), 22 deletions(-)

diff --git a/doc/source/release.rst b/doc/source/release.rst
index b301dcb80445a..dc8b05a97b75a 100644
--- a/doc/source/release.rst
+++ b/doc/source/release.rst
@@ -41,6 +41,8 @@ pandas 0.13
   - ``read_excel`` now supports an integer in its ``sheetname`` argument giving
     the index of the sheet to read in (:issue:`4301`).
   - Added a test for ``read_clipboard()`` and ``to_clipboard()`` (:issue:`4282`)
+  - Added ``bins`` argument to ``value_counts`` (:issue:`3945`); ``sort`` and
+    ``ascending`` are now available in the Series method as well as the top-level function.
   - Text parser now treats anything that reads like inf ("inf", "Inf", "-Inf",
     "iNf", etc.) to infinity. (:issue:`4220`, :issue:`4219`), affecting
     ``read_table``, ``read_csv``, etc.
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
index f1d78dc34957b..f6b1131120aa6 100644
--- a/pandas/core/algorithms.py
+++ b/pandas/core/algorithms.py
@@ -148,7 +148,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
     return labels, uniques
 
 
-def value_counts(values, sort=True, ascending=False, normalize=False):
+def value_counts(values, sort=True, ascending=False, normalize=False, bins=None):
     """
     Compute a histogram of the counts of non-null values
 
@@ -161,26 +161,39 @@ def value_counts(values, sort=True, ascending=False, normalize=False):
         Sort in ascending order
     normalize: boolean, default False
         If True then compute a relative histogram
+    bins : integer, optional
+        Rather than count values, group them into half-open bins,
+        a convenience for pd.cut, only works with numeric data
 
     Returns
    -------
     value_counts : Series
+
     """
     from pandas.core.series import Series
+    from pandas.tools.tile import cut
+
+    values = Series(values).values
 
-    values = np.asarray(values)
+    if bins is not None:
+        try:
+            cat, bins = cut(values, bins, retbins=True)
+        except TypeError:
+            raise TypeError("bins argument only works with numeric data.")
+        values = cat.labels
 
     if com.is_integer_dtype(values.dtype):
        values = com._ensure_int64(values)
        keys, counts = htable.value_count_int64(values)
 
-    elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):
+    elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):
 
        dtype = values.dtype
        values = values.view(np.int64)
        keys, counts = htable.value_count_int64(values)
 
        # convert the keys back to the dtype we came in
-        keys = Series(keys,dtype=dtype)
+        keys = Series(keys, dtype=dtype)
+
     else:
        mask = com.isnull(values)
        values = com._ensure_object(values)
@@ -188,6 +201,11 @@ def value_counts(values, sort=True, ascending=False, normalize=False):
 
     result = Series(counts, index=com._values_from_object(keys))
 
+    if bins is not None:
+        # TODO: This next line should be more efficient
+        result = result.reindex(np.arange(len(cat.levels)), fill_value=0)
+        result.index = bins[:-1]
+
     if sort:
        result.sort()
        if not ascending:
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
index b25a027adedd9..b085738018950 100644
--- a/pandas/core/categorical.py
+++ b/pandas/core/categorical.py
@@ -6,7 +6,6 @@
 from pandas.core.base import PandasObject
 from pandas.core.index import Index
 import pandas.core.common as com
-from pandas.core.frame import DataFrame
 
 
 def _cat_compare_op(op):
@@ -182,6 +181,7 @@ def describe(self):
        Returns a dataframe with frequency and counts by level.
        """
        #Hack?
+        from pandas.core.frame import DataFrame
        grouped = DataFrame(self.labels).groupby(0)
        counts = grouped.count().values.squeeze()
        freqs = counts/float(counts.sum())
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 050a9de2b23dc..ad4295d05fca7 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -27,6 +27,7 @@
     _is_index_slice, _maybe_convert_indices)
 from pandas.core import generic
 from pandas.core.internals import SingleBlockManager
+from pandas.core.categorical import Categorical
 import pandas.core.expressions as expressions
 from pandas.tseries.index import DatetimeIndex
 from pandas.tseries.period import PeriodIndex, Period
@@ -579,6 +580,10 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
                index = data.index
            else:
                data = data.reindex(index, copy=copy)
+        elif isinstance(data, Categorical):
+            if name is None:
+                name = data.name
+            data = np.asarray(data)
        elif isinstance(data, types.GeneratorType):
            data = list(data)
        elif isinstance(data, (set, frozenset)):
@@ -1525,7 +1530,7 @@ def count(self, level=None):
 
        return notnull(_values_from_object(self)).sum()
 
-    def value_counts(self, normalize=False):
+    def value_counts(self, normalize=False, sort=True, ascending=False, bins=None):
        """
        Returns Series containing counts of unique values. The resulting Series
        will be in descending order so that the first element is the most
@@ -1536,14 +1541,21 @@ def value_counts(self, normalize=False):
        normalize: boolean, default False
            If True then the Series returned will contain the relative
            frequencies of the unique values.
+        sort : boolean, default True
+            Sort by values
+        ascending : boolean, default False
+            Sort in ascending order
+        bins : integer, optional
+            Rather than count values, group them into half-open bins,
+            a convenience for pd.cut, only works with numeric data
 
        Returns
        -------
        counts : Series
        """
        from pandas.core.algorithms import value_counts
-        return value_counts(self.values, sort=True, ascending=False,
-                            normalize=normalize)
+        return value_counts(self.values, sort=sort, ascending=ascending,
+                            normalize=normalize, bins=bins)
 
     def unique(self):
        """
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
index d0a050984a07f..6458d7c31d689 100644
--- a/pandas/tests/test_algos.py
+++ b/pandas/tests/test_algos.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 
-from pandas.core.api import Series
+from pandas.core.api import Series, Categorical
 import pandas as pd
 
 import pandas.core.algorithms as algos
@@ -63,6 +63,44 @@ def test_on_index_object(self):
        tm.assert_almost_equal(result, expected)
 
 
+class TestValueCounts(unittest.TestCase):
+    _multiprocess_can_split_ = True
+
+    def test_value_counts(self):
+        from pandas.tools.tile import cut
+
+        arr = np.random.randn(4)
+        factor = cut(arr, 4)
+
+        tm.assert_isinstance(factor, Categorical)
+
+        result = algos.value_counts(factor)
+        expected = algos.value_counts(np.asarray(factor))
+        tm.assert_series_equal(result, expected)
+
+    def test_value_counts_bins(self):
+        s = [1, 2, 3, 4]
+        result = algos.value_counts(s, bins=1)
+        self.assertEqual(result.tolist(), [4])
+        self.assertEqual(result.index[0], 0.997)
+
+        result = algos.value_counts(s, bins=2, sort=False)
+        self.assertEqual(result.tolist(), [2, 2])
+        self.assertEqual(result.index[0], 0.997)
+        self.assertEqual(result.index[1], 2.5)
+
+    def test_value_counts_dtypes(self):
+        result = algos.value_counts([1, 1.])
+        self.assertEqual(len(result), 1)
+
+        result = algos.value_counts([1, 1.], bins=1)
+        self.assertEqual(len(result), 1)
+
+        result = algos.value_counts(Series([1, 1., '1']))  # object
+        self.assertEqual(len(result), 2)
+
+        self.assertRaises(TypeError, lambda s: algos.value_counts(s, bins=1), ['1', 1])
+
+
 def test_quantile():
     s = Series(np.random.randn(100))
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
index 29d104e9c465c..71e9f36c26e70 100644
--- a/pandas/tests/test_categorical.py
+++ b/pandas/tests/test_categorical.py
@@ -7,7 +7,6 @@
 
 import numpy as np
 
-from pandas.core.api import value_counts
 from pandas.core.categorical import Categorical
 from pandas.core.index import Index, Int64Index, MultiIndex
 from pandas.core.frame import DataFrame
@@ -89,18 +88,6 @@ def test_comparisons(self):
 
        expected = np.repeat(False, len(self.factor))
        self.assert_(np.array_equal(result, expected))
 
-    def test_value_counts(self):
-        from pandas.tools.tile import cut
-
-        arr = np.random.randn(4)
-        factor = cut(arr, 4)
-
-        tm.assert_isinstance(factor, Categorical)
-
-        result = value_counts(factor)
-        expected = value_counts(np.asarray(factor))
-        tm.assert_series_equal(result, expected)
-
     def test_na_flags_int_levels(self):
        # #1457
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index e0780e8674600..3599d3a9a9ac0 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -371,6 +371,16 @@ def test_constructor_generator(self):
        exp.index = lrange(10, 20)
        assert_series_equal(result, exp)
 
+    def test_constructor_categorical(self):
+        cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'])
+        res = Series(cat)
+        exp = Series({0: 'a', 1: 'b', 2: 'c', 3: 'a', 4: 'b', 5: 'c'})
+        assert_series_equal(res, exp)
+
+        cat.name = 'foo'
+        res = Series(cat)
+        self.assertEqual(res.name, cat.name)
+
     def test_constructor_maskedarray(self):
        data = ma.masked_all((3,), dtype=float)
        result = Series(data)
@@ -2966,6 +2976,18 @@ def test_value_counts_nunique(self):
        expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
        assert_series_equal(hist, expected)
 
+        # don't sort, have to sort after the fact as not sorting is platform-dep
+        hist = s.value_counts(sort=False)
+        hist.sort()
+        expected = Series([3, 1, 4, 2], index=list('acbd'))
+        expected.sort()
+        assert_series_equal(hist, expected)
+
+        # sort ascending
+        hist = s.value_counts(ascending=True)
+        expected = Series([1, 2, 3, 4], index=list('cdab'))
+        assert_series_equal(hist, expected)
+
        # relative histogram.
        hist = s.value_counts(normalize=True)
        expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
@@ -2973,6 +2995,24 @@ def test_value_counts_nunique(self):
 
        self.assertEquals(s.nunique(), 4)
 
+        # bins
+        self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)
+
+        s1 = Series([1, 1, 2, 3])
+        res1 = s1.value_counts(bins=1)
+        exp1 = Series({0.998: 4})
+        assert_series_equal(res1, exp1)
+        res1n = s1.value_counts(bins=1, normalize=True)
+        exp1n = Series({0.998: 1.0})
+        assert_series_equal(res1n, exp1n)
+
+        res4 = s1.value_counts(bins=4)
+        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
+        assert_series_equal(res4, exp4)
+        res4n = s1.value_counts(bins=4, normalize=True)
+        exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
+        assert_series_equal(res4n, exp4n)
+
        # handle NA's properly
        s[5:7] = np.nan
        hist = s.value_counts()
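
Usage sketch (editor's illustration, not part of the patch): the snippet below shows the behaviour the new arguments are meant to provide, assuming the patch is applied. The exact bin edges (for example the 0.997/0.998 values asserted in the tests) come from pd.cut widening the lowest edge slightly and are illustrative only.

    import numpy as np
    import pandas as pd

    s = pd.Series([1, 1, 2, 3])

    # Counts per unique value, sorted descending (existing behaviour).
    s.value_counts()

    # sort/ascending are now forwarded by the Series method instead of
    # being hard-coded to sort=True, ascending=False.
    s.value_counts(sort=False)
    s.value_counts(ascending=True)

    # bins=N groups numeric values into N half-open intervals via pd.cut
    # before counting; the resulting index holds the left bin edges, and
    # empty bins are kept with a count of 0.
    s.value_counts(bins=4)

    # bins only works with numeric data; non-numeric input raises TypeError.
    # pd.value_counts(['a', 'b'], bins=1)  # -> TypeError

    # A Series can now be constructed directly from a Categorical, so the
    # result of pd.cut can be counted the same way.
    cat = pd.cut(np.random.randn(20), 4)
    pd.Series(cat).value_counts()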