Merge pull request #4502 from hayd/value_count_bins

hayd · hayd · commit 4226afe4b77b · 2013-08-26T17:07:49.000-07:00
ENH add bins argument to value_counts
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -46,6 +46,8 @@ pandas 0.13
     the index of the sheet to read in (:issue:`4301`).
   - ``get_dummies`` works with NaN (:issue:`4446`)
   - Added a test for ``read_clipboard()`` and ``to_clipboard()`` (:issue:`4282`)
+  - Added ``bins`` argument to ``value_counts`` (:issue:`3945`); ``sort`` and
+    ``ascending`` are now available in the Series method as well as the top-level function.
   - Text parser now treats anything that reads like inf ("inf", "Inf", "-Inf",
     "iNf", etc.) to infinity. (:issue:`4220`, :issue:`4219`), affecting
     ``read_table``, ``read_csv``, etc.
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -148,7 +148,7 @@ def factorize(values, sort=False, order=None, na_sentinel=-1):
     return labels, uniques
 
 
-def value_counts(values, sort=True, ascending=False, normalize=False):
+def value_counts(values, sort=True, ascending=False, normalize=False, bins=None):
     """
     Compute a histogram of the counts of non-null values
 
@@ -161,33 +161,51 @@ def value_counts(values, sort=True, ascending=False, normalize=False):
         Sort in ascending order
     normalize: boolean, default False
         If True then compute a relative histogram
+    bins : integer, optional
+        Rather than count values, group them into half-open bins,
+        a convenience for pd.cut, only works with numeric data
 
     Returns
     -------
     value_counts : Series
+
     """
     from pandas.core.series import Series
+    from pandas.tools.tile import cut
+
+    values = Series(values).values
 
-    values = np.asarray(values)
+    if bins is not None:
+        try:
+            cat, bins = cut(values, bins, retbins=True)
+        except TypeError:
+            raise TypeError("bins argument only works with numeric data.")
+        values = cat.labels
 
     if com.is_integer_dtype(values.dtype):
         values = com._ensure_int64(values)
         keys, counts = htable.value_count_int64(values)
-    elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):
 
+    elif issubclass(values.dtype.type, (np.datetime64,np.timedelta64)):
         dtype = values.dtype
         values = values.view(np.int64)
         keys, counts = htable.value_count_int64(values)
 
         # convert the keys back to the dtype we came in
-        keys = Series(keys,dtype=dtype)
+        keys = Series(keys, dtype=dtype)
+
     else:
         mask = com.isnull(values)
         values = com._ensure_object(values)
         keys, counts = htable.value_count_object(values, mask)
 
     result = Series(counts, index=com._values_from_object(keys))
 
+    if bins is not None:
+        # TODO: This next line should be more efficient
+        result = result.reindex(np.arange(len(cat.levels)), fill_value=0)
+        result.index = bins[:-1]
+
     if sort:
         result.sort()
         if not ascending:
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -6,7 +6,6 @@
 from pandas.core.base import PandasObject
 from pandas.core.index import Index
 import pandas.core.common as com
-from pandas.core.frame import DataFrame
 
 
 def _cat_compare_op(op):
@@ -182,6 +181,7 @@ def describe(self):
         Returns a dataframe with frequency and counts by level.
         """
         #Hack?
+        from pandas.core.frame import DataFrame
         grouped = DataFrame(self.labels).groupby(0)
         counts = grouped.count().values.squeeze()
         freqs = counts/float(counts.sum())
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -27,6 +27,7 @@
     _is_index_slice, _maybe_convert_indices)
 from pandas.core import generic
 from pandas.core.internals import SingleBlockManager
+from pandas.core.categorical import Categorical
 import pandas.core.expressions as expressions
 from pandas.tseries.index import DatetimeIndex
 from pandas.tseries.period import PeriodIndex, Period
@@ -579,6 +580,10 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
                     index = data.index
                 else:
                     data = data.reindex(index, copy=copy)
+            elif isinstance(data, Categorical):
+                if name is None:
+                    name = data.name
+                data = np.asarray(data)
             elif isinstance(data, types.GeneratorType):
                 data = list(data)
             elif isinstance(data, (set, frozenset)):
@@ -1525,7 +1530,7 @@ def count(self, level=None):
 
         return notnull(_values_from_object(self)).sum()
 
-    def value_counts(self, normalize=False):
+    def value_counts(self, normalize=False, sort=True, ascending=False, bins=None):
         """
         Returns Series containing counts of unique values. The resulting Series
         will be in descending order so that the first element is the most
@@ -1536,14 +1541,21 @@ def value_counts(self, normalize=False):
         normalize: boolean, default False
             If True then the Series returned will contain the relative
             frequencies of the unique values.
+        sort : boolean, default True
+            Sort by values
+        ascending : boolean, default False
+            Sort in ascending order
+        bins : integer, optional
+            Rather than count values, group them into half-open bins,
+            a convenience for pd.cut, only works with numeric data
 
         Returns
         -------
         counts : Series
         """
         from pandas.core.algorithms import value_counts
-        return value_counts(self.values, sort=True, ascending=False,
-                            normalize=normalize)
+        return value_counts(self.values, sort=sort, ascending=ascending,
+                            normalize=normalize, bins=bins)
 
     def unique(self):
         """
diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py
@@ -3,7 +3,7 @@
 
 import numpy as np
 
-from pandas.core.api import Series
+from pandas.core.api import Series, Categorical
 import pandas as pd
 
 import pandas.core.algorithms as algos
@@ -63,6 +63,44 @@ def test_on_index_object(self):
 
         tm.assert_almost_equal(result, expected)
 
+class TestValueCounts(unittest.TestCase):
+    _multiprocess_can_split_ = True
+
+    def test_value_counts(self):
+        from pandas.tools.tile import cut
+
+        arr = np.random.randn(4)
+        factor = cut(arr, 4)
+
+        tm.assert_isinstance(factor, Categorical)
+
+        result = algos.value_counts(factor)
+        expected = algos.value_counts(np.asarray(factor))
+        tm.assert_series_equal(result, expected)
+
+    def test_value_counts_bins(self):
+        s = [1, 2, 3, 4]
+        result = algos.value_counts(s, bins=1)
+        self.assertEqual(result.tolist(), [4])
+        self.assertEqual(result.index[0], 0.997)
+
+        result = algos.value_counts(s, bins=2, sort=False)
+        self.assertEqual(result.tolist(), [2, 2])
+        self.assertEqual(result.index[0], 0.997)        
+        self.assertEqual(result.index[1], 2.5)
+
+    def test_value_counts_dtypes(self):
+        result = algos.value_counts([1, 1.])
+        self.assertEqual(len(result), 1)
+
+        result = algos.value_counts([1, 1.], bins=1)
+        self.assertEqual(len(result), 1)
+
+        result = algos.value_counts(Series([1, 1., '1']))  # object
+        self.assertEqual(len(result), 2)
+
+        self.assertRaises(TypeError, lambda s: algos.value_counts(s, bins=1), ['1', 1])
+
 
 def test_quantile():
     s = Series(np.random.randn(100))
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -7,7 +7,6 @@
 
 import numpy as np
 
-from pandas.core.api import value_counts
 from pandas.core.categorical import Categorical
 from pandas.core.index import Index, Int64Index, MultiIndex
 from pandas.core.frame import DataFrame
@@ -89,18 +88,6 @@ def test_comparisons(self):
         expected = np.repeat(False, len(self.factor))
         self.assert_(np.array_equal(result, expected))
 
-    def test_value_counts(self):
-        from pandas.tools.tile import cut
-
-        arr = np.random.randn(4)
-        factor = cut(arr, 4)
-
-        tm.assert_isinstance(factor, Categorical)
-
-        result = value_counts(factor)
-        expected = value_counts(np.asarray(factor))
-        tm.assert_series_equal(result, expected)
-
     def test_na_flags_int_levels(self):
         # #1457
 
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -371,6 +371,16 @@ def test_constructor_generator(self):
         exp.index = lrange(10, 20)
         assert_series_equal(result, exp)
 
+    def test_constructor_categorical(self):
+        cat = pd.Categorical([0, 1, 2, 0, 1, 2], ['a', 'b', 'c'])
+        res = Series(cat)
+        exp = Series({0: 'a', 1: 'b', 2: 'c', 3: 'a', 4: 'b', 5: 'c'})
+        assert_series_equal(res, exp)
+
+        cat.name = 'foo'
+        res = Series(cat)
+        self.assertEqual(res.name, cat.name)
+
     def test_constructor_maskedarray(self):
         data = ma.masked_all((3,), dtype=float)
         result = Series(data)
@@ -2979,13 +2989,43 @@ def test_value_counts_nunique(self):
         expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
         assert_series_equal(hist, expected)
 
+        # don't sort, have to sort after the fact as not sorting is platform-dep
+        hist = s.value_counts(sort=False)
+        hist.sort()
+        expected = Series([3, 1, 4, 2], index=list('acbd'))
+        expected.sort()
+        assert_series_equal(hist, expected)
+
+        # sort ascending
+        hist = s.value_counts(ascending=True)
+        expected = Series([1, 2, 3, 4], index=list('cdab'))
+        assert_series_equal(hist, expected)
+
         # relative histogram.
         hist = s.value_counts(normalize=True)
         expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
         assert_series_equal(hist, expected)
 
         self.assertEquals(s.nunique(), 4)
 
+        # bins
+        self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)
+
+        s1 = Series([1, 1, 2, 3])
+        res1 = s1.value_counts(bins=1)
+        exp1 = Series({0.998: 4})
+        assert_series_equal(res1, exp1)
+        res1n = s1.value_counts(bins=1, normalize=True)
+        exp1n = Series({0.998: 1.0})
+        assert_series_equal(res1n, exp1n)
+
+        res4 = s1.value_counts(bins=4)
+        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
+        assert_series_equal(res4, exp4)
+        res4n = s1.value_counts(bins=4, normalize=True)
+        exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
+        assert_series_equal(res4n, exp4n)
+
         # handle NA's properly
         s[5:7] = np.nan
         hist = s.value_counts()