PERF: improves performance in SeriesGroupBy.count, pandas-dev#10946

behzadnouri · Nick Eubank · commit 4832e34d1966 · 2015-09-29T12:05:46.000-07:00
BUG: closes bug in Series.count when index has nulls
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -913,6 +913,8 @@ Bug Fixes
 - Bug in ``BinGrouper.group_info`` where returned values are not compatible with base class (:issue:`10914`)
 - Bug in clearing the cache on ``DataFrame.pop`` and a subsequent inplace op (:issue:`10912`)
 - Bug in indexing with a mixed-integer ``Index`` causing an ``ImportError`` (:issue:`10610`)
+- Bug in ``Series.count`` with ``level`` specified when the index level has nulls (:issue:`10946`)
+
 - Bug causing ``DataFrame.where`` to not respect the ``axis`` parameter when the frame has a symmetric shape. (:issue:`9736`)
 
 - Bug in ``Table.select_column`` where name is not preserved (:issue:`10392`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -2685,6 +2685,15 @@ def value_counts(self, normalize=False, sort=True, ascending=False,
 
         return Series(out, index=mi)
 
+    def count(self):
+        """Count non-null values in each group, returned as an int64 Series."""
+        ids, _, ngroups = self.grouper.group_info
+        # skip rows with id -1 (presumably ungrouped) and rows holding nulls
+        mask = (ids != -1) & ~isnull(self.obj.get_values())
+        out = np.bincount(ids[mask], minlength=ngroups) if ngroups != 0 else []
+        return Series(out, index=self.grouper.result_index, name=self.name,
+                      dtype='int64')  # keeps the empty ``[]`` case integer
+
     def _apply_to_column_groupbys(self, func):
         """ return a pass thru """
         return func(self)
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1159,27 +1159,24 @@ def count(self, level=None):
         -------
         nobs : int or Series (if level specified)
         """
-        if level is not None:
-            mask = notnull(self._values)
+        from pandas.core.index import _get_na_value
 
-            if isinstance(level, compat.string_types):
-                level = self.index._get_level_number(level)
+        if level is None:
+            return notnull(_values_from_object(self)).sum()
 
-            level_index = self.index.levels[level]
+        if isinstance(level, compat.string_types):
+            level = self.index._get_level_number(level)
 
-            if len(self) == 0:
-                return self._constructor(0, index=level_index)\
-                           .__finalize__(self)
+        lev = self.index.levels[level]
+        lab = np.array(self.index.labels[level], subok=False, copy=True)
 
-            # call cython function
-            max_bin = len(level_index)
-            labels = com._ensure_int64(self.index.labels[level])
-            counts = lib.count_level_1d(mask.view(np.uint8),
-                                        labels, max_bin)
-            return self._constructor(counts,
-                                     index=level_index).__finalize__(self)
+        mask = lab == -1
+        if mask.any():
+            lab[mask] = cnt = len(lev)
+            lev = lev.insert(cnt, _get_na_value(lev.dtype.type))
 
-        return notnull(_values_from_object(self)).sum()
+        out = np.bincount(lab[notnull(self.values)], minlength=len(lev))
+        return self._constructor(out, index=lev).__finalize__(self)
 
     def mode(self):
         """Returns the mode(s) of the dataset.
@@ -2121,7 +2118,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
 
         >>> import pandas as pd
         >>> import numpy as np
-        >>> series = pd.Series([20, 21, 12], index=['London', 
+        >>> series = pd.Series([20, 21, 12], index=['London',
         ... 'New York','Helsinki'])
         London      20
         New York    21
@@ -2149,7 +2146,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
         dtype: int64
 
         Define a custom function that needs additional positional
-        arguments and pass these additional arguments using the 
+        arguments and pass these additional arguments using the
         ``args`` keyword.
 
         >>> def subtract_custom_value(x, custom_value):
@@ -2175,7 +2172,7 @@ def apply(self, func, convert_dtype=True, args=(), **kwds):
         Helsinki    87
         dtype: int64
 
-        Use a function from the Numpy library. 
+        Use a function from the Numpy library.
 
         >>> series.apply(np.log)
         London      2.995732
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -1253,23 +1253,6 @@ def lookup_values(ndarray[object] values, dict mapping):
     return maybe_convert_objects(result)
 
 
-def count_level_1d(ndarray[uint8_t, cast=True] mask,
-                   ndarray[int64_t] labels, Py_ssize_t max_bin):
-    cdef:
-        Py_ssize_t i, n
-        ndarray[int64_t] counts
-
-    counts = np.zeros(max_bin, dtype='i8')
-
-    n = len(mask)
-
-    for i from 0 <= i < n:
-        if mask[i]:
-            counts[labels[i]] += 1
-
-    return counts
-
-
 def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
                    ndarray[int64_t] labels, Py_ssize_t max_bin):
     cdef:
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
@@ -4904,6 +4904,16 @@ def test_count(self):
 
         self.assertEqual(self.ts.count(), np.isfinite(self.ts).sum())
 
+        mi = MultiIndex.from_arrays([list('aabbcc'), [1, 2, 2, nan, 1, 2]])
+        ts = Series(np.arange(len(mi)), index=mi)
+
+        left = ts.count(level=1)
+        right = Series([2, 3, 1], index=[1, 2, nan])
+        assert_series_equal(left, right)
+
+        ts.iloc[[0, 3, 5]] = nan
+        assert_series_equal(ts.count(level=1), right - 1)
+
     def test_dtype(self):
 
         self.assertEqual(self.ts.dtype, np.dtype('float64'))