CLN: remove the generated Cython group_count_* kernels and implement groupby count with lib.count_level_2d

behzadnouri · behzadnouri · commit d968aab3da13 · 2015-09-06T10:02:01.000-04:00
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -4562,7 +4562,7 @@ def _count_level(self, level, axis=0, numeric_only=False):
 
         level_index = count_axis.levels[level]
         labels = com._ensure_int64(count_axis.labels[level])
-        counts = lib.count_level_2d(mask, labels, len(level_index))
+        counts = lib.count_level_2d(mask, labels, len(level_index), axis=0)
 
         result = DataFrame(counts, index=level_index,
                            columns=agg_axis)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -69,7 +69,7 @@
     'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount',
     'resample',
     'describe',
-    'rank', 'quantile', 'count',
+    'rank', 'quantile',
     'fillna',
     'mad',
     'any', 'all',
@@ -149,9 +149,6 @@ def _last(x):
         return _last(x)
 
 
-def _count_compat(x, axis=0):
-    return x.count()  # .size != .count(); count excludes nan
-
 class Grouper(object):
     """
     A Grouper allows the user to specify a groupby instruction for a target object
@@ -801,11 +798,6 @@ def size(self):
                               numeric_only=False, _convert=True)
     last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                              _convert=True)
-    _count = _groupby_function('_count', 'count', _count_compat,
-                               numeric_only=False)
-
-    def count(self, axis=0):
-        return self._count().astype('int64')
 
     def ohlc(self):
         """
@@ -1463,7 +1455,6 @@ def get_group_levels(self):
             'f': lambda func, a, b, c, d: func(a, b, c, d, 1)
         },
         'last': 'group_last',
-        'count': 'group_count',
     }
 
     _cython_arity = {
@@ -3468,6 +3459,24 @@ def _apply_to_column_groupbys(self, func):
              in self._iterate_column_groupbys()),
             keys=self._selected_obj.columns, axis=1)
 
+    def count(self):
+        from functools import partial
+        from pandas.lib import count_level_2d
+        from pandas.core.common import _isnull_ndarraylike as isnull
+
+        data, _ = self._get_data_to_aggregate()
+        ids, _, ngroups = self.grouper.group_info
+        mask = ids != -1
+
+        val = ((mask & ~isnull(blk.get_values())) for blk in data.blocks)
+        loc = (blk.mgr_locs for blk in data.blocks)
+
+        counter = partial(count_level_2d, labels=ids, max_bin=ngroups, axis=1)
+        blk = map(make_block, map(counter, val), loc)
+
+        return self._wrap_agged_blocks(data.items, list(blk))
+
+
 from pandas.tools.plotting import boxplot_frame_groupby
 DataFrameGroupBy.boxplot = boxplot_frame_groupby
 
diff --git a/pandas/lib.pyx b/pandas/lib.pyx
@@ -1253,19 +1253,32 @@ def lookup_values(ndarray[object] values, dict mapping):
     return maybe_convert_objects(result)
 
 
+@cython.boundscheck(False)
+@cython.wraparound(False)
 def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
-                   ndarray[int64_t] labels, Py_ssize_t max_bin):
+                   ndarray[int64_t, ndim=1] labels,
+                   Py_ssize_t max_bin,
+                   int axis):
     cdef:
         Py_ssize_t i, j, k, n
         ndarray[int64_t, ndim=2] counts
 
+    assert(axis == 0 or axis == 1)
     n, k = (<object> mask).shape
-    counts = np.zeros((max_bin, k), dtype='i8')
 
-    for i from 0 <= i < n:
-        for j from 0 <= j < k:
-            if mask[i, j]:
-                counts[labels[i], j] += 1
+    if axis == 0:
+        counts = np.zeros((max_bin, k), dtype='i8')
+        with nogil:
+            for i from 0 <= i < n:
+                for j from 0 <= j < k:
+                    counts[labels[i], j] += mask[i, j]
+
+    else:  # axis == 1
+        counts = np.zeros((n, max_bin), dtype='i8')
+        with nogil:
+            for i from 0 <= i < n:
+                for j from 0 <= j < k:
+                    counts[i, labels[j]] += mask[i, j]
 
     return counts
 
diff --git a/pandas/src/generate_code.py b/pandas/src/generate_code.py
@@ -971,44 +971,6 @@ def group_var_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
 
 """
 
-group_count_template = """@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_%(name)s(ndarray[%(dest_type2)s, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[%(c_type)s, ndim=2] values,
-                         ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab, ncounts = len(counts)
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        %(c_type)s val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-
-    %(nogil)s
-    %(tab)sfor i in range(N):
-    %(tab)s    lab = labels[i]
-    %(tab)s    if lab < 0:
-    %(tab)s        continue
-
-    %(tab)s    counts[lab] += 1
-    %(tab)s    for j in range(K):
-    %(tab)s        val = values[i, j]
-
-    %(tab)s        # not nan
-    %(tab)s        nobs[lab, j] += val == val and val != iNaT
-
-    %(tab)sfor i in range(ncounts):
-    %(tab)s    for j in range(K):
-    %(tab)s        out[i, j] = nobs[i, j]
-"""
-
 # add passing bin edges, instead of labels
 
 
@@ -1995,8 +1957,6 @@ def generate_from_template(template, exclude=None):
 groupby_min_max = [group_min_template,
                    group_max_template]
 
-groupby_count = [group_count_template]
-
 templates_1d = [map_indices_template,
                 pad_template,
                 backfill_template,
@@ -2051,12 +2011,6 @@ def generate_take_cython_file():
             print(generate_put_min_max_template(template, use_ints=True),
                   file=f)
 
-        for template in groupby_count:
-            print(generate_put_selection_template(template, use_ints=True,
-                                                  use_datelikes=True,
-                                                  use_objects=True),
-                                                  file=f)
-
         for template in nobool_1d_templates:
             print(generate_from_template(template, exclude=['bool']), file=f)
 
diff --git a/pandas/src/generated.pyx b/pandas/src/generated.pyx
@@ -7930,192 +7930,6 @@ def group_max_int64(ndarray[int64_t, ndim=2] out,
                     out[i, j] = maxx[i, j]
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_float64(ndarray[float64_t, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[float64_t, ndim=2] values,
-                         ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab, ncounts = len(counts)
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        float64_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                nobs[lab, j] += val == val and val != iNaT
-
-        for i in range(ncounts):
-            for j in range(K):
-                out[i, j] = nobs[i, j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_float32(ndarray[float32_t, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[float32_t, ndim=2] values,
-                         ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab, ncounts = len(counts)
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        float32_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                nobs[lab, j] += val == val and val != iNaT
-
-        for i in range(ncounts):
-            for j in range(K):
-                out[i, j] = nobs[i, j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_int64(ndarray[int64_t, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[int64_t, ndim=2] values,
-                         ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab, ncounts = len(counts)
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        int64_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                nobs[lab, j] += val == val and val != iNaT
-
-        for i in range(ncounts):
-            for j in range(K):
-                out[i, j] = nobs[i, j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_object(ndarray[object, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[object, ndim=2] values,
-                         ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab, ncounts = len(counts)
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        object val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-
-    
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[lab, j] += val == val and val != iNaT
-
-    for i in range(ncounts):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_int64(ndarray[int64_t, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[int64_t, ndim=2] values,
-                         ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab, ncounts = len(counts)
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        int64_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-
-    with nogil:
-        for i in range(N):
-            lab = labels[i]
-            if lab < 0:
-                continue
-
-            counts[lab] += 1
-            for j in range(K):
-                val = values[i, j]
-
-                # not nan
-                nobs[lab, j] += val == val and val != iNaT
-
-        for i in range(ncounts):
-            for j in range(K):
-                out[i, j] = nobs[i, j]
-
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def left_join_indexer_unique_float64(ndarray[float64_t] left,
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py