
Commit cf6cbb2

ENH: use size instead of cythonized count for fallback cases
1 parent 668f0f7 commit cf6cbb2

5 files changed: +52 -229 lines


pandas/core/groupby.py

+1 -2
@@ -722,8 +722,7 @@ def size(self):
     last = _groupby_function('last', 'last', _last_compat, numeric_only=False,
                              _convert=True)
 
-    _count = _groupby_function('_count', 'count',
-                               lambda x, axis=0: notnull(x).sum(axis=axis),
+    _count = _groupby_function('_count', 'count', lambda x, axis=0: x.size(),
                                numeric_only=False)
 
     def count(self, axis=0):
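To illustrate the intent of this change (outside the commit): for dtypes that end up on the non-cython fallback path, such as the low-precision integers exercised by the new test further down, values cannot hold NaN, so a group's size equals its non-null count. A minimal sketch with invented data, using the Series.size attribute rather than the x.size() call shown in the diff:

import numpy as np
import pandas as pd

# Loose sketch of the fallback idea, not the actual _groupby_function wiring.
# Integer dtypes cannot hold NaN, so each group's size is its non-null count.
s = pd.Series(np.array([10, 20, 30, 40], dtype=np.int8))
key = ['a', 'b', 'a', 'b']

fallback = lambda x, axis=0: x.size   # mirrors the intent of the new lambda
print(s.groupby(key).agg(fallback))
# a    2
# b    2
# dtype: int64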

pandas/src/generate_code.py

+17 -10
@@ -2219,18 +2219,21 @@ def put2d_%(name)s_%(dest_type)s(ndarray[%(c_type)s, ndim=2, cast=True] values,
 #-------------------------------------------------------------------------
 # Generators
 
-def generate_put_template(template, use_ints = True, use_floats = True,
-                          use_objects=False):
+def generate_put_template(template, use_ints=True, use_floats=True,
+                          use_objects=False, use_datelikes=False):
     floats_list = [
         ('float64', 'float64_t', 'float64_t', 'np.float64'),
         ('float32', 'float32_t', 'float32_t', 'np.float32'),
-        ]
+    ]
     ints_list = [
         ('int8', 'int8_t', 'float32_t', 'np.float32'),
         ('int16', 'int16_t', 'float32_t', 'np.float32'),
         ('int32', 'int32_t', 'float64_t', 'np.float64'),
         ('int64', 'int64_t', 'float64_t', 'np.float64'),
-        ]
+    ]
+    date_like_list = [
+        ('int64', 'int64_t', 'float64_t', 'np.float64'),
+    ]
     object_list = [('object', 'object', 'float64_t', 'np.float64')]
     function_list = []
     if use_floats:
@@ -2239,14 +2242,16 @@ def generate_put_template(template, use_ints = True, use_floats = True,
         function_list.extend(ints_list)
     if use_objects:
         function_list.extend(object_list)
+    if use_datelikes:
+        function_list.extend(date_like_list)
 
     output = StringIO()
     for name, c_type, dest_type, dest_dtype in function_list:
-        func = template % {'name' : name,
-                           'c_type' : c_type,
-                           'dest_type' : dest_type.replace('_t', ''),
-                           'dest_type2' : dest_type,
-                           'dest_dtype' : dest_dtype}
+        func = template % {'name': name,
+                           'c_type': c_type,
+                           'dest_type': dest_type.replace('_t', ''),
+                           'dest_type2': dest_type,
+                           'dest_dtype': dest_dtype}
         output.write(func)
     return output.getvalue()
 
@@ -2372,7 +2377,9 @@ def generate_take_cython_file(path='generated.pyx'):
         print(generate_put_template(template, use_ints=False), file=f)
 
         for template in groupby_count:
-            print(generate_put_template(template, use_objects=True), file=f)
+            print(generate_put_template(template, use_ints=False,
+                                        use_datelikes=True, use_objects=True),
+                  file=f)
 
         # for template in templates_1d_datetime:
         #     print >> f, generate_from_template_datetime(template)
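For context, generate_put_template emits one Cython function per (name, c_type, dest_type, dest_dtype) tuple by %-substituting into a template string. A toy, self-contained illustration of that pattern; the template string below is invented for the example and is not the real groupby_count template:

from io import StringIO

# Invented stand-in template; the real groupby_count templates live in
# pandas/src/generate_code.py.
template = '''
def group_count_%(name)s(out, counts, values, labels):
    # values: ndarray[%(c_type)s, ndim=2]; out: ndarray[%(dest_type2)s, ndim=2]
    pass
'''

# One tuple per generated function, mirroring the lists in the diff above.
function_list = [
    ('int64', 'int64_t', 'float64_t', 'np.float64'),   # the new "date-like" entry
    ('object', 'object', 'float64_t', 'np.float64'),   # the object entry
]

output = StringIO()
for name, c_type, dest_type, dest_dtype in function_list:
    output.write(template % {'name': name,
                             'c_type': c_type,
                             'dest_type': dest_type.replace('_t', ''),
                             'dest_type2': dest_type,
                             'dest_dtype': dest_dtype})
print(output.getvalue())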

pandas/src/generated.pyx

+6 -216
@@ -6697,89 +6697,17 @@ def group_count_float32(ndarray[float32_t, ndim=2] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_count_int8(ndarray[float32_t, ndim=2] out,
-                     ndarray[int64_t] counts,
-                     ndarray[int8_t, ndim=2] values,
-                     ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        int8_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[lab, j] += val == val and val != iNaT
-
-    for i in range(len(counts)):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_int16(ndarray[float32_t, ndim=2] out,
-                      ndarray[int64_t] counts,
-                      ndarray[int16_t, ndim=2] values,
-                      ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        int16_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[lab, j] += val == val and val != iNaT
-
-    for i in range(len(counts)):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_int32(ndarray[float64_t, ndim=2] out,
+def group_count_object(ndarray[float64_t, ndim=2] out,
                       ndarray[int64_t] counts,
-                      ndarray[int32_t, ndim=2] values,
+                      ndarray[object, ndim=2] values,
                       ndarray[int64_t] labels):
     '''
     Only aggregates on axis=0
     '''
     cdef:
         Py_ssize_t i, j, lab
         Py_ssize_t N = values.shape[0], K = values.shape[1]
-        int32_t val
+        object val
         ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
                                                  dtype=np.int64)
@@ -6839,42 +6767,6 @@ def group_count_int64(ndarray[float64_t, ndim=2] out,
             out[i, j] = nobs[i, j]
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_object(ndarray[float64_t, ndim=2] out,
-                       ndarray[int64_t] counts,
-                       ndarray[object, ndim=2] values,
-                       ndarray[int64_t] labels):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, lab
-        Py_ssize_t N = values.shape[0], K = values.shape[1]
-        object val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    if len(values) != len(labels):
-        raise AssertionError("len(index) != len(labels)")
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[lab, j] += val == val and val != iNaT
-
-    for i in range(len(counts)):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
@@ -6946,85 +6838,17 @@ def group_count_bin_float32(ndarray[float32_t, ndim=2] out,
 
 @cython.boundscheck(False)
 @cython.wraparound(False)
-def group_count_bin_int8(ndarray[float32_t, ndim=2] out,
-                         ndarray[int64_t] counts,
-                         ndarray[int8_t, ndim=2] values,
-                         ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, ngroups
-        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
-        int8_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    ngroups = len(bins) + (bins[len(bins) - 1] != N)
-
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[b, j] += val == val and val != iNaT
-
-    for i in range(ngroups):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_bin_int16(ndarray[float32_t, ndim=2] out,
-                          ndarray[int64_t] counts,
-                          ndarray[int16_t, ndim=2] values,
-                          ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, ngroups
-        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
-        int16_t val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    ngroups = len(bins) + (bins[len(bins) - 1] != N)
-
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[b, j] += val == val and val != iNaT
-
-    for i in range(ngroups):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_bin_int32(ndarray[float64_t, ndim=2] out,
+def group_count_bin_object(ndarray[float64_t, ndim=2] out,
                           ndarray[int64_t] counts,
-                          ndarray[int32_t, ndim=2] values,
+                          ndarray[object, ndim=2] values,
                           ndarray[int64_t] bins):
     '''
     Only aggregates on axis=0
    '''
     cdef:
         Py_ssize_t i, j, ngroups
         Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
-        int32_t val
+        object val
         ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
                                                  dtype=np.int64)
@@ -7080,40 +6904,6 @@ def group_count_bin_int64(ndarray[float64_t, ndim=2] out,
             out[i, j] = nobs[i, j]
 
 
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_count_bin_object(ndarray[float64_t, ndim=2] out,
-                           ndarray[int64_t] counts,
-                           ndarray[object, ndim=2] values,
-                           ndarray[int64_t] bins):
-    '''
-    Only aggregates on axis=0
-    '''
-    cdef:
-        Py_ssize_t i, j, ngroups
-        Py_ssize_t N = values.shape[0], K = values.shape[1], b = 0
-        object val
-        ndarray[int64_t, ndim=2] nobs = np.zeros((out.shape[0], out.shape[1]),
-                                                 dtype=np.int64)
-
-    ngroups = len(bins) + (bins[len(bins) - 1] != N)
-
-    for i in range(N):
-        while b < ngroups - 1 and i >= bins[b]:
-            b += 1
-
-        counts[b] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            nobs[b, j] += val == val and val != iNaT
-
-    for i in range(ngroups):
-        for j in range(K):
-            out[i, j] = nobs[i, j]
-
-
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
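For readers who prefer not to trace the Cython, the kernels that remain (float32/float64, int64, object, plus the newly generated date-like variant) all share the loop pattern visible in the diff above. Below is a plain NumPy/Python rendering of that per-group non-null count; the function name and the small check at the end are invented for illustration:

import numpy as np

iNaT = np.iinfo(np.int64).min   # pandas' sentinel for missing datetimes (NaT)

def group_count_py(values, labels, ngroups):
    """Python rendering of the loop in group_count_object above:
    per (group, column) count of entries that are neither NaN nor iNaT."""
    N, K = values.shape
    counts = np.zeros(ngroups, dtype=np.int64)
    nobs = np.zeros((ngroups, K), dtype=np.int64)
    if len(values) != len(labels):
        raise AssertionError("len(index) != len(labels)")
    for i in range(N):
        lab = labels[i]
        if lab < 0:              # label -1 means "not in any group"
            continue
        counts[lab] += 1
        for j in range(K):
            val = values[i, j]
            # val == val is False only for NaN; iNaT marks missing datetimes
            nobs[lab, j] += (val == val) and (val != iNaT)
    return nobs, counts

# Tiny check with invented data: one NaN in column 1 of group 0.
vals = np.array([[1.0, np.nan], [2.0, 3.0], [4.0, 5.0]], dtype=object)
labels = np.array([0, 0, 1], dtype=np.int64)
print(group_count_py(vals, labels, ngroups=2))
# nobs -> [[2, 1], [1, 1]], counts -> [2, 1]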

pandas/tests/test_groupby.py

+13
@@ -4202,6 +4202,19 @@ def test_datetime_count(self):
                           name='dates')
         tm.assert_series_equal(result, expected)
 
+    def test_lower_int_prec_count(self):
+        df = DataFrame({'a': np.array([0, 1, 2, 100], np.int8),
+                        'b': np.array([1, 2, 3, 6], np.uint32),
+                        'c': np.array([4, 5, 6, 8], np.int16),
+                        'grp': list('ab' * 2)})
+        result = df.groupby('grp').count()
+        expected = DataFrame({'a': [2, 2],
+                              'b': [2, 2],
+                              'c': [2, 2]}, index=pd.Index(list('ab'),
+                                                           name='grp'))
+        tm.assert_frame_equal(result, expected)
+
+
 def assert_fp_equal(a, b):
     assert (np.abs(a - b) < 1e-12).all()
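Run standalone, the new test's scenario looks like this (same data as in the test; these low-precision integer columns no longer have dedicated group_count kernels after this commit):

import numpy as np
import pandas as pd

# Standalone version of test_lower_int_prec_count above.
df = pd.DataFrame({'a': np.array([0, 1, 2, 100], np.int8),
                   'b': np.array([1, 2, 3, 6], np.uint32),
                   'c': np.array([4, 5, 6, 8], np.int16),
                   'grp': list('ab' * 2)})
result = df.groupby('grp').count()
print(result)   # each of the two groups has 2 non-null values per column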

vb_suite/groupby.py

+15 -1
@@ -133,21 +133,35 @@ def f():
 value2 = np.random.randn(n)
 value2[np.random.rand(n) > 0.5] = np.nan
 
-obj = pd.util.testing.choice(['a', 'b'], size=n).astype(object)
+obj = tm.choice(list('ab'), size=n).astype(object)
 obj[np.random.randn(n) > 0.5] = np.nan
 
 df = DataFrame({'key1': np.random.randint(0, 500, size=n),
                 'key2': np.random.randint(0, 100, size=n),
                 'dates': dates,
                 'value2' : value2,
                 'value3' : np.random.randn(n),
+                'ints': np.random.randint(0, 1000, size=n),
                 'obj': obj,
                 'offsets': offsets})
 """
 
 groupby_multi_count = Benchmark("df.groupby(['key1', 'key2']).count()",
                                 setup, name='groupby_multi_count',
                                 start_date=datetime(2014, 5, 5))
+
+setup = common_setup + """
+n = 10000
+
+df = DataFrame({'key1': randint(0, 500, size=n),
+                'key2': randint(0, 100, size=n),
+                'ints': randint(0, 1000, size=n),
+                'ints2': randint(0, 1000, size=n)})
+"""
+
+groupby_int_count = Benchmark("df.groupby(['key1', 'key2']).count()",
+                              setup, name='groupby_int_count',
+                              start_date=datetime(2014, 5, 6))
 #----------------------------------------------------------------------
 # Series.value_counts
 