Consolidate nth / last object Groupby Implementations (#19610)

WillAyd · jreback · commit d4730e65fd2d · 2018-02-10T11:08:58.000-05:00
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -26,105 +26,6 @@ cdef double NaN = <double> np.NaN
 cdef double nan = NaN
 
 
-# TODO: aggregate multiple columns in single pass
-# ----------------------------------------------------------------------
-# first, nth, last
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_nth_object(ndarray[object, ndim=2] out,
-                     ndarray[int64_t] counts,
-                     ndarray[object, ndim=2] values,
-                     ndarray[int64_t] labels,
-                     int64_t rank,
-                     Py_ssize_t min_count=-1):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        object val
-        float64_t count
-        ndarray[int64_t, ndim=2] nobs
-        ndarray[object, ndim=2] resx
-
-    assert min_count == -1, "'min_count' only used in add and prod"
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                if nobs[lab, j] == rank:
-                    resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = <object> nan
-            else:
-                out[i, j] = resx[i, j]
-
-
-@cython.boundscheck(False)
-@cython.wraparound(False)
-def group_last_object(ndarray[object, ndim=2] out,
-                      ndarray[int64_t] counts,
-                      ndarray[object, ndim=2] values,
-                      ndarray[int64_t] labels,
-                      Py_ssize_t min_count=-1):
-    """
-    Only aggregates on axis=0
-    """
-    cdef:
-        Py_ssize_t i, j, N, K, lab
-        object val
-        float64_t count
-        ndarray[object, ndim=2] resx
-        ndarray[int64_t, ndim=2] nobs
-
-    assert min_count == -1, "'min_count' only used in add and prod"
-
-    nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    resx = np.empty((<object> out).shape, dtype=object)
-
-    N, K = (<object> values).shape
-
-    for i in range(N):
-        lab = labels[i]
-        if lab < 0:
-            continue
-
-        counts[lab] += 1
-        for j in range(K):
-            val = values[i, j]
-
-            # not nan
-            if val == val:
-                nobs[lab, j] += 1
-                resx[lab, j] = val
-
-    for i in range(len(counts)):
-        for j in range(K):
-            if nobs[i, j] == 0:
-                out[i, j] = nan
-            else:
-                out[i, j] = resx[i, j]
-
-
 cdef inline float64_t median_linear(float64_t* a, int n) nogil:
     cdef int i, j, na_count = 0
     cdef float64_t result
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
@@ -325,7 +325,8 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 # name, c_type, dest_type2, nan_val
 dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'),
           ('float32', 'float32_t', 'float32_t', 'NAN'),
-          ('int64', 'int64_t', 'int64_t', 'iNaT')]
+          ('int64', 'int64_t', 'int64_t', 'iNaT'),
+          ('object', 'object', 'object', 'NAN')]
 
 def get_dispatch(dtypes):
 
@@ -350,7 +351,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
+        {{dest_type2}} val
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -360,11 +361,19 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         raise AssertionError("len(index) != len(labels)")
 
     nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    {{if name=='object'}}
+    resx = np.empty((<object> out).shape, dtype=object)
+    {{else}}
     resx = np.empty_like(out)
+    {{endif}}
 
     N, K = (<object> values).shape
 
+    {{if name == "object"}}
+    if True:  # no nogil for object dtype; keeps the block indented
+    {{else}}
     with nogil:
+    {{endif}}
         for i in range(N):
             lab = labels[i]
             if lab < 0:
@@ -375,11 +384,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
-                {{if name == 'int64'}}
-                if val != {{nan_val}}:
-                {{else}}
                 if val == val and val != {{nan_val}}:
-                {{endif}}
                     nobs[lab, j] += 1
                     resx[lab, j] = val
 
@@ -390,7 +395,6 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 else:
                     out[i, j] = resx[i, j]
 
-
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
@@ -403,7 +407,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val, count
+        {{dest_type2}} val
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -413,11 +417,19 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         raise AssertionError("len(index) != len(labels)")
 
     nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    {{if name=='object'}}
+    resx = np.empty((<object> out).shape, dtype=object)
+    {{else}}
     resx = np.empty_like(out)
+    {{endif}}
 
     N, K = (<object> values).shape
 
+    {{if name == "object"}}
+    if True:  # no nogil for object dtype; keeps the block indented
+    {{else}}
     with nogil:
+    {{endif}}
         for i in range(N):
             lab = labels[i]
             if lab < 0:
@@ -428,11 +440,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
-                {{if name == 'int64'}}
-                if val != {{nan_val}}:
-                {{else}}
                 if val == val and val != {{nan_val}}:
-                {{endif}}
                     nobs[lab, j] += 1
                     if nobs[lab, j] == rank:
                         resx[lab, j] = val
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2252,47 +2252,45 @@ def test_median_empty_bins(self):
         expected = df.groupby(bins).agg(lambda x: x.median())
         assert_frame_equal(result, expected)
 
-    def test_groupby_non_arithmetic_agg_types(self):
+    @pytest.mark.parametrize("dtype", [
+        'int8', 'int16', 'int32', 'int64', 'float32', 'float64'])
+    @pytest.mark.parametrize("method,data", [
+        ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+        ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+        ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
+        ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
+        ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
+                 'args': [1]}),
+        ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
+                   'out_type': 'int64'})
+    ])
+    def test_groupby_non_arithmetic_agg_types(self, dtype, method, data):
         # GH9311, GH6620
         df = pd.DataFrame(
             [{'a': 1, 'b': 1},
              {'a': 1, 'b': 2},
              {'a': 2, 'b': 3},
              {'a': 2, 'b': 4}])
 
-        dtypes = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']
-
-        grp_exp = {'first': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]},
-                   'last': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]},
-                   'min': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]},
-                   'max': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]},
-                   'nth': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
-                           'args': [1]},
-                   'count': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
-                             'out_type': 'int64'}}
+        df['b'] = df.b.astype(dtype)
 
-        for dtype in dtypes:
-            df_in = df.copy()
-            df_in['b'] = df_in.b.astype(dtype)
+        if 'args' not in data:
+            data['args'] = []
 
-            for method, data in compat.iteritems(grp_exp):
-                if 'args' not in data:
-                    data['args'] = []
-
-                if 'out_type' in data:
-                    out_type = data['out_type']
-                else:
-                    out_type = dtype
+        if 'out_type' in data:
+            out_type = data['out_type']
+        else:
+            out_type = dtype
 
-                exp = data['df']
-                df_out = pd.DataFrame(exp)
+        exp = data['df']
+        df_out = pd.DataFrame(exp)
 
-                df_out['b'] = df_out.b.astype(out_type)
-                df_out.set_index('a', inplace=True)
+        df_out['b'] = df_out.b.astype(out_type)
+        df_out.set_index('a', inplace=True)
 
-                grpd = df_in.groupby('a')
-                t = getattr(grpd, method)(*data['args'])
-                assert_frame_equal(t, df_out)
+        grpd = df.groupby('a')
+        t = getattr(grpd, method)(*data['args'])
+        assert_frame_equal(t, df_out)
 
     def test_groupby_non_arithmetic_agg_intlike_precision(self):
         # GH9311, GH6620