Revert "Consolidate nth / last object Groupby Implementations (#19610)"

jreback · jreback · commit 5c76f33a106d · 2018-02-10T11:09:58.000-05:00
This reverts commit d4730e6.
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
@@ -26,6 +26,105 @@ cdef double NaN = <double> np.NaN
 cdef double nan = NaN
 
 
+# TODO: aggregate multiple columns in single pass
+# ----------------------------------------------------------------------
+# first, nth, last
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_nth_object(ndarray[object, ndim=2] out,
+                     ndarray[int64_t] counts,
+                     ndarray[object, ndim=2] values,
+                     ndarray[int64_t] labels,
+                     int64_t rank,
+                     Py_ssize_t min_count=-1):
+    """
+    Only aggregates on axis=0; fills out with each group's rank-th kept value
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        object val
+        float64_t count  # never referenced in this function
+        ndarray[int64_t, ndim=2] nobs  # kept-value tally per (group, col)
+        ndarray[object, ndim=2] resx  # selected value per (group, col)
+
+    assert min_count == -1, "'min_count' only used in add and prod"
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:  # rows with a negative label are skipped entirely
+            continue
+
+        counts[lab] += 1  # tallies every labelled row, NaN or not
+        for j in range(K):
+            val = values[i, j]
+
+            # keep only values equal to themselves (NaN fails this check)
+            if val == val:
+                nobs[lab, j] += 1
+                if nobs[lab, j] == rank:  # the rank-th kept value is stored
+                    resx[lab, j] = val
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:  # nothing kept for this cell -> NaN
+                out[i, j] = <object> nan
+            else:  # NOTE(review): resx is unassigned when 0 < nobs < rank
+                out[i, j] = resx[i, j]
+
+
+@cython.boundscheck(False)
+@cython.wraparound(False)
+def group_last_object(ndarray[object, ndim=2] out,
+                      ndarray[int64_t] counts,
+                      ndarray[object, ndim=2] values,
+                      ndarray[int64_t] labels,
+                      Py_ssize_t min_count=-1):
+    """
+    Only aggregates on axis=0; fills out with each group's final kept value
+    """
+    cdef:
+        Py_ssize_t i, j, N, K, lab
+        object val
+        float64_t count  # never referenced in this function
+        ndarray[object, ndim=2] resx  # latest kept value per (group, col)
+        ndarray[int64_t, ndim=2] nobs  # kept-value tally per (group, col)
+
+    assert min_count == -1, "'min_count' only used in add and prod"
+
+    nobs = np.zeros((<object> out).shape, dtype=np.int64)
+    resx = np.empty((<object> out).shape, dtype=object)
+
+    N, K = (<object> values).shape
+
+    for i in range(N):
+        lab = labels[i]
+        if lab < 0:  # rows with a negative label are skipped entirely
+            continue
+
+        counts[lab] += 1  # tallies every labelled row, NaN or not
+        for j in range(K):
+            val = values[i, j]
+
+            # keep only values equal to themselves (NaN fails this check)
+            if val == val:
+                nobs[lab, j] += 1
+                resx[lab, j] = val  # overwritten each time; last one stays
+
+    for i in range(len(counts)):
+        for j in range(K):
+            if nobs[i, j] == 0:  # nothing kept for this cell -> NaN
+                out[i, j] = nan
+            else:
+                out[i, j] = resx[i, j]
+
+
 cdef inline float64_t median_linear(float64_t* a, int n) nogil:
     cdef int i, j, na_count = 0
     cdef float64_t result
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
@@ -325,8 +325,7 @@ def group_ohlc_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 # name, c_type, dest_type2, nan_val
 dtypes = [('float64', 'float64_t', 'float64_t', 'NAN'),
           ('float32', 'float32_t', 'float32_t', 'NAN'),
-          ('int64', 'int64_t', 'int64_t', 'iNaT'),
-          ('object', 'object', 'object', 'NAN')]
+          ('int64', 'int64_t', 'int64_t', 'iNaT')]
 
 def get_dispatch(dtypes):
 
@@ -351,7 +350,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val
+        {{dest_type2}} val, count
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -361,19 +360,11 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         raise AssertionError("len(index) != len(labels)")
 
     nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    {{if name=='object'}}
-    resx = np.empty((<object> out).shape, dtype=object)
-    {{else}}
     resx = np.empty_like(out)
-    {{endif}}
 
     N, K = (<object> values).shape
 
-    {{if name == "object"}}
-    if True:  # make templating happy
-    {{else}}
     with nogil:
-    {{endif}}
         for i in range(N):
             lab = labels[i]
             if lab < 0:
@@ -384,7 +375,11 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
+                {{if name == 'int64'}}
+                if val != {{nan_val}}:
+                {{else}}
                 if val == val and val != {{nan_val}}:
+                {{endif}}
                     nobs[lab, j] += 1
                     resx[lab, j] = val
 
@@ -395,6 +390,7 @@ def group_last_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 else:
                     out[i, j] = resx[i, j]
 
+
 @cython.wraparound(False)
 @cython.boundscheck(False)
 def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
@@ -407,7 +403,7 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
     """
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
-        {{dest_type2}} val
+        {{dest_type2}} val, count
         ndarray[{{dest_type2}}, ndim=2] resx
         ndarray[int64_t, ndim=2] nobs
 
@@ -417,19 +413,11 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
         raise AssertionError("len(index) != len(labels)")
 
     nobs = np.zeros((<object> out).shape, dtype=np.int64)
-    {{if name=='object'}}
-    resx = np.empty((<object> out).shape, dtype=object)
-    {{else}}
     resx = np.empty_like(out)
-    {{endif}}
 
     N, K = (<object> values).shape
 
-    {{if name == "object"}}
-    if True:  # make templating happy
-    {{else}}
     with nogil:
-    {{endif}}
         for i in range(N):
             lab = labels[i]
             if lab < 0:
@@ -440,7 +428,11 @@ def group_nth_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                 val = values[i, j]
 
                 # not nan
+                {{if name == 'int64'}}
+                if val != {{nan_val}}:
+                {{else}}
                 if val == val and val != {{nan_val}}:
+                {{endif}}
                     nobs[lab, j] += 1
                     if nobs[lab, j] == rank:
                         resx[lab, j] = val
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
@@ -2252,45 +2252,47 @@ def test_median_empty_bins(self):
         expected = df.groupby(bins).agg(lambda x: x.median())
         assert_frame_equal(result, expected)
 
-    @pytest.mark.parametrize("dtype", [
-        'int8', 'int16', 'int32', 'int64', 'float32', 'float64'])
-    @pytest.mark.parametrize("method,data", [
-        ('first', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
-        ('last', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
-        ('min', {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]}),
-        ('max', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]}),
-        ('nth', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
-                 'args': [1]}),
-        ('count', {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
-                   'out_type': 'int64'})
-    ])
-    def test_groupby_non_arithmetic_agg_types(self, dtype, method, data):
+    def test_groupby_non_arithmetic_agg_types(self):
         # GH9311, GH6620
         df = pd.DataFrame(
             [{'a': 1, 'b': 1},
              {'a': 1, 'b': 2},
              {'a': 2, 'b': 3},
              {'a': 2, 'b': 4}])
 
-        df['b'] = df.b.astype(dtype)
+        dtypes = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']
 
-        if 'args' not in data:
-            data['args'] = []
+        grp_exp = {'first': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]},
+                   'last': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]},
+                   'min': {'df': [{'a': 1, 'b': 1}, {'a': 2, 'b': 3}]},
+                   'max': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}]},
+                   'nth': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 4}],
+                           'args': [1]},
+                   'count': {'df': [{'a': 1, 'b': 2}, {'a': 2, 'b': 2}],
+                             'out_type': 'int64'}}
 
-        if 'out_type' in data:
-            out_type = data['out_type']
-        else:
-            out_type = dtype
+        for dtype in dtypes:  # every method below is checked per dtype
+            df_in = df.copy()  # cast a copy so df itself is not modified
+            df_in['b'] = df_in.b.astype(dtype)
+
+            for method, data in compat.iteritems(grp_exp):
+                if 'args' not in data:  # default: call with no extra args
+                    data['args'] = []
+
+                if 'out_type' in data:  # explicit expected dtype override
+                    out_type = data['out_type']
+                else:
+                    out_type = dtype  # otherwise expect the input dtype
 
-        exp = data['df']
-        df_out = pd.DataFrame(exp)
+                exp = data['df']
+                df_out = pd.DataFrame(exp)
 
-        df_out['b'] = df_out.b.astype(out_type)
-        df_out.set_index('a', inplace=True)
+                df_out['b'] = df_out.b.astype(out_type)
+                df_out.set_index('a', inplace=True)
 
-        grpd = df.groupby('a')
-        t = getattr(grpd, method)(*data['args'])
-        assert_frame_equal(t, df_out)
+                grpd = df_in.groupby('a')
+                t = getattr(grpd, method)(*data['args'])  # dispatch by name
+                assert_frame_equal(t, df_out)
 
     def test_groupby_non_arithmetic_agg_intlike_precision(self):
         # GH9311, GH6620