Simplified groupby Cython calls for ffill/bfill

WillAyd · WillAyd · commit e46423030333 · 2018-02-13T22:31:04.000-08:00
diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in
@@ -1105,3 +1105,58 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
                     out[ii] = -1
 
                 label_indexer[lab, idxer_slot] = ii
+
+@cython.wraparound(False)
+@cython.boundscheck(False)
+def group_fillna_indexer(ndarray[int64_t] out,
+                         ndarray[uint8_t] mask,
+                         ndarray[int64_t] labels,
+                         object method,
+                         int64_t limit):
+    """Fills values forwards or backwards within a group
+
+    Parameters
+    ----------
+    out : array of int64_t values which this method will write its results to
+        Each entry is the position to take from, or -1 if nothing fills it
+    mask : array of uint8_t values where a 1 indicates a missing value
+    labels : array containing unique label for each group, with its ordering
+        matching up to the corresponding record in `out` and `mask`
+    method : {'ffill', 'bfill'}
+        Direction for fill to be applied (forwards or backwards, respectively)
+    limit : Consecutive values to fill before stopping, or -1 for no limit
+
+    Notes
+    -----
+    This method modifies the `out` parameter rather than returning an object
+    """
+    cdef:
+        Py_ssize_t i, N
+        ndarray[int64_t] sorted_labels
+        int64_t curr_fill_idx=-1
+        int64_t idx, filled_vals=0
+
+    N = len(out)
+    # Stable sort keeps each group's rows in order; cast argsort's intp result
+    sorted_labels = np.argsort(labels,
+                               kind='mergesort').astype(np.int64, copy=False)
+    if method == 'bfill':
+        sorted_labels = sorted_labels[::-1]
+
+    with nogil:
+        for i in range(N):
+            idx = sorted_labels[i]
+            if mask[idx] == 1:  # is missing
+                # Stop filling once we've hit the limit
+                if filled_vals >= limit and limit != -1:
+                    curr_fill_idx = -1
+                filled_vals += 1
+            else:  # reset items when not missing
+                filled_vals = 0
+                curr_fill_idx = idx
+
+            out[idx] = curr_fill_idx
+            # Reset fill_idx and counter when moving to the next group
+            if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]:
+                curr_fill_idx = -1
+                filled_vals = 0
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -38,7 +38,7 @@
     _ensure_float)
 from pandas.core.dtypes.cast import maybe_downcast_to_dtype
 from pandas.core.dtypes.generic import ABCSeries
-from pandas.core.dtypes.missing import isna, notna, _maybe_fill
+from pandas.core.dtypes.missing import isna, isnull, notna, _maybe_fill
 
 from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
                               DataError, SpecificationError)
@@ -875,28 +875,21 @@ def apply(self, func, *args, **kwargs):
 
         func = self._is_builtin_func(func)
 
-        # Try to go down the Cython path first
-        try:
-            f = self.grouper._cython_functions['apply'][func]
-            return self.grouper._cython_apply(f, self._selected_obj, self.axis,
-                                              **kwargs)
-        except KeyError:
-            # this is needed so we don't try and wrap strings. If we could
-            # resolve functions to their callable functions prior, this
-            # wouldn't be needed
-            if args or kwargs:
-                if callable(func):
-
-                    @wraps(func)
-                    def f(g):
-                        with np.errstate(all='ignore'):
-                            return func(g, *args, **kwargs)
-                else:
-                    raise ValueError('func must be a callable if args or '
-                                     'kwargs are supplied and func is not '
-                                     'implemented in Cython')
+        # this is needed so we don't try and wrap strings. If we could
+        # resolve functions to their callable functions prior, this
+        # wouldn't be needed
+        if args or kwargs:
+            if callable(func):
+
+                @wraps(func)
+                def f(g):
+                    with np.errstate(all='ignore'):
+                        return func(g, *args, **kwargs)
             else:
-                f = func
+                raise ValueError('func must be a callable if args or '
+                                 'kwargs are supplied')
+        else:
+            f = func
 
         # ignore SettingWithCopy here in case the user mutates
         with option_context('mode.chained_assignment', None):
@@ -1462,6 +1455,25 @@ def expanding(self, *args, **kwargs):
         from pandas.core.window import ExpandingGroupby
         return ExpandingGroupby(self, *args, **kwargs)
 
+    def _fill(self, how, limit=None):
+        """Fill NA values within each group; `how` is 'ffill' or 'bfill'."""
+        labels, _, _ = self.grouper.group_info
+        # Cython needs an int; -1 is its sentinel for "no limit"
+        limit = -1 if limit is None else limit
+
+        output = {}
+        if type(self) is DataFrameGroupBy:
+            # copy the grouper-named columns from the original object as-is
+            for nm in self.grouper.names:
+                output[nm] = self.obj[nm].values
+        for name, obj in self._iterate_slices():
+            values = obj.values
+            indexer = np.zeros_like(labels)  # overwritten by the Cython call
+            mask = isnull(values).view(np.uint8)
+            libgroupby.group_fillna_indexer(indexer, mask, labels, how, limit)
+            output[name] = algorithms.take_nd(values, indexer)
+        return self._wrap_transformed_output(output)
+
     @Substitution(name='groupby')
     def pad(self, limit=None):
         """
@@ -1479,7 +1491,7 @@ def pad(self, limit=None):
         Series.fillna
         DataFrame.fillna
         """
-        return self.apply('ffill', limit=limit)
+        return self._fill('ffill', limit=limit)
     ffill = pad
 
     @Substitution(name='groupby')
@@ -1499,7 +1511,7 @@ def backfill(self, limit=None):
         Series.fillna
         DataFrame.fillna
         """
-        return self.apply('bfill', limit=limit)
+        return self._fill('bfill', limit=limit)
     bfill = backfill
 
     @Substitution(name='groupby')
@@ -2039,38 +2051,6 @@ def _get_group_keys(self):
                                           self.levels,
                                           self.labels)
 
-    def _cython_apply(self, ftype, data, axis, **kwargs):
-        def _generate_output(ser):
-            # duplicative of _get_cython_function; needs refactor
-            dtype_str = ser.dtype.name
-            values = ser.values[:, None]
-            func = afunc = self._get_func(ftype['name'], dtype_str)
-            f = ftype.get('f')
-
-            def wrapper(*args, **kwargs):
-                return f(afunc, *args, **kwargs)
-
-            func = wrapper
-            labels, _, _ = self.group_info
-
-            result = _maybe_fill(np.empty_like(values, dtype=dtype_str),
-                                 fill_value=np.nan)
-            func(result, values, labels, **kwargs)
-
-            return result[:, 0]
-
-        # Using introspection to determine result; not ideal needs refactor
-        if type(data) is Series:
-            return Series(_generate_output(data), name=data.name)
-        else:
-            output = collections.OrderedDict()
-            for col in data.columns:
-                if col in self.names:
-                    output[col] = data[col].values
-                else:
-                    output[col] = _generate_output(data[col])
-            return DataFrame(output, index=data.index)
-
     def apply(self, f, data, axis=0):
         mutated = self.mutated
         splitter = self._get_splitter(data, axis=axis)
@@ -2267,22 +2247,6 @@ def get_group_levels(self):
                     kwargs.get('na_option', 'keep')
                 )
             }
-        },
-        'apply': {
-            'ffill': {
-                'name': 'group_fillna',
-                'f': lambda func, a, b, c, **kwargs: func(
-                    a, b, c,
-                    'ffill', kwargs['limit'] if kwargs['limit'] else -1
-                )
-            },
-            'bfill': {
-                'name': 'group_fillna',
-                'f': lambda func, a, b, c, **kwargs: func(
-                    a, b, c,
-                    'bfill', kwargs['limit'] if kwargs['limit'] else -1
-                )
-            }
         }
     }
 
@@ -2301,28 +2265,27 @@ def _is_builtin_func(self, arg):
         """
         return SelectionMixin._builtin_table.get(arg, arg)
 
-    def _get_func(self, fname, dtype_str=None, is_numeric=False):
-        # see if there is a fused-type version of function
-        # only valid for numeric
-        f = getattr(libgroupby, fname, None)
-        if f is not None and is_numeric:
-            return f
-
-        # otherwise find dtype-specific version, falling back to object
-        for dt in [dtype_str, 'object']:
-            f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None)
-            if f is not None:
-                return f
-
     def _get_cython_function(self, kind, how, values, is_numeric):
 
         dtype_str = values.dtype.name
 
+        def get_func(fname):
+            """Look up the libgroupby routine named `fname` (None if absent)."""
+            # a fused-type version of the function is only valid for numeric
+            f = getattr(libgroupby, fname, None)
+            if f is not None and is_numeric:
+                return f
+            # otherwise find dtype-specific version, falling back to object
+            for dt in [dtype_str, 'object']:
+                # format with the loop variable so the object fallback is tried
+                f = getattr(libgroupby, "%s_%s" % (fname, dt), None)
+                if f is not None:
+                    return f
+
         ftype = self._cython_functions[kind][how]
 
         if isinstance(ftype, dict):
-            func = afunc = self._get_func(ftype['name'], dtype_str=dtype_str,
-                                          is_numeric=is_numeric)
+            func = afunc = get_func(ftype['name'])
 
             # a sub-function
             f = ftype.get('f')
@@ -2335,8 +2298,7 @@ def wrapper(*args, **kwargs):
                 func = wrapper
 
         else:
-            func = self._get_func(ftype, dtype_str=dtype_str,
-                                  is_numeric=is_numeric)
+            func = get_func(ftype)
 
         if func is None:
             raise NotImplementedError("function is not implemented for this"