Skip to content

Commit d835057

Browse files
committed
Simplified groupby Cython calls for ffill/bfill
1 parent 6fd84db commit d835057

File tree

2 files changed

+106
-89
lines changed

2 files changed

+106
-89
lines changed

pandas/_libs/groupby_helper.pxi.in

+55
Original file line numberDiff line numberDiff line change
@@ -1023,3 +1023,58 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
10231023
out[ii] = -1
10241024

10251025
label_indexer[lab, idxer_slot] = ii
1026+
1027+
@cython.wraparound(False)
@cython.boundscheck(False)
def group_fillna_indexer(ndarray[int64_t] out,
                         ndarray[uint8_t] mask,
                         ndarray[int64_t] labels,
                         object method,
                         int64_t limit):
    """Compute, per group, the positions to take values from when filling
    missing entries forwards or backwards.

    Parameters
    ----------
    out : array of int64_t values which this method will write its results to
        For every position, the index of the record whose value should be
        used there. Positions with nothing to fill from (or past `limit`)
        are written with a value of -1
    mask : array of uint8_t values where a 1 indicates a missing value
    labels : array of int64_t group labels, one per record, aligned
        position-by-position with `mask` and `out`
    method : {'ffill', 'bfill'}
        Direction for fill to be applied (forwards or backwards, respectively)
    limit : Consecutive values to fill before stopping, or -1 for no limit

    Raises
    ------
    ValueError
        If `out`, `mask` and `labels` do not all have the same length

    Notes
    -----
    This method modifies the `out` parameter rather than returning an object
    """
    cdef:
        Py_ssize_t i, N
        ndarray[int64_t] sorted_labels
        int64_t curr_fill_idx=-1
        int64_t idx, filled_vals=0

    N = len(out)

    # Bounds checking is disabled for this function, so mismatched lengths
    # would read out of bounds instead of raising; check them up front.
    if len(mask) != N or len(labels) != N:
        raise ValueError("out, mask and labels must all have the same length")

    # A stable sort is required: records sharing a label must keep their
    # original relative order, otherwise "previous"/"next" within a group is
    # meaningless. argsort yields platform-sized ints, so cast to match the
    # declared int64_t buffer.
    sorted_labels = np.argsort(labels, kind='mergesort').astype(
        np.int64, copy=False)
    if method == 'bfill':
        # Walking the stable ordering backwards visits each group from its
        # last record to its first.
        sorted_labels = sorted_labels[::-1]

    with nogil:
        for i in range(N):
            idx = sorted_labels[i]
            if mask[idx] == 1:  # is missing
                # Stop filling once we've hit the limit
                if filled_vals >= limit and limit != -1:
                    curr_fill_idx = -1
                filled_vals += 1
            else:  # reset items when not missing
                filled_vals = 0
                curr_fill_idx = idx

            out[idx] = curr_fill_idx
            # If we move to the next group, reset
            # the fill_idx and counter
            if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]:
                curr_fill_idx = -1
                filled_vals = 0

pandas/core/groupby.py

+51-89
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
_ensure_float)
3939
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
4040
from pandas.core.dtypes.generic import ABCSeries
41-
from pandas.core.dtypes.missing import isna, notna, _maybe_fill
41+
from pandas.core.dtypes.missing import isna, isnull, notna, _maybe_fill
4242

4343
from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
4444
DataError, SpecificationError)
@@ -877,28 +877,21 @@ def apply(self, func, *args, **kwargs):
877877

878878
func = self._is_builtin_func(func)
879879

880-
# Try to go down the Cython path first
881-
try:
882-
f = self.grouper._cython_functions['apply'][func]
883-
return self.grouper._cython_apply(f, self._selected_obj, self.axis,
884-
**kwargs)
885-
except KeyError:
886-
# this is needed so we don't try and wrap strings. If we could
887-
# resolve functions to their callable functions prior, this
888-
# wouldn't be needed
889-
if args or kwargs:
890-
if callable(func):
891-
892-
@wraps(func)
893-
def f(g):
894-
with np.errstate(all='ignore'):
895-
return func(g, *args, **kwargs)
896-
else:
897-
raise ValueError('func must be a callable if args or '
898-
'kwargs are supplied and func is not '
899-
'implemented in Cython')
880+
# this is needed so we don't try and wrap strings. If we could
881+
# resolve functions to their callable functions prior, this
882+
# wouldn't be needed
883+
if args or kwargs:
884+
if callable(func):
885+
886+
@wraps(func)
887+
def f(g):
888+
with np.errstate(all='ignore'):
889+
return func(g, *args, **kwargs)
900890
else:
901-
f = func
891+
raise ValueError('func must be a callable if args or '
892+
'kwargs are supplied')
893+
else:
894+
f = func
902895

903896
# ignore SettingWithCopy here in case the user mutates
904897
with option_context('mode.chained_assignment', None):
@@ -1464,6 +1457,25 @@ def expanding(self, *args, **kwargs):
14641457
from pandas.core.window import ExpandingGroupby
14651458
return ExpandingGroupby(self, *args, **kwargs)
14661459

1460+
def _fill(self, how, limit=None):
    """Fill missing values within each group, forwards or backwards.

    Parameters
    ----------
    how : {'ffill', 'bfill'}
        Fill direction; passed straight through to
        ``libgroupby.group_fillna_indexer``.
    limit : int, optional
        Number of consecutive missing values to fill before stopping.
        ``None`` (the default) is translated to -1, which the Cython
        routine treats as "no limit".

    Returns
    -------
    The result of ``self._wrap_transformed_output`` applied to a dict
    mapping each slice name to its filled values.
    """
    labels, _, _ = self.grouper.group_info

    # Need int value for Cython
    if limit is None:
        limit = -1

    output = {}
    if isinstance(self, DataFrameGroupBy):
        # Carry the grouping columns through untouched.
        # NOTE(review): assumes every grouper name is a column of
        # self.obj - confirm for groupers that are not column keys.
        for key in self.grouper.names:
            output[key] = self.obj[key].values

    for name, obj in self._iterate_slices():
        values = obj.values
        # The Cython routine overwrites every position of `indexer` with
        # the location to take from, or -1 where nothing can be filled.
        indexer = np.zeros_like(labels)
        mask = isna(values).view(np.uint8)
        libgroupby.group_fillna_indexer(indexer, mask, labels, how, limit)
        output[name] = algorithms.take_nd(values, indexer)

    return self._wrap_transformed_output(output)
1478+
14671479
@Substitution(name='groupby')
14681480
def pad(self, limit=None):
14691481
"""
@@ -1481,7 +1493,7 @@ def pad(self, limit=None):
14811493
Series.fillna
14821494
DataFrame.fillna
14831495
"""
1484-
return self.apply('ffill', limit=limit)
1496+
return self._fill('ffill', limit=limit)
14851497
ffill = pad
14861498

14871499
@Substitution(name='groupby')
@@ -1501,7 +1513,7 @@ def backfill(self, limit=None):
15011513
Series.fillna
15021514
DataFrame.fillna
15031515
"""
1504-
return self.apply('bfill', limit=limit)
1516+
return self._fill('bfill', limit=limit)
15051517
bfill = backfill
15061518

15071519
@Substitution(name='groupby')
@@ -2041,38 +2053,6 @@ def _get_group_keys(self):
20412053
self.levels,
20422054
self.labels)
20432055

2044-
def _cython_apply(self, ftype, data, axis, **kwargs):
2045-
def _generate_output(ser):
2046-
# duplicative of _get_cython_function; needs refactor
2047-
dtype_str = ser.dtype.name
2048-
values = ser.values[:, None]
2049-
func = afunc = self._get_func(ftype['name'], dtype_str)
2050-
f = ftype.get('f')
2051-
2052-
def wrapper(*args, **kwargs):
2053-
return f(afunc, *args, **kwargs)
2054-
2055-
func = wrapper
2056-
labels, _, _ = self.group_info
2057-
2058-
result = _maybe_fill(np.empty_like(values, dtype=dtype_str),
2059-
fill_value=np.nan)
2060-
func(result, values, labels, **kwargs)
2061-
2062-
return result[:, 0]
2063-
2064-
# Using introspection to determine result; not ideal needs refactor
2065-
if type(data) is Series:
2066-
return Series(_generate_output(data), name=data.name)
2067-
else:
2068-
output = collections.OrderedDict()
2069-
for col in data.columns:
2070-
if col in self.names:
2071-
output[col] = data[col].values
2072-
else:
2073-
output[col] = _generate_output(data[col])
2074-
return DataFrame(output, index=data.index)
2075-
20762056
def apply(self, f, data, axis=0):
20772057
mutated = self.mutated
20782058
splitter = self._get_splitter(data, axis=axis)
@@ -2269,22 +2249,6 @@ def get_group_levels(self):
22692249
kwargs.get('na_option', 'keep')
22702250
)
22712251
}
2272-
},
2273-
'apply': {
2274-
'ffill': {
2275-
'name': 'group_fillna',
2276-
'f': lambda func, a, b, c, **kwargs: func(
2277-
a, b, c,
2278-
'ffill', kwargs['limit'] if kwargs['limit'] else -1
2279-
)
2280-
},
2281-
'bfill': {
2282-
'name': 'group_fillna',
2283-
'f': lambda func, a, b, c, **kwargs: func(
2284-
a, b, c,
2285-
'bfill', kwargs['limit'] if kwargs['limit'] else -1
2286-
)
2287-
}
22882252
}
22892253
}
22902254

@@ -2303,28 +2267,27 @@ def _is_builtin_func(self, arg):
23032267
"""
23042268
return SelectionMixin._builtin_table.get(arg, arg)
23052269

2306-
def _get_func(self, fname, dtype_str=None, is_numeric=False):
2307-
# see if there is a fused-type version of function
2308-
# only valid for numeric
2309-
f = getattr(libgroupby, fname, None)
2310-
if f is not None and is_numeric:
2311-
return f
2312-
2313-
# otherwise find dtype-specific version, falling back to object
2314-
for dt in [dtype_str, 'object']:
2315-
f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None)
2316-
if f is not None:
2317-
return f
2318-
23192270
def _get_cython_function(self, kind, how, values, is_numeric):
23202271

23212272
dtype_str = values.dtype.name
23222273

2274+
def get_func(fname):
    """Resolve ``fname`` to a callable in ``libgroupby``.

    Uses ``is_numeric`` and ``dtype_str`` from the enclosing scope.
    Returns ``None`` when no suitable function exists.
    """
    # see if there is a fused-type version of function
    # only valid for numeric
    f = getattr(libgroupby, fname, None)
    if f is not None and is_numeric:
        return f

    # otherwise find dtype-specific version, falling back to object.
    # The suffix must come from the loop variable; formatting with
    # dtype_str on every pass would never try the object variant.
    for dt in [dtype_str, 'object']:
        f = getattr(libgroupby, "%s_%s" % (fname, dt), None)
        if f is not None:
            return f

    return None
2286+
23232287
ftype = self._cython_functions[kind][how]
23242288

23252289
if isinstance(ftype, dict):
2326-
func = afunc = self._get_func(ftype['name'], dtype_str=dtype_str,
2327-
is_numeric=is_numeric)
2290+
func = afunc = get_func(ftype['name'])
23282291

23292292
# a sub-function
23302293
f = ftype.get('f')
@@ -2337,8 +2300,7 @@ def wrapper(*args, **kwargs):
23372300
func = wrapper
23382301

23392302
else:
2340-
func = self._get_func(ftype, dtype_str=dtype_str,
2341-
is_numeric=is_numeric)
2303+
func = get_func(ftype)
23422304

23432305
if func is None:
23442306
raise NotImplementedError("function is not implemented for this"

0 commit comments

Comments
 (0)