Skip to content

Commit e464230

Browse files
committed
Simplified groupby Cython calls for ffill/bfill
1 parent aa030f5 commit e464230

File tree

2 files changed

+106
-89
lines changed

2 files changed

+106
-89
lines changed

pandas/_libs/groupby_helper.pxi.in

+55
Original file line numberDiff line numberDiff line change
@@ -1105,3 +1105,58 @@ def group_shift_indexer(int64_t[:] out, int64_t[:] labels,
11051105
out[ii] = -1
11061106

11071107
label_indexer[lab, idxer_slot] = ii
1108+
1109+
@cython.wraparound(False)
1110+
@cython.boundscheck(False)
1111+
def group_fillna_indexer(ndarray[int64_t] out,
1112+
ndarray[uint8_t] mask,
1113+
ndarray[int64_t] labels,
1114+
object method,
1115+
int64_t limit):
1116+
"""Fills values forwards or backwards within a group
1117+
1118+
Parameters
1119+
----------
1120+
out : array of int64_t values which this method will write its results to
1121+
Missing values will be written to with a value of -1
1122+
mask : array of uint8_t values where a 1 indicates a missing value
1123+
labels : array containing unique label for each group, with its ordering
1124+
matching up to the corresponding record in `values`
1125+
method : {'ffill', 'bfill'}
1126+
Direction for fill to be applied (forwards or backwards, respectively)
1127+
limit : Consecutive values to fill before stopping, or -1 for no limit
1128+
1129+
Notes
1130+
-----
1131+
This method modifies the `out` parameter rather than returning an object
1132+
"""
1133+
cdef:
1134+
Py_ssize_t i, N
1135+
ndarray[int64_t] sorted_labels
1136+
int64_t curr_fill_idx=-1
1137+
int64_t idx, filled_vals=0
1138+
1139+
N = len(out)
1140+
1141+
sorted_labels = np.argsort(labels)
1142+
if method == 'bfill':
1143+
sorted_labels = sorted_labels[::-1]
1144+
1145+
with nogil:
1146+
for i in range(N):
1147+
idx = sorted_labels[i]
1148+
if mask[idx] == 1: # is missing
1149+
# Stop filling once we've hit the limit
1150+
if filled_vals >= limit and limit != -1:
1151+
curr_fill_idx = -1
1152+
filled_vals += 1
1153+
else: # reset items when not missing
1154+
filled_vals = 0
1155+
curr_fill_idx = idx
1156+
1157+
out[idx] = curr_fill_idx
1158+
# If we move to the next group, reset
1159+
# the fill_idx and counter
1160+
if i == N - 1 or labels[idx] != labels[sorted_labels[i+1]]:
1161+
curr_fill_idx = -1
1162+
filled_vals = 0

pandas/core/groupby.py

+51-89
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
_ensure_float)
3939
from pandas.core.dtypes.cast import maybe_downcast_to_dtype
4040
from pandas.core.dtypes.generic import ABCSeries
41-
from pandas.core.dtypes.missing import isna, notna, _maybe_fill
41+
from pandas.core.dtypes.missing import isna, isnull, notna, _maybe_fill
4242

4343
from pandas.core.base import (PandasObject, SelectionMixin, GroupByError,
4444
DataError, SpecificationError)
@@ -875,28 +875,21 @@ def apply(self, func, *args, **kwargs):
875875

876876
func = self._is_builtin_func(func)
877877

878-
# Try to go down the Cython path first
879-
try:
880-
f = self.grouper._cython_functions['apply'][func]
881-
return self.grouper._cython_apply(f, self._selected_obj, self.axis,
882-
**kwargs)
883-
except KeyError:
884-
# this is needed so we don't try and wrap strings. If we could
885-
# resolve functions to their callable functions prior, this
886-
# wouldn't be needed
887-
if args or kwargs:
888-
if callable(func):
889-
890-
@wraps(func)
891-
def f(g):
892-
with np.errstate(all='ignore'):
893-
return func(g, *args, **kwargs)
894-
else:
895-
raise ValueError('func must be a callable if args or '
896-
'kwargs are supplied and func is not '
897-
'implemented in Cython')
878+
# this is needed so we don't try and wrap strings. If we could
879+
# resolve functions to their callable functions prior, this
880+
# wouldn't be needed
881+
if args or kwargs:
882+
if callable(func):
883+
884+
@wraps(func)
885+
def f(g):
886+
with np.errstate(all='ignore'):
887+
return func(g, *args, **kwargs)
898888
else:
899-
f = func
889+
raise ValueError('func must be a callable if args or '
890+
'kwargs are supplied')
891+
else:
892+
f = func
900893

901894
# ignore SettingWithCopy here in case the user mutates
902895
with option_context('mode.chained_assignment', None):
@@ -1462,6 +1455,25 @@ def expanding(self, *args, **kwargs):
14621455
from pandas.core.window import ExpandingGroupby
14631456
return ExpandingGroupby(self, *args, **kwargs)
14641457

1458+
def _fill(self, how, limit=None):
1459+
labels, _, _ = self.grouper.group_info
1460+
1461+
# Need int value for Cython
1462+
if limit is None:
1463+
limit = -1
1464+
output = {}
1465+
if type(self) is DataFrameGroupBy:
1466+
for nm in self.grouper.names:
1467+
output[nm] = self.obj[nm].values
1468+
for name, obj in self._iterate_slices():
1469+
indexer = np.zeros_like(labels)
1470+
mask = isnull(obj.values).view(np.uint8)
1471+
libgroupby.group_fillna_indexer(indexer, mask, labels, how,
1472+
limit)
1473+
output[name] = algorithms.take_nd(obj.values, indexer)
1474+
1475+
return self._wrap_transformed_output(output)
1476+
14651477
@Substitution(name='groupby')
14661478
def pad(self, limit=None):
14671479
"""
@@ -1479,7 +1491,7 @@ def pad(self, limit=None):
14791491
Series.fillna
14801492
DataFrame.fillna
14811493
"""
1482-
return self.apply('ffill', limit=limit)
1494+
return self._fill('ffill', limit=limit)
14831495
ffill = pad
14841496

14851497
@Substitution(name='groupby')
@@ -1499,7 +1511,7 @@ def backfill(self, limit=None):
14991511
Series.fillna
15001512
DataFrame.fillna
15011513
"""
1502-
return self.apply('bfill', limit=limit)
1514+
return self._fill('bfill', limit=limit)
15031515
bfill = backfill
15041516

15051517
@Substitution(name='groupby')
@@ -2039,38 +2051,6 @@ def _get_group_keys(self):
20392051
self.levels,
20402052
self.labels)
20412053

2042-
def _cython_apply(self, ftype, data, axis, **kwargs):
2043-
def _generate_output(ser):
2044-
# duplicative of _get_cython_function; needs refactor
2045-
dtype_str = ser.dtype.name
2046-
values = ser.values[:, None]
2047-
func = afunc = self._get_func(ftype['name'], dtype_str)
2048-
f = ftype.get('f')
2049-
2050-
def wrapper(*args, **kwargs):
2051-
return f(afunc, *args, **kwargs)
2052-
2053-
func = wrapper
2054-
labels, _, _ = self.group_info
2055-
2056-
result = _maybe_fill(np.empty_like(values, dtype=dtype_str),
2057-
fill_value=np.nan)
2058-
func(result, values, labels, **kwargs)
2059-
2060-
return result[:, 0]
2061-
2062-
# Using introspection to determine result; not ideal needs refactor
2063-
if type(data) is Series:
2064-
return Series(_generate_output(data), name=data.name)
2065-
else:
2066-
output = collections.OrderedDict()
2067-
for col in data.columns:
2068-
if col in self.names:
2069-
output[col] = data[col].values
2070-
else:
2071-
output[col] = _generate_output(data[col])
2072-
return DataFrame(output, index=data.index)
2073-
20742054
def apply(self, f, data, axis=0):
20752055
mutated = self.mutated
20762056
splitter = self._get_splitter(data, axis=axis)
@@ -2267,22 +2247,6 @@ def get_group_levels(self):
22672247
kwargs.get('na_option', 'keep')
22682248
)
22692249
}
2270-
},
2271-
'apply': {
2272-
'ffill': {
2273-
'name': 'group_fillna',
2274-
'f': lambda func, a, b, c, **kwargs: func(
2275-
a, b, c,
2276-
'ffill', kwargs['limit'] if kwargs['limit'] else -1
2277-
)
2278-
},
2279-
'bfill': {
2280-
'name': 'group_fillna',
2281-
'f': lambda func, a, b, c, **kwargs: func(
2282-
a, b, c,
2283-
'bfill', kwargs['limit'] if kwargs['limit'] else -1
2284-
)
2285-
}
22862250
}
22872251
}
22882252

@@ -2301,28 +2265,27 @@ def _is_builtin_func(self, arg):
23012265
"""
23022266
return SelectionMixin._builtin_table.get(arg, arg)
23032267

2304-
def _get_func(self, fname, dtype_str=None, is_numeric=False):
2305-
# see if there is a fused-type version of function
2306-
# only valid for numeric
2307-
f = getattr(libgroupby, fname, None)
2308-
if f is not None and is_numeric:
2309-
return f
2310-
2311-
# otherwise find dtype-specific version, falling back to object
2312-
for dt in [dtype_str, 'object']:
2313-
f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None)
2314-
if f is not None:
2315-
return f
2316-
23172268
def _get_cython_function(self, kind, how, values, is_numeric):
23182269

23192270
dtype_str = values.dtype.name
23202271

2272+
def get_func(fname):
2273+
# see if there is a fused-type version of function
2274+
# only valid for numeric
2275+
f = getattr(libgroupby, fname, None)
2276+
if f is not None and is_numeric:
2277+
return f
2278+
2279+
# otherwise find dtype-specific version, falling back to object
2280+
for dt in [dtype_str, 'object']:
2281+
f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None)
2282+
if f is not None:
2283+
return f
2284+
23212285
ftype = self._cython_functions[kind][how]
23222286

23232287
if isinstance(ftype, dict):
2324-
func = afunc = self._get_func(ftype['name'], dtype_str=dtype_str,
2325-
is_numeric=is_numeric)
2288+
func = afunc = get_func(ftype['name'])
23262289

23272290
# a sub-function
23282291
f = ftype.get('f')
@@ -2335,8 +2298,7 @@ def wrapper(*args, **kwargs):
23352298
func = wrapper
23362299

23372300
else:
2338-
func = self._get_func(ftype, dtype_str=dtype_str,
2339-
is_numeric=is_numeric)
2301+
func = get_func(ftype)
23402302

23412303
if func is None:
23422304
raise NotImplementedError("function is not implemented for this"

0 commit comments

Comments
 (0)