From af060ba9bed9014e471faaf66a315d7f96e153ec Mon Sep 17 00:00:00 2001 From: Jasmine Sandhu Date: Mon, 16 Nov 2015 15:03:49 -0800 Subject: [PATCH 1/3] ENH: GH4964 Separated array_to_roll in roll_generic In fixing GH4964, added an array_to_roll argument to roll_generic(). This will be used by rolling_apply() to work with non-float dtypes. array_to_roll defaults to None, in which case this function rolls over the input array as it did previously. --- pandas/algos.pyx | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 8569209f2e946..45dc64d81c338 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -1820,9 +1820,11 @@ def roll_quantile(ndarray[float64_t, cast=True] input, int win, return output + def roll_generic(ndarray[float64_t, cast=True] input, int win, int minp, int offset, - object func, object args, object kwargs): + object func, object args, object kwargs, + object array_to_roll=None): cdef ndarray[double_t] output, counts, bufarr cdef Py_ssize_t i, n cdef float64_t *buf @@ -1837,32 +1839,41 @@ def roll_generic(ndarray[float64_t, cast=True] input, minp = _check_minp(win, minp, n, floor=0) output = np.empty(n, dtype=float) - counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), np.array([0.] * offset))), win, minp)[offset:] + counts = roll_sum(np.concatenate((np.isfinite(input).astype(float), + np.array([0.] 
* offset))), + win, minp)[offset:] + + # default behavior is to roll over input array + if array_to_roll is None: + array_to_roll = input # truncated windows at the beginning, through first full-length window for i from 0 <= i < (int_min(win, n) - offset): if counts[i] >= minp: - output[i] = func(input[0 : (i + offset + 1)], *args, **kwargs) + output[i] = func(array_to_roll[0:(i + offset + 1)], + *args, + **kwargs) else: output[i] = NaN # remaining full-length windows - buf = input.data - bufarr = np.empty(win, dtype=float) - oldbuf = bufarr.data + # array_to_roll is a numpy array and doing a slice of contiguous data does + # not make a copy for i from (win - offset) <= i < (n - offset): - buf = buf + 1 - bufarr.data = buf if counts[i] >= minp: - output[i] = func(bufarr, *args, **kwargs) + # full length windows will start at index 1 and be of length win + output[i] = \ + func(array_to_roll[i - (win - offset) + 1:i + offset + 1], + *args, **kwargs) else: output[i] = NaN - bufarr.data = oldbuf # truncated windows at the end for i from int_max(n - offset, 0) <= i < n: if counts[i] >= minp: - output[i] = func(input[int_max(i + offset - win + 1, 0) : n], *args, **kwargs) + output[i] = func(array_to_roll[int_max(i + offset - win + 1, 0):n], + *args, + **kwargs) else: output[i] = NaN From da418efbceda1b77927d9c60647034d0f257a3a2 Mon Sep 17 00:00:00 2001 From: Jasmine Sandhu Date: Mon, 16 Nov 2015 15:09:01 -0800 Subject: [PATCH 2/3] ENH: GH4964 Add coercion kwarg to rolling_apply Added a coercion bool to rolling_apply() and _process_data_structure(). It defaults to True, in which case _process_data_structure() converts arg to float and things work as they did before this change (backwards compatible). If the user wishes to use rolling_apply() with a string array, set coercion=False. The default of coercion=True prioritizes performance. 
--- pandas/stats/moments.py | 54 +++++++++++++++++++++++++++++------------ 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/pandas/stats/moments.py b/pandas/stats/moments.py index 3cddae45e7516..c4183d748fc9c 100644 --- a/pandas/stats/moments.py +++ b/pandas/stats/moments.py @@ -355,7 +355,7 @@ def rolling_corr_pairwise(df1, df2=None, window=None, min_periods=None, def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, - how=None, args=(), kwargs={}, **kwds): + how=None, coercion=True, args=(), kwargs={}, **kwds): """ Rolling statistical measure using supplied function. Designed to be used with passed-in Cython array-based functions. @@ -374,6 +374,10 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, Whether the label should correspond with center of window how : string, default 'mean' Method for down- or re-sampling + coercion: bool flag with default True. It tries to coerce args to a float + to optimize for speed. If rolling_apply() is invoked on objects that + cannot be coerced into a float, it raises a ValueError. Be sure + to set coercion=False in this case. args : tuple Passed on to func kwargs : dict @@ -385,7 +389,7 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, """ arg = _conv_timerule(arg, freq, how) - return_hook, values = _process_data_structure(arg) + return_hook, values = _process_data_structure(arg, coercion=coercion) if values.size == 0: result = values.copy() @@ -393,9 +397,18 @@ def _rolling_moment(arg, window, func, minp, axis=0, freq=None, center=False, # actually calculate the moment. Faster way to do this? offset = int((window - 1) / 2.) 
if center else 0 additional_nans = np.array([np.NaN] * offset) - calc = lambda x: func(np.concatenate((x, additional_nans)) if center else x, - window, minp=minp, args=args, kwargs=kwargs, - **kwds) + + if coercion: + calc = lambda x: func(np.concatenate((x, additional_nans)) if + center else x, window, minp=minp, args=args, + kwargs=kwargs, **kwds) + else: + p0 = np.arange(0, len(values), dtype=float) + calc = lambda x: func(np.concatenate((p0, additional_nans)) + if center else p0, window, minp=minp, + args=args, kwargs=kwargs, + array_to_roll=x, **kwds) + if values.ndim > 1: result = np.apply_along_axis(calc, axis, values) else: @@ -423,7 +436,7 @@ def _center_window(rs, window, axis): return rs -def _process_data_structure(arg, kill_inf=True): +def _process_data_structure(arg, kill_inf=True, coercion=True): if isinstance(arg, DataFrame): return_hook = lambda v: type(arg)(v, index=arg.index, columns=arg.columns) @@ -435,12 +448,13 @@ def _process_data_structure(arg, kill_inf=True): return_hook = lambda v: v values = arg - if not issubclass(values.dtype.type, float): - values = values.astype(float) + if coercion: + if not issubclass(values.dtype.type, float): + values = values.astype(float) - if kill_inf: - values = values.copy() - values[np.isinf(values)] = np.NaN + if kill_inf: + values = values.copy() + values[np.isinf(values)] = np.NaN return return_hook, values @@ -712,7 +726,7 @@ def call_cython(arg, window, minp, args=(), kwargs={}): def rolling_apply(arg, window, func, min_periods=None, freq=None, - center=False, args=(), kwargs={}): + center=False, coercion=True, args=(), kwargs={}): """Generic moving function application. Parameters @@ -731,6 +745,10 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None, as a frequency string or DateOffset object. center : boolean, default False Whether the label should correspond with center of window + coercion: bool flag with default True. It tries to coerce args to a float + to optimize for speed. 
If rolling_apply() is invoked on objects that + cannot be coerced into a float, it raises a ValueError. Be sure + to set coercion=False in this case. args : tuple Passed on to func kwargs : dict @@ -750,11 +768,15 @@ def rolling_apply(arg, window, func, min_periods=None, freq=None, of :meth:`~pandas.Series.resample` (i.e. using the `mean`). """ offset = int((window - 1) / 2.) if center else 0 - def call_cython(arg, window, minp, args, kwargs): + + def call_cython(arg, window, minp, args, kwargs, array_to_roll=None): minp = _use_window(minp, window) - return algos.roll_generic(arg, window, minp, offset, func, args, kwargs) - return _rolling_moment(arg, window, call_cython, min_periods, freq=freq, - center=False, args=args, kwargs=kwargs) + return algos.roll_generic(arg, window, minp, offset, func, args, + kwargs, array_to_roll) + + return _rolling_moment(arg, window, call_cython, min_periods, + freq=freq, center=False, coercion=coercion, + args=args, kwargs=kwargs) def rolling_window(arg, window=None, win_type=None, min_periods=None, From ba73145695d64ef6672685fcf7c81ddc0be5d496 Mon Sep 17 00:00:00 2001 From: Jasmine Sandhu Date: Mon, 16 Nov 2015 15:15:08 -0800 Subject: [PATCH 3/3] ENH: GH4964 Add rolling_apply on strings as test --- pandas/stats/tests/test_moments.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pandas/stats/tests/test_moments.py b/pandas/stats/tests/test_moments.py index e2ed27156d2b5..69a436798e354 100644 --- a/pandas/stats/tests/test_moments.py +++ b/pandas/stats/tests/test_moments.py @@ -363,6 +363,27 @@ def roll_mean(x, window, min_periods=None, freq=None, center=False): expected = Series([1., 2., 2.]) assert_series_equal(result, expected) + def test_rolling_apply_nonfloat(self): + ''' + test rolling_apply now also works for non-float data types if coercion + is set to False. 
The return type is still float but the 'roll' + is applied to arg which no longer has to be a float + ''' + # check rolling_apply with coercion set to False + orig = Series([ord('a'), ord('b'), ord('c')], dtype=float) + s = Series(['a', 'b', 'c']) + + for min_p in (None, 0): + s_res = mom.rolling_apply(s, 2, lambda x: ord(x[-1]), + coercion=False, min_periods=min_p) + o_res = mom.rolling_apply(orig, 2, lambda x: x[-1], + coercion=False, min_periods=min_p) + + # assert that NaN values appear at same place since min_periods + # defines the NaN values. Also assert that valid answers match + assert all(np.isfinite(s_res) == np.isfinite(o_res)) + assert all(s_res[np.isfinite(s_res)] == o_res[np.isfinite(o_res)]) + def test_rolling_apply_out_of_bounds(self): # #1850 arr = np.arange(4)