diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
index 9895afdc4c7b5..9c4bb0980b569 100644
--- a/doc/source/whatsnew/v0.20.0.txt
+++ b/doc/source/whatsnew/v0.20.0.txt
@@ -365,7 +365,7 @@ Performance Improvements
 - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`)
 - Improved performance of timeseries plotting with an irregular DatetimeIndex
   (or with ``compat_x=True``) (:issue:`15073`).
-- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`)
+- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`, :issue:`15109`)
 - When reading buffer object in ``read_sas()`` method without specified format,
   filepath string is inferred rather than buffer object.
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index e4edbcacd5b0a..99220232114ce 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -18,13 +18,13 @@
     is_timedelta64_dtype, is_datetime64_dtype,
     is_categorical_dtype,
     is_datetimelike,
-    is_datetime_or_timedelta_dtype,
     is_datetime64_any_dtype,
     is_bool, is_integer_dtype,
     is_complex_dtype,
     is_bool_dtype,
     is_scalar,
     is_list_like,
+    needs_i8_conversion,
     _ensure_float64,
     _ensure_platform_int,
     _ensure_int64,
@@ -1876,15 +1876,21 @@ def _cython_operation(self, kind, values, how, axis):
                                       "supported for the 'how' argument")
 
         out_shape = (self.ngroups,) + values.shape[1:]
 
+        is_datetimelike = needs_i8_conversion(values.dtype)
         is_numeric = is_numeric_dtype(values.dtype)
-        if is_datetime_or_timedelta_dtype(values.dtype):
+        if is_datetimelike:
             values = values.view('int64')
             is_numeric = True
         elif is_bool_dtype(values.dtype):
             values = _ensure_float64(values)
         elif is_integer_dtype(values):
-            values = values.astype('int64', copy=False)
+            # we use iNaT for the missing value on ints
+            # so pre-convert to guard this condition
+            if (values == tslib.iNaT).any():
+                values = _ensure_float64(values)
+            else:
+                values = values.astype('int64', copy=False)
         elif is_numeric and not is_complex_dtype(values):
             values = _ensure_float64(values)
         else:
@@ -1913,20 +1919,20 @@ def _cython_operation(self, kind, values, how, axis):
                                  fill_value=np.nan)
             counts = np.zeros(self.ngroups, dtype=np.int64)
             result = self._aggregate(
-                result, counts, values, labels, func, is_numeric)
+                result, counts, values, labels, func, is_numeric,
+                is_datetimelike)
         elif kind == 'transform':
             result = _maybe_fill(np.empty_like(values, dtype=out_dtype),
                                  fill_value=np.nan)
 
-            # temporary storange for running-total type tranforms
-            accum = np.empty(out_shape, dtype=out_dtype)
             result = self._transform(
-                result, accum, values, labels, func, is_numeric)
+                result, values, labels, func, is_numeric, is_datetimelike)
 
         if is_integer_dtype(result):
-            if len(result[result == tslib.iNaT]) > 0:
+            mask = result == tslib.iNaT
+            if mask.any():
                 result = result.astype('float64')
-                result[result == tslib.iNaT] = np.nan
+                result[mask] = np.nan
 
         if kind == 'aggregate' and \
                 self._filter_empty_groups and not counts.all():
@@ -1962,7 +1968,7 @@ def transform(self, values, how, axis=0):
         return self._cython_operation('transform', values, how, axis)
 
     def _aggregate(self, result, counts, values, comp_ids, agg_func,
-                   is_numeric):
+                   is_numeric, is_datetimelike):
         if values.ndim > 3:
             # punting for now
             raise NotImplementedError("number of dimensions is currently "
@@ -1977,8 +1983,9 @@ def _aggregate(self, result, counts, values, comp_ids, agg_func,
 
         return result
 
-    def _transform(self, result, accum, values, comp_ids, transform_func,
-                   is_numeric):
+    def _transform(self, result, values, comp_ids, transform_func,
+                   is_numeric, is_datetimelike):
+        comp_ids, _, ngroups = self.group_info
 
         if values.ndim > 3:
             # punting for now
@@ -1989,9 +1996,9 @@ def _transform(self, result, accum, values, comp_ids, transform_func,
 
                 chunk = chunk.squeeze()
                 transform_func(result[:, :, i], values,
-                               comp_ids, accum)
+                               comp_ids, is_datetimelike)
         else:
-            transform_func(result, values, comp_ids, accum)
+            transform_func(result, values, comp_ids, is_datetimelike)
 
         return result
 
diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in
index 4b031afed3d43..fda1e51bd2b1f 100644
--- a/pandas/src/algos_groupby_helper.pxi.in
+++ b/pandas/src/algos_groupby_helper.pxi.in
@@ -574,16 +574,18 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
 def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                           ndarray[{{dest_type2}}, ndim=2] values,
                           ndarray[int64_t] labels,
-                          ndarray[{{dest_type2}}, ndim=2] accum):
+                          bint is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
         {{dest_type2}} val, min_val = 0
+        ndarray[{{dest_type2}}, ndim=2] accum
         int64_t lab
 
     N, K = (<object> values).shape
+    accum = np.empty_like(values)
     accum.fill({{inf_val}})
 
     with nogil:
@@ -600,7 +602,7 @@ def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         accum[lab, j] = min_val
                         out[i, j] = accum[lab, j]
                 # val = nan
-                else:
+                elif is_datetimelike:
                     out[i, j] = {{nan_val}}
 
 
@@ -609,16 +611,18 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                           ndarray[{{dest_type2}}, ndim=2] values,
                           ndarray[int64_t] labels,
-                          ndarray[{{dest_type2}}, ndim=2] accum):
+                          bint is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
         {{dest_type2}} val, max_val = 0
+        ndarray[{{dest_type2}}, ndim=2] accum
         int64_t lab
 
     N, K = (<object> values).shape
+    accum = np.empty_like(values)
     accum.fill(-{{inf_val}})
 
     with nogil:
@@ -635,7 +639,7 @@ def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out,
                         accum[lab, j] = max_val
                         out[i, j] = accum[lab, j]
                 # val = nan
-                else:
+                elif is_datetimelike:
                     out[i, j] = {{nan_val}}
 
 {{endfor}}
@@ -682,17 +686,18 @@ def group_median_float64(ndarray[float64_t, ndim=2] out,
 def group_cumprod_float64(float64_t[:, :] out,
                           float64_t[:, :] values,
                           int64_t[:] labels,
-                          float64_t[:, :] accum):
+                          bint is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
         float64_t val
+        float64_t[:, :] accum
         int64_t lab
 
     N, K = (<object> values).shape
-    accum = np.ones_like(accum)
+    accum = np.ones_like(values)
 
     with nogil:
         for i in range(N):
@@ -712,17 +717,18 @@ def group_cumsum(numeric[:, :] out,
                  numeric[:, :] values,
                  int64_t[:] labels,
-                 numeric[:, :] accum):
+                 is_datetimelike):
     """
     Only transforms on axis=0
     """
     cdef:
         Py_ssize_t i, j, N, K, size
         numeric val
+        numeric[:, :] accum
         int64_t lab
 
     N, K = (<object> values).shape
-    accum = np.zeros_like(accum)
+    accum = np.zeros_like(values)
 
     with nogil:
         for i in range(N):
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 68689189a92b2..ffb6025163a6b 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -5507,13 +5507,13 @@ def test_cython_group_transform_algos(self):
         ops = [(pd.algos.group_cumprod_float64, np.cumproduct, [np.float64]),
                (pd.algos.group_cumsum, np.cumsum, dtypes)]
 
+        is_datetimelike = False
         for pd_op, np_op, dtypes in ops:
             for dtype in dtypes:
                 data = np.array([[1], [2], [3], [4]], dtype=dtype)
                 ans = np.zeros_like(data)
-                accum = np.array([[0]], dtype=dtype)
                 labels = np.array([0, 0, 0, 0], dtype=np.int64)
-                pd_op(ans, data, labels, accum)
+                pd_op(ans, data, labels, is_datetimelike)
                 self.assert_numpy_array_equal(np_op(data), ans[:, 0],
                                               check_dtype=False)
 
@@ -5521,25 +5521,24 @@ def test_cython_group_transform_algos(self):
         labels = np.array([0, 0, 0, 0, 0], dtype=np.int64)
 
         data = np.array([[1], [2], [3], [np.nan], [4]], dtype='float64')
-        accum = np.array([[0.0]])
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        pd.algos.group_cumprod_float64(actual, data, labels, accum)
+        pd.algos.group_cumprod_float64(actual, data, labels, is_datetimelike)
         expected = np.array([1, 2, 6, np.nan, 24], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
-        accum = np.array([[0.0]])
         actual = np.zeros_like(data)
         actual.fill(np.nan)
-        pd.algos.group_cumsum(actual, data, labels, accum)
+        pd.algos.group_cumsum(actual, data, labels, is_datetimelike)
         expected = np.array([1, 3, 6, np.nan, 10], dtype='float64')
         self.assert_numpy_array_equal(actual[:, 0], expected)
 
         # timedelta
+        is_datetimelike = True
         data = np.array([np.timedelta64(1, 'ns')] * 5, dtype='m8[ns]')[:, None]
-        accum = np.array([[0]], dtype='int64')
         actual = np.zeros_like(data, dtype='int64')
-        pd.algos.group_cumsum(actual, data.view('int64'), labels, accum)
+        pd.algos.group_cumsum(actual, data.view('int64'), labels,
+                              is_datetimelike)
         expected = np.array([np.timedelta64(1, 'ns'), np.timedelta64(
             2, 'ns'), np.timedelta64(3, 'ns'), np.timedelta64(4, 'ns'),
             np.timedelta64(5, 'ns')])
@@ -5965,12 +5964,9 @@ def test_cummin_cummax(self):
         df.loc[[2, 6], 'B'] = min_val
         expected.loc[[2, 3, 6, 7], 'B'] = min_val
         result = df.groupby('A').cummin()
-
-        # TODO: GH 15019
-        # overwriting NaNs
-        # tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
         expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()
-        # tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # cummax
         expected = pd.DataFrame({'B': expected_maxs}).astype(dtype)
@@ -5983,13 +5979,9 @@ def test_cummin_cummax(self):
         df.loc[[2, 6], 'B'] = max_val
         expected.loc[[2, 3, 6, 7], 'B'] = max_val
         result = df.groupby('A').cummax()
-
-        # TODO: GH 15019
-        # overwriting NaNs
-        # tm.assert_frame_equal(result, expected)
-
+        tm.assert_frame_equal(result, expected)
         expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame()
-        # tm.assert_frame_equal(result, expected)
+        tm.assert_frame_equal(result, expected)
 
         # Test nan in some values
         base_df.loc[[0, 2, 4, 6], 'B'] = np.nan
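
Illustration (not part of the patch): the re-enabled assertions in
test_cummin_cummax exercise the NaN-handling fix tracked by the GH 15019
TODO removed above. A minimal sketch of the expected behavior, assuming a
build with this patch applied; the frame below is hypothetical rather than
taken from the test suite:

    import numpy as np
    import pandas as pd
    import pandas.util.testing as tm

    # NaNs interspersed within each group; the cummin kernel no longer
    # overwrites the running minimum at NaN positions.
    df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2],
                       'B': [3, np.nan, 4, np.nan, 2, np.nan, 3, np.nan]})

    result = df.groupby('A').cummin()
    expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame()

    # NaN rows stay NaN; other rows carry the per-group running minimum,
    # matching a plain per-group Series.cummin().
    tm.assert_frame_equal(result, expected)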
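
Similarly, a hedged sketch of the case the new iNaT guard in
_cython_operation targets: integer input that happens to contain the iNaT
sentinel (the minimum int64 value) is pre-converted to float64 so that the
post-processing step does not mistake a genuine value for a missing one.
The data here is hypothetical:

    import numpy as np
    import pandas as pd

    # iNaT is the int64 sentinel pandas uses for missing datetimelike
    # values; it equals the minimum representable int64.
    iNaT = np.iinfo(np.int64).min

    df = pd.DataFrame({'A': [1, 1, 2], 'B': [iNaT, 5, 7]})

    # With the guard, the cython op runs on float64, so the sentinel
    # should survive as a large negative float instead of being turned
    # into NaN by the integer-result fixup (result dtype: float64).
    result = df.groupby('A').B.cummin()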