From 5e8ba636b1980655a3457a8b14ad96695b3aa76c Mon Sep 17 00:00:00 2001 From: Matt Roeschke Date: Tue, 3 Jan 2017 23:57:59 -0800 Subject: [PATCH] PERF: Cythonize Groupby.cummin/cummax (#15048) pep8, removed args, kwargs add whatsnew + test + changed logic small error in algo Use a more obvious test Fixed algo & test passed Add dtypes test Add additional tests handle nan case Adapt max/min for different dtypes + tests remove uncessary comments Added test & adjust algorithm Fix linting errors --- asv_bench/benchmarks/groupby.py | 3 - doc/source/whatsnew/v0.20.0.txt | 2 +- pandas/core/groupby.py | 25 +++++++- pandas/src/algos_groupby_helper.pxi.in | 70 +++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 87 +++++++++++++++++++++++--- 5 files changed, 173 insertions(+), 14 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index ad58cd0fc6d70..597b040b8075c 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -690,6 +690,3 @@ def time_shift(self): def time_transform_dataframe(self): # GH 12737 self.df_nans.groupby('key').transform('first') - - - diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index c9ea7b427b3f2..e7f61289db4fb 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -283,7 +283,7 @@ Performance Improvements - Increased performance of ``pd.factorize()`` by releasing the GIL with ``object`` dtype when inferred as strings (:issue:`14859`) - Improved performance of timeseries plotting with an irregular DatetimeIndex (or with ``compat_x=True``) (:issue:`15073`). - +- Improved performance of ``groupby().cummin()`` and ``groupby().cummax()`` (:issue:`15048`) - When reading buffer object in ``read_sas()`` method without specified format, filepath string is inferred rather than buffer object. diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 7eba32b4932d0..700e279cb0030 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -75,7 +75,7 @@ 'last', 'first', 'head', 'tail', 'median', 'mean', 'sum', 'min', 'max', - 'cumsum', 'cumprod', 'cummin', 'cummax', 'cumcount', + 'cumcount', 'resample', 'describe', 'rank', 'quantile', @@ -97,7 +97,8 @@ _dataframe_apply_whitelist = \ _common_apply_whitelist | frozenset(['dtypes', 'corrwith']) -_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift']) +_cython_transforms = frozenset(['cumprod', 'cumsum', 'shift', + 'cummin', 'cummax']) def _groupby_function(name, alias, npfunc, numeric_only=True, @@ -1415,6 +1416,24 @@ def cumsum(self, axis=0, *args, **kwargs): return self._cython_transform('cumsum') + @Substitution(name='groupby') + @Appender(_doc_template) + def cummin(self, axis=0): + """Cumulative min for each group""" + if axis != 0: + return self.apply(lambda x: np.minimum.accumulate(x, axis)) + + return self._cython_transform('cummin') + + @Substitution(name='groupby') + @Appender(_doc_template) + def cummax(self, axis=0): + """Cumulative max for each group""" + if axis != 0: + return self.apply(lambda x: np.maximum.accumulate(x, axis)) + + return self._cython_transform('cummax') + @Substitution(name='groupby') @Appender(_doc_template) def shift(self, periods=1, freq=None, axis=0): @@ -1752,6 +1771,8 @@ def get_group_levels(self): 'transform': { 'cumprod': 'group_cumprod', 'cumsum': 'group_cumsum', + 'cummin': 'group_cummin', + 'cummax': 'group_cummax', } } diff --git a/pandas/src/algos_groupby_helper.pxi.in b/pandas/src/algos_groupby_helper.pxi.in index 5c704436ce3a0..70862e198edba 100644 --- a/pandas/src/algos_groupby_helper.pxi.in +++ b/pandas/src/algos_groupby_helper.pxi.in @@ -568,6 +568,76 @@ def group_min_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, else: out[i, j] = minx[i, j] + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummin_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels, + ndarray[{{dest_type2}}, ndim=2] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + {{dest_type2}} val, min_val + int64_t lab + + N, K = ( values).shape + accum.fill({{inf_val}}) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + if val < accum[lab, j]: + min_val = val + accum[lab, j] = min_val + out[i, j] = accum[lab, j] + # val = nan + else: + out[i, j] = {{nan_val}} + + +@cython.boundscheck(False) +@cython.wraparound(False) +def group_cummax_{{name}}(ndarray[{{dest_type2}}, ndim=2] out, + ndarray[{{dest_type2}}, ndim=2] values, + ndarray[int64_t] labels, + ndarray[{{dest_type2}}, ndim=2] accum): + """ + Only transforms on axis=0 + """ + cdef: + Py_ssize_t i, j, N, K, size + {{dest_type2}} val, max_val + int64_t lab + + N, K = ( values).shape + accum.fill(-{{inf_val}}) + + with nogil: + for i in range(N): + lab = labels[i] + + if lab < 0: + continue + for j in range(K): + val = values[i, j] + if val == val: + if val > accum[lab, j]: + max_val = val + accum[lab, j] = max_val + out[i, j] = accum[lab, j] + # val = nan + else: + out[i, j] = {{nan_val}} + {{endfor}} #---------------------------------------------------------------------- diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e87b5d04271e8..3cde3b81fea3b 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -4977,10 +4977,6 @@ def test_groupby_whitelist(self): 'max', 'head', 'tail', - 'cumsum', - 'cumprod', - 'cummin', - 'cummax', 'cumcount', 'resample', 'describe', @@ -5018,10 +5014,6 @@ def test_groupby_whitelist(self): 'max', 'head', 'tail', - 'cumsum', - 'cumprod', - 'cummin', - 'cummax', 'cumcount', 'resample', 'describe', @@ -5777,6 +5769,85 @@ def test_agg_over_numpy_arrays(self): assert_frame_equal(result, expected) + def test_cummin_cummax(self): + # GH 15048 + num_types = [np.int32, np.int64, np.float32, np.float64] + num_mins = [np.iinfo(np.int32).min, np.iinfo(np.int64).min, + np.finfo(np.float32).min, np.finfo(np.float64).min] + num_max = [np.iinfo(np.int32).max, np.iinfo(np.int64).max, + np.finfo(np.float32).max, np.finfo(np.float64).max] + base_df = pd.DataFrame({'A': [1, 1, 1, 1, 2, 2, 2, 2], + 'B': [3, 4, 3, 2, 2, 3, 2, 1]}) + expected_mins = [3, 3, 3, 2, 2, 2, 2, 1] + expected_maxs = [3, 4, 4, 4, 2, 3, 3, 3] + + for dtype, min_val, max_val in zip(num_types, num_mins, num_max): + df = base_df.astype(dtype) + + # cummin + expected = pd.DataFrame({'B': expected_mins}).astype(dtype) + result = df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + result = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test cummin w/ min value for dtype + df.loc[[2, 6], 'B'] = min_val + expected.loc[[2, 3, 6, 7], 'B'] = min_val + result = df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + expected = df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(result, expected) + + # cummax + expected = pd.DataFrame({'B': expected_maxs}).astype(dtype) + result = df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + result = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test cummax w/ max value for dtype + df.loc[[2, 6], 'B'] = max_val + expected.loc[[2, 3, 6, 7], 'B'] = max_val + result = df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + expected = df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(result, expected) + + # Test nan in some values + base_df.loc[[0, 2, 4, 6], 'B'] = np.nan + expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 2, + np.nan, 3, np.nan, 1]}) + result = base_df.groupby('A').cummin() + tm.assert_frame_equal(result, expected) + expected = (base_df.groupby('A') + .B + .apply(lambda x: x.cummin()) + .to_frame()) + tm.assert_frame_equal(result, expected) + + expected = pd.DataFrame({'B': [np.nan, 4, np.nan, 4, + np.nan, 3, np.nan, 3]}) + result = base_df.groupby('A').cummax() + tm.assert_frame_equal(result, expected) + expected = (base_df.groupby('A') + .B + .apply(lambda x: x.cummax()) + .to_frame()) + tm.assert_frame_equal(result, expected) + + # Test nan in entire column + base_df['B'] = np.nan + expected = pd.DataFrame({'B': [np.nan] * 8}) + result = base_df.groupby('A').cummin() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').B.apply(lambda x: x.cummin()).to_frame() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').cummax() + tm.assert_frame_equal(expected, result) + result = base_df.groupby('A').B.apply(lambda x: x.cummax()).to_frame() + tm.assert_frame_equal(expected, result) + def assert_fp_equal(a, b): assert (np.abs(a - b) < 1e-12).all()