From 6edfb839f51ebaa53b33f2e3c57a2007738b0573 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 23 Jan 2022 13:49:20 -0500 Subject: [PATCH 1/6] faster groupby.diff --- asv_bench/benchmarks/groupby.py | 1 + pandas/core/groupby/groupby.py | 41 +++++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index ff58e382a9ba2..b2375a0bdbf9a 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -417,6 +417,7 @@ class GroupByMethods: "cumprod", "cumsum", "describe", + "diff", "ffill", "first", "head", diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e4c5541468629..3942163e4a6bd 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3452,6 +3452,47 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): ) return res + @final + @Substitution(name="groupby") + @Appender(_common_see_also) + def diff(self, periods=1, axis=0): + """ + First discrete difference of element. + + Calculates the difference of each element compared with another + element in the group (default is element in previous row). + + Parameters + ---------- + periods : int, default 1 + Periods to shift for calculating difference, accepts negative values. + axis : int, default 0 + Take difference over rows (0) or columns (1). + + Returns + ------- + Series or DataFrame + First differences. + """ + if axis != 0: + return self.apply(lambda x: x.diff(periods=periods, axis=axis)) + + obj = self._obj_with_exclusions + shifted = self.shift(periods=periods, axis=axis) + + # GH45562 - to retain existing behavior and match behavior of Series.diff(), + # int8 and int16 are coerced to float32 rather than float64. 
+ dtypes_to_f32 = ["int8", "int16"] + if obj.ndim == 1: + if obj.dtype in dtypes_to_f32: + shifted = shifted.astype("float32") + else: + mask = obj.dtypes.astype(str).isin(dtypes_to_f32).values + if mask.any(): + shifted.loc[:, mask] = shifted.loc[:, mask].astype("float32") + + return obj - shifted + @final @Substitution(name="groupby") @Appender(_common_see_also) From 3badfadb78dbebc35ba13827fa0967087d76f75c Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 23 Jan 2022 14:48:37 -0500 Subject: [PATCH 2/6] whatsnew --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 373f4c666a116..63159c8116c3b 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -160,6 +160,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`.GroupBy.transform` for some user-defined DataFrame -> Series functions (:issue:`45387`) - Performance improvement in :meth:`DataFrame.duplicated` when subset consists of only one column (:issue:`45236`) +- Performance improvement in :meth:`.GroupBy.diff` (:issue:`16706`) - .. 
--------------------------------------------------------------------------- From 1c730a086a919bafe9aa71d863157d746c0a4494 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 23 Jan 2022 15:52:08 -0500 Subject: [PATCH 3/6] fix type coercion, add test --- pandas/core/groupby/groupby.py | 6 ++--- .../tests/groupby/test_groupby_shift_diff.py | 25 ++++++++++++++++++- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 3f84facc2a221..7fa53e83deabe 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3519,9 +3519,9 @@ def diff(self, periods=1, axis=0): if obj.dtype in dtypes_to_f32: shifted = shifted.astype("float32") else: - mask = obj.dtypes.astype(str).isin(dtypes_to_f32).values - if mask.any(): - shifted.loc[:, mask] = shifted.loc[:, mask].astype("float32") + to_coerce = [c for c, dtype in obj.dtypes.items() if dtype in dtypes_to_f32] + if len(to_coerce): + shifted = shifted.astype({c: "float32" for c in to_coerce}) return obj - shifted diff --git a/pandas/tests/groupby/test_groupby_shift_diff.py b/pandas/tests/groupby/test_groupby_shift_diff.py index c989c0e0c94cd..7ffee412e3cdf 100644 --- a/pandas/tests/groupby/test_groupby_shift_diff.py +++ b/pandas/tests/groupby/test_groupby_shift_diff.py @@ -69,7 +69,7 @@ def test_group_shift_lose_timezone(): tm.assert_series_equal(result, expected) -def test_group_diff_real(any_real_numpy_dtype): +def test_group_diff_real_series(any_real_numpy_dtype): df = DataFrame( {"a": [1, 2, 3, 3, 2], "b": [1, 2, 3, 4, 5]}, dtype=any_real_numpy_dtype, @@ -82,6 +82,29 @@ def test_group_diff_real(any_real_numpy_dtype): tm.assert_series_equal(result, expected) +def test_group_diff_real_frame(any_real_numpy_dtype): + df = DataFrame( + { + "a": [1, 2, 3, 3, 2], + "b": [1, 2, 3, 4, 5], + "c": [1, 2, 3, 4, 6], + }, + dtype=any_real_numpy_dtype, + ) + result = df.groupby("a").diff() + exp_dtype = "float" + if any_real_numpy_dtype in 
["int8", "int16", "float32"]: + exp_dtype = "float32" + expected = DataFrame( + { + "b": [np.nan, np.nan, np.nan, 1.0, 3.0], + "c": [np.nan, np.nan, np.nan, 1.0, 4.0], + }, + dtype=exp_dtype, + ) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( "data", [ From 3d5ca7a93282d2caea6d48579d09a7464bbe5fe1 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 26 Feb 2022 15:18:35 -0500 Subject: [PATCH 4/6] typing --- pandas/core/groupby/groupby.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 44dacf0d00615..ace4fdb83045f 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -3494,7 +3494,7 @@ def shift(self, periods=1, freq=None, axis=0, fill_value=None): @final @Substitution(name="groupby") @Appender(_common_see_also) - def diff(self, periods=1, axis=0): + def diff(self, periods: int = 1, axis: int = 0) -> Series | DataFrame: """ First discrete difference of element. 
From c4459b4e150302a0fb3ee4c101fa63a77c902467 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sat, 26 Feb 2022 15:19:40 -0500 Subject: [PATCH 5/6] add int16 to GroupByMethods asv --- asv_bench/benchmarks/groupby.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 247c20e54d35a..92b2ab99476d0 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -405,7 +405,7 @@ class GroupByMethods: param_names = ["dtype", "method", "application", "ncols"] params = [ - ["int", "float", "object", "datetime", "uint"], + ["int", "int16", "float", "object", "datetime", "uint"], [ "all", "any", @@ -479,7 +479,7 @@ def setup(self, dtype, method, application, ncols): values = rng.take(taker, axis=0) if dtype == "int": key = np.random.randint(0, size, size=size) - elif dtype == "uint": + elif dtype in ("int16", "uint"): key = np.random.randint(0, size, size=size, dtype=dtype) elif dtype == "float": key = np.concatenate( From 8e990c0560e21475f9622bf4b809c7bd6f00e6e7 Mon Sep 17 00:00:00 2001 From: Luke Manley Date: Sun, 27 Feb 2022 20:44:44 -0500 Subject: [PATCH 6/6] fix asv --- asv_bench/benchmarks/groupby.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py index 92b2ab99476d0..cf6f3f92068e8 100644 --- a/asv_bench/benchmarks/groupby.py +++ b/asv_bench/benchmarks/groupby.py @@ -18,6 +18,7 @@ method_blocklist = { "object": { + "diff", "median", "prod", "sem",