From 6ed841da09429e16b4d083ad883c405966e63862 Mon Sep 17 00:00:00 2001
From: krasch
Date: Mon, 24 Oct 2022 11:09:04 +0200
Subject: [PATCH 1/2] GH28635 Add ASV benchmark for resample after groupby

---
 asv_bench/benchmarks/groupby.py | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 97f48a3a6f69f..37b6500d97c28 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -14,6 +14,7 @@
     Timestamp,
     date_range,
     period_range,
+    to_timedelta,
 )
 
 from .pandas_vb_common import tm
@@ -987,4 +988,26 @@ def time_sample_weights(self):
         self.df.groupby(self.groups).sample(n=1, weights=self.weights)
 
 
+class Resample:
+    # GH 28635
+    def setup(self):
+        num_timedeltas = 100_000
+        num_groups = 5
+
+        index = MultiIndex.from_product(
+            [
+                np.arange(num_groups),
+                to_timedelta(np.arange(num_timedeltas), unit="s"),
+            ]
+        )
+
+        self.df = DataFrame(np.random.randint(0, 1000, size=(len(index))), index=index)
+
+    def time_resample(self):
+        self.df.reset_index(1).groupby(level=0).resample("10s", on="level_1").mean()
+
+    def time_resample_multiindex(self):
+        self.df.groupby(level=0).resample("10s", level=1).mean()
+
+
 from .pandas_vb_common import setup  # noqa: F401 isort:skip

From 605bd2d69a9ba2a45c9f1bffb5a8c570bd6853d0 Mon Sep 17 00:00:00 2001
From: krasch
Date: Mon, 24 Oct 2022 14:33:14 +0200
Subject: [PATCH 2/2] Decrease size of sample data, move dataset generation into setup method

---
 asv_bench/benchmarks/groupby.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/asv_bench/benchmarks/groupby.py b/asv_bench/benchmarks/groupby.py
index 37b6500d97c28..2225cbd74d718 100644
--- a/asv_bench/benchmarks/groupby.py
+++ b/asv_bench/benchmarks/groupby.py
@@ -991,23 +991,28 @@ def time_sample_weights(self):
 class Resample:
     # GH 28635
     def setup(self):
-        num_timedeltas = 100_000
-        num_groups = 5
+        num_timedeltas = 20_000
+        num_groups = 3
 
         index = MultiIndex.from_product(
             [
                 np.arange(num_groups),
                 to_timedelta(np.arange(num_timedeltas), unit="s"),
-            ]
+            ],
+            names=["groups", "timedeltas"],
         )
+        data = np.random.randint(0, 1000, size=(len(index)))
 
-        self.df = DataFrame(np.random.randint(0, 1000, size=(len(index))), index=index)
+        self.df = DataFrame(data, index=index).reset_index("timedeltas")
+        self.df_multiindex = DataFrame(data, index=index)
 
     def time_resample(self):
-        self.df.reset_index(1).groupby(level=0).resample("10s", on="level_1").mean()
+        self.df.groupby(level="groups").resample("10s", on="timedeltas").mean()
 
     def time_resample_multiindex(self):
-        self.df.groupby(level=0).resample("10s", level=1).mean()
+        self.df_multiindex.groupby(level="groups").resample(
+            "10s", level="timedeltas"
+        ).mean()
 
 
 from .pandas_vb_common import setup  # noqa: F401 isort:skip