From 7efefb98c07c524cd5e5363eca889849abd3091c Mon Sep 17 00:00:00 2001 From: DataOmbudsman Date: Thu, 31 May 2018 13:45:23 +0200 Subject: [PATCH 1/6] PERF: improve performance of NDFrame.describe Calculating percentiles in one pass is faster than separately. --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9e4eda1bc4dc7..2adc15651ffca 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8519,7 +8519,7 @@ def describe_numeric_1d(series): stat_index = (['count', 'mean', 'std', 'min'] + formatted_percentiles + ['max']) d = ([series.count(), series.mean(), series.std(), series.min()] + - [series.quantile(x) for x in percentiles] + [series.max()]) + series.quantile(percentiles).tolist() + [series.max()]) return pd.Series(d, index=stat_index, name=series.name) def describe_categorical_1d(data): From 7e3ad1241b58d88a637d50b666865e61709408c1 Mon Sep 17 00:00:00 2001 From: DataOmbudsman Date: Mon, 4 Jun 2018 12:31:16 +0200 Subject: [PATCH 2/6] Add ASV benchmark --- asv_bench/benchmarks/frame_methods.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 4ff71c706cd34..89e762ab0c9a9 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -512,3 +512,21 @@ def time_nlargest(self, keep): def time_nsmallest(self, keep): self.df.nsmallest(100, 'A', keep=keep) + +class Describe(object): + + goal_time = 0.2 + + def setup(self): + np.random.seed(123) + self.df = DataFrame({ + 'a': np.random.randint(0, 100, int(1e6)), + 'b': np.random.randint(0, 100, int(1e6)), + 'c': np.random.randint(0, 100, int(1e6)) + }) + + def time_series_describe(self): + self.df['a'].describe() + + def time_dataframe_describe(self): + self.df.describe() From 70668a19291d00f0e970f2da6b91ee014b03e370 Mon Sep 17 00:00:00 2001 From: DataOmbudsman Date: Mon, 4 
Jun 2018 12:52:28 +0200 Subject: [PATCH 3/6] Add whatsnew entry --- doc/source/whatsnew/v0.24.0.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 6cbc19cca99e1..bd21978e32759 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -63,8 +63,7 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- -- +- Improved performance of :func:`Series.describe` in case of numeric dtpyes - .. _whatsnew_0240.docs: From b866d0f10590bdf1143972aedac38641d51807f3 Mon Sep 17 00:00:00 2001 From: DataOmbudsman Date: Mon, 4 Jun 2018 13:34:47 +0200 Subject: [PATCH 4/6] Add blank line for pep8 --- asv_bench/benchmarks/frame_methods.py | 1 + 1 file changed, 1 insertion(+) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 89e762ab0c9a9..4b3d25123492e 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -513,6 +513,7 @@ def time_nlargest(self, keep): def time_nsmallest(self, keep): self.df.nsmallest(100, 'A', keep=keep) + class Describe(object): goal_time = 0.2 From 31216c80cd92108f9bf3adde886f7cf49d2c7184 Mon Sep 17 00:00:00 2001 From: DataOmbudsman Date: Tue, 5 Jun 2018 09:05:03 +0200 Subject: [PATCH 5/6] remove random seed --- asv_bench/benchmarks/frame_methods.py | 1 - 1 file changed, 1 deletion(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 4b3d25123492e..12e4824b2dd2a 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -519,7 +519,6 @@ class Describe(object): goal_time = 0.2 def setup(self): - np.random.seed(123) self.df = DataFrame({ 'a': np.random.randint(0, 100, int(1e6)), 'b': np.random.randint(0, 100, int(1e6)), From 6dda68e98b5ae1e522293c3a9fba8b23dceb3bef Mon Sep 17 00:00:00 2001 From: DataOmbudsman Date: Tue, 5 Jun 2018 
14:22:21 +0200 Subject: [PATCH 6/6] Add issue (PR) number --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index bd21978e32759..c69de149a0f35 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -63,7 +63,7 @@ Removal of prior version deprecations/changes Performance Improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Improved performance of :func:`Series.describe` in case of numeric dtpyes +- Improved performance of :func:`Series.describe` in case of numeric dtypes (:issue:`21274`) - .. _whatsnew_0240.docs: