From a7e644e779a418289de0db42ca6d2d47c88c04b0 Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Wed, 23 Sep 2015 18:26:43 -0400 Subject: [PATCH] improves groupby.get_group_index when shape is a long sequence --- asv_bench/benchmarks/frame_methods.py | 10 +++++++++- doc/source/whatsnew/v0.17.0.txt | 1 + pandas/core/groupby.py | 11 +++++++++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 98b0ec73fb23c..9bece56e15c90 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -293,6 +293,14 @@ def setup(self): def time_frame_duplicated(self): self.df.duplicated() +class frame_duplicated_wide(object): + goal_time = 0.2 + + def setup(self): + self.df = DataFrame(np.random.randn(1000, 100).astype(str)) + + def time_frame_duplicated_wide(self): + self.df.T.duplicated() class frame_fancy_lookup(object): goal_time = 0.2 @@ -929,4 +937,4 @@ def setup(self): self.s = Series((['abcdefg', np.nan] * 500000)) def time_series_string_vector_slice(self): - self.s.str[:5] \ No newline at end of file + self.s.str[:5] diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt index bf3dfa227732e..ca2c7dfc36353 100644 --- a/doc/source/whatsnew/v0.17.0.txt +++ b/doc/source/whatsnew/v0.17.0.txt @@ -1044,6 +1044,7 @@ Performance Improvements - Performance improvements in ``Categorical.value_counts`` (:issue:`10804`) - Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` and ``SeriesGroupby.transform`` (:issue:`10820`, :issue:`11077`) - Performance improvements in ``DataFrame.drop_duplicates`` with integer dtypes (:issue:`10917`) +- Performance improvements in ``DataFrame.duplicated`` with wide frames. (:issue:`11180`) - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`) - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`) - Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`) diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index c486c414081f2..e72f7c6c6a6bf 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3739,10 +3739,17 @@ def get_group_index(labels, shape, sort, xnull): An array of type int64 where two elements are equal if their corresponding labels are equal at all location. """ + def _int64_cut_off(shape): + acc = long(1) + for i, mul in enumerate(shape): + acc *= long(mul) + if not acc < _INT64_MAX: + return i + return len(shape) + def loop(labels, shape): # how many levels can be done without overflow: - pred = lambda i: not _int64_overflow_possible(shape[:i]) - nlev = next(filter(pred, range(len(shape), 0, -1))) + nlev = _int64_cut_off(shape) # compute flat ids for the first `nlev` levels stride = np.prod(shape[1:nlev], dtype='i8')