Skip to content

Commit 3fb802a

Browse files
behzadnouri authored and jreback committed
PERF: improves groupby.get_group_index when shape is a long sequence, #10161
1 parent 6ea3389 commit 3fb802a

File tree

3 files changed

+19
-3
lines changed

3 files changed

+19
-3
lines changed

asv_bench/benchmarks/frame_methods.py

+9-1
Original file line number | Diff line number | Diff line change
@@ -293,6 +293,14 @@ def setup(self):
293293
def time_frame_duplicated(self):
294294
self.df.duplicated()
295295

296+
class frame_duplicated_wide(object):
297+
goal_time = 0.2
298+
299+
def setup(self):
300+
self.df = DataFrame(np.random.randn(1000, 100).astype(str))
301+
302+
def time_frame_duplicated_wide(self):
303+
self.df.T.duplicated()
296304

297305
class frame_fancy_lookup(object):
298306
goal_time = 0.2
@@ -929,4 +937,4 @@ def setup(self):
929937
self.s = Series((['abcdefg', np.nan] * 500000))
930938

931939
def time_series_string_vector_slice(self):
932-
self.s.str[:5]
940+
self.s.str[:5]

doc/source/whatsnew/v0.17.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -1045,6 +1045,7 @@ Performance Improvements
10451045
- Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
10461046
- Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` and ``SeriesGroupby.transform`` (:issue:`10820`, :issue:`11077`)
10471047
- Performance improvements in ``DataFrame.drop_duplicates`` with integer dtypes (:issue:`10917`)
1048+
- Performance improvements in ``DataFrame.duplicated`` with wide frames (:issue:`10161`, :issue:`11180`)
10481049
- 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
10491050
- 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
10501051
- Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)

pandas/core/groupby.py

+9-2
Original file line number | Diff line number | Diff line change
@@ -3739,10 +3739,17 @@ def get_group_index(labels, shape, sort, xnull):
37393739
An array of type int64 where two elements are equal if their corresponding
37403740
labels are equal at all locations.
37413741
"""
3742+
def _int64_cut_off(shape):
3743+
acc = long(1)
3744+
for i, mul in enumerate(shape):
3745+
acc *= long(mul)
3746+
if not acc < _INT64_MAX:
3747+
return i
3748+
return len(shape)
3749+
37423750
def loop(labels, shape):
37433751
# how many levels can be done without overflow:
3744-
pred = lambda i: not _int64_overflow_possible(shape[:i])
3745-
nlev = next(filter(pred, range(len(shape), 0, -1)))
3752+
nlev = _int64_cut_off(shape)
37463753

37473754
# compute flat ids for the first `nlev` levels
37483755
stride = np.prod(shape[1:nlev], dtype='i8')

0 commit comments

Comments
 (0)