Skip to content

Commit 6630832

Browse files
committed
performance improvement in MultiIndex.sortlevel
1 parent: 107cb10; commit: 6630832

File tree

3 files changed

+21
-13
lines changed

3 files changed

+21
-13
lines changed

doc/source/whatsnew/v0.16.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -175,6 +175,7 @@ Performance
175175
- Performance improvement of up to 20x in ``DataFrame.count`` when using a ``MultiIndex`` and the ``level`` keyword argument (:issue:`9163`)
176176
- Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`)
177177
- Performance improvements in multi-key ``groupby`` (:issue:`9429`)
178+
- Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`)
178179

179180
Bug Fixes
180181
~~~~~~~~~

pandas/core/groupby.py

+8-13
Original file line number | Diff line number | Diff line change
@@ -3584,21 +3584,15 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
35843584

35853585

35863586
def _indexer_from_factorized(labels, shape, compress=True):
3587-
if _int64_overflow_possible(shape):
3588-
indexer = np.lexsort(np.array(labels[::-1]))
3589-
return indexer
3590-
3591-
group_index = get_group_index(labels, shape, sort=True, xnull=True)
3587+
ids = get_group_index(labels, shape, sort=True, xnull=False)
35923588

3593-
if compress:
3594-
comp_ids, obs_ids = _compress_group_index(group_index)
3595-
max_group = len(obs_ids)
3589+
if not compress:
3590+
ngroups = (ids.size and ids.max()) + 1
35963591
else:
3597-
comp_ids = group_index
3598-
max_group = com._long_prod(shape)
3592+
ids, obs = _compress_group_index(ids, sort=True)
3593+
ngroups = len(obs)
35993594

3600-
indexer = _get_group_index_sorter(comp_ids.astype(np.int64), max_group)
3601-
return indexer
3595+
return _get_group_index_sorter(ids, ngroups)
36023596

36033597

36043598
def _lexsort_indexer(keys, orders=None, na_position='last'):
@@ -3753,7 +3747,8 @@ def _compress_group_index(group_index, sort=True):
37533747
(comp_ids) into the list of unique labels (obs_group_ids).
37543748
"""
37553749

3756-
table = _hash.Int64HashTable(min(1000000, len(group_index)))
3750+
size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT)
3751+
table = _hash.Int64HashTable(size_hint)
37573752

37583753
group_index = com._ensure_int64(group_index)
37593754

vb_suite/index_object.py

+12
Original file line number | Diff line number | Diff line change
@@ -159,3 +159,15 @@
159159
datetime_index_repr = \
160160
Benchmark("dr._is_dates_only", setup,
161161
start_date=datetime(2012, 1, 11))
162+
163+
setup = common_setup + """
164+
n = 3 * 5 * 7 * 11 * (1 << 10)
165+
low, high = - 1 << 12, 1 << 12
166+
f = lambda k: np.repeat(np.random.randint(low, high, n // k), k)
167+
168+
i = np.random.permutation(n)
169+
mi = MultiIndex.from_arrays([f(11), f(7), f(5), f(3), f(1)])[i]
170+
"""
171+
172+
multiindex_sortlevel_int64 = Benchmark('mi.sortlevel()', setup,
173+
name='multiindex_sortlevel_int64')

0 commit comments

Comments
 (0)