Skip to content

Commit b714262

Browse files
committed
Merge pull request #9445 from behzadnouri/i8sort
PERF: performance improvement in MultiIndex.sortlevel
2 parents fc88ab9 + 6630832 commit b714262

File tree

3 files changed

+21
-13
lines changed

3 files changed

+21
-13
lines changed

doc/source/whatsnew/v0.16.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -175,6 +175,7 @@ Performance
175 175
- Performance improvement of up to 20x in ``DataFrame.count`` when using a ``MultiIndex`` and the ``level`` keyword argument (:issue:`9163`)
176 176
- Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`)
177 177
- Performance improvements in multi-key ``groupby`` (:issue:`9429`)
178+
- Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`)
178 179

179 180
Bug Fixes
180 181
~~~~~~~~~

pandas/core/groupby.py

+8 -13
Original file line number | Diff line number | Diff line change
@@ -3581,21 +3581,15 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
35813581

35823582

35833583
def _indexer_from_factorized(labels, shape, compress=True):
3584-
if _int64_overflow_possible(shape):
3585-
indexer = np.lexsort(np.array(labels[::-1]))
3586-
return indexer
3587-
3588-
group_index = get_group_index(labels, shape, sort=True, xnull=True)
3584+
ids = get_group_index(labels, shape, sort=True, xnull=False)
35893585

3590-
if compress:
3591-
comp_ids, obs_ids = _compress_group_index(group_index)
3592-
max_group = len(obs_ids)
3586+
if not compress:
3587+
ngroups = (ids.size and ids.max()) + 1
35933588
else:
3594-
comp_ids = group_index
3595-
max_group = com._long_prod(shape)
3589+
ids, obs = _compress_group_index(ids, sort=True)
3590+
ngroups = len(obs)
35963591

3597-
indexer = _get_group_index_sorter(comp_ids.astype(np.int64), max_group)
3598-
return indexer
3592+
return _get_group_index_sorter(ids, ngroups)
35993593

36003594

36013595
def _lexsort_indexer(keys, orders=None, na_position='last'):
@@ -3750,7 +3744,8 @@ def _compress_group_index(group_index, sort=True):
37503744
(comp_ids) into the list of unique labels (obs_group_ids).
37513745
"""
37523746

3753-
table = _hash.Int64HashTable(min(1000000, len(group_index)))
3747+
size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT)
3748+
table = _hash.Int64HashTable(size_hint)
37543749

37553750
group_index = com._ensure_int64(group_index)
37563751

vb_suite/index_object.py

+12
Original file line number | Diff line number | Diff line change
@@ -159,3 +159,15 @@
159159
datetime_index_repr = \
160160
Benchmark("dr._is_dates_only", setup,
161161
start_date=datetime(2012, 1, 11))
162+
163+
setup = common_setup + """
164+
n = 3 * 5 * 7 * 11 * (1 << 10)
165+
low, high = - 1 << 12, 1 << 12
166+
f = lambda k: np.repeat(np.random.randint(low, high, n // k), k)
167+
168+
i = np.random.permutation(n)
169+
mi = MultiIndex.from_arrays([f(11), f(7), f(5), f(3), f(1)])[i]
170+
"""
171+
172+
multiindex_sortlevel_int64 = Benchmark('mi.sortlevel()', setup,
173+
name='multiindex_sortlevel_int64')

0 commit comments

Comments
 (0)