From 66308320ec2f173c22cf12f6fa40c7a674cea8b3 Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Sat, 7 Feb 2015 10:07:50 -0500 Subject: [PATCH] performance improvement in MultiIndex.sortlevel --- doc/source/whatsnew/v0.16.0.txt | 1 + pandas/core/groupby.py | 21 ++++++++------------- vb_suite/index_object.py | 12 ++++++++++++ 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt index 0234a0dab8e28..8a8e3db83a583 100644 --- a/doc/source/whatsnew/v0.16.0.txt +++ b/doc/source/whatsnew/v0.16.0.txt @@ -175,6 +175,7 @@ Performance - Performance improvement of up to 20x in ``DataFrame.count`` when using a ``MultiIndex`` and the ``level`` keyword argument (:issue:`9163`) - Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`) - Performance improvements in multi-key ``groupby`` (:issue:`9429`) +- Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`) Bug Fixes ~~~~~~~~~ diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py index 0a12484f9ab3a..792c7891053b8 100644 --- a/pandas/core/groupby.py +++ b/pandas/core/groupby.py @@ -3584,21 +3584,15 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels): def _indexer_from_factorized(labels, shape, compress=True): - if _int64_overflow_possible(shape): - indexer = np.lexsort(np.array(labels[::-1])) - return indexer - - group_index = get_group_index(labels, shape, sort=True, xnull=True) + ids = get_group_index(labels, shape, sort=True, xnull=False) - if compress: - comp_ids, obs_ids = _compress_group_index(group_index) - max_group = len(obs_ids) + if not compress: + ngroups = (ids.size and ids.max()) + 1 else: - comp_ids = group_index - max_group = com._long_prod(shape) + ids, obs = _compress_group_index(ids, sort=True) + ngroups = len(obs) - indexer = _get_group_index_sorter(comp_ids.astype(np.int64), max_group) - return indexer + return _get_group_index_sorter(ids, ngroups) def _lexsort_indexer(keys, orders=None, na_position='last'): @@ -3753,7 +3747,8 @@ def _compress_group_index(group_index, sort=True): (comp_ids) into the list of unique labels (obs_group_ids). """ - table = _hash.Int64HashTable(min(1000000, len(group_index))) + size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT) + table = _hash.Int64HashTable(size_hint) group_index = com._ensure_int64(group_index) diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py index 08ad96d1d0427..768eb2658af8f 100644 --- a/vb_suite/index_object.py +++ b/vb_suite/index_object.py @@ -159,3 +159,15 @@ datetime_index_repr = \ Benchmark("dr._is_dates_only", setup, start_date=datetime(2012, 1, 11)) + +setup = common_setup + """ +n = 3 * 5 * 7 * 11 * (1 << 10) +low, high = - 1 << 12, 1 << 12 +f = lambda k: np.repeat(np.random.randint(low, high, n // k), k) + +i = np.random.permutation(n) +mi = MultiIndex.from_arrays([f(11), f(7), f(5), f(3), f(1)])[i] +""" + +multiindex_sortlevel_int64 = Benchmark('mi.sortlevel()', setup, + name='multiindex_sortlevel_int64')