From 66308320ec2f173c22cf12f6fa40c7a674cea8b3 Mon Sep 17 00:00:00 2001
From: behzad nouri <behzadnouri@gmail.com>
Date: Sat, 7 Feb 2015 10:07:50 -0500
Subject: [PATCH] performance improvement in MultiIndex.sortlevel

---
 doc/source/whatsnew/v0.16.0.txt |  1 +
 pandas/core/groupby.py          | 21 ++++++++-------------
 vb_suite/index_object.py        | 12 ++++++++++++
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/doc/source/whatsnew/v0.16.0.txt b/doc/source/whatsnew/v0.16.0.txt
index 0234a0dab8e28..8a8e3db83a583 100644
--- a/doc/source/whatsnew/v0.16.0.txt
+++ b/doc/source/whatsnew/v0.16.0.txt
@@ -175,6 +175,7 @@ Performance
 - Performance improvement of up to 20x in ``DataFrame.count`` when using a ``MultiIndex`` and the ``level`` keyword argument  (:issue:`9163`)
 - Performance and memory usage improvements in ``merge`` when key space exceeds ``int64`` bounds (:issue:`9151`)
 - Performance improvements in multi-key ``groupby`` (:issue:`9429`)
+- Performance improvements in ``MultiIndex.sortlevel`` (:issue:`9445`)
 
 Bug Fixes
 ~~~~~~~~~
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index 0a12484f9ab3a..792c7891053b8 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -3584,21 +3584,15 @@ def decons_obs_group_ids(comp_ids, obs_ids, shape, labels):
 
 
 def _indexer_from_factorized(labels, shape, compress=True):
-    if _int64_overflow_possible(shape):
-        indexer = np.lexsort(np.array(labels[::-1]))
-        return indexer
-
-    group_index = get_group_index(labels, shape, sort=True, xnull=True)
+    ids = get_group_index(labels, shape, sort=True, xnull=False)
 
-    if compress:
-        comp_ids, obs_ids = _compress_group_index(group_index)
-        max_group = len(obs_ids)
+    if not compress:
+        ngroups = (ids.size and ids.max()) + 1
     else:
-        comp_ids = group_index
-        max_group = com._long_prod(shape)
+        ids, obs = _compress_group_index(ids, sort=True)
+        ngroups = len(obs)
 
-    indexer = _get_group_index_sorter(comp_ids.astype(np.int64), max_group)
-    return indexer
+    return _get_group_index_sorter(ids, ngroups)
 
 
 def _lexsort_indexer(keys, orders=None, na_position='last'):
@@ -3753,7 +3747,8 @@ def _compress_group_index(group_index, sort=True):
     (comp_ids) into the list of unique labels (obs_group_ids).
     """
 
-    table = _hash.Int64HashTable(min(1000000, len(group_index)))
+    size_hint = min(len(group_index), _hash._SIZE_HINT_LIMIT)
+    table = _hash.Int64HashTable(size_hint)
 
     group_index = com._ensure_int64(group_index)
 
diff --git a/vb_suite/index_object.py b/vb_suite/index_object.py
index 08ad96d1d0427..768eb2658af8f 100644
--- a/vb_suite/index_object.py
+++ b/vb_suite/index_object.py
@@ -159,3 +159,15 @@
 datetime_index_repr = \
     Benchmark("dr._is_dates_only", setup,
               start_date=datetime(2012, 1, 11))
+
+setup = common_setup + """
+n = 3 * 5 * 7 * 11 * (1 << 10)
+low, high = - 1 << 12, 1 << 12
+f = lambda k: np.repeat(np.random.randint(low, high, n // k), k)
+
+i = np.random.permutation(n)
+mi = MultiIndex.from_arrays([f(11), f(7), f(5), f(3), f(1)])[i]
+"""
+
+multiindex_sortlevel_int64 = Benchmark('mi.sortlevel()', setup,
+                                       name='multiindex_sortlevel_int64')