From c61dd9a982e767dbfaed656a3082dad37c763ba2 Mon Sep 17 00:00:00 2001 From: Pietro Battiston Date: Sun, 15 Apr 2018 11:16:24 +0200 Subject: [PATCH] PERF: enhance MultiIndex.remove_unused_levels when no levels are unused closes #19289 --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/indexes/multi.py | 26 +++++++++++++++++--------- 2 files changed, 18 insertions(+), 9 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index eee0f1997d081..772e80e160579 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -901,6 +901,7 @@ Performance Improvements - :func:`Series` / :func:`DataFrame` tab completion limits to 100 values, for better performance. (:issue:`18587`) - Improved performance of :func:`DataFrame.median` with ``axis=1`` when bottleneck is not installed (:issue:`16468`) - Improved performance of :func:`MultiIndex.get_loc` for large indexes, at the cost of a reduction in performance for small ones (:issue:`18519`) +- Improved performance of :func:`MultiIndex.remove_unused_levels` when there are no unused levels, at the cost of a reduction in performance when there are (:issue:`19289`) - Improved performance of pairwise ``.rolling()`` and ``.expanding()`` with ``.cov()`` and ``.corr()`` operations (:issue:`17917`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` (:issue:`15779`) - Improved performance of variable ``.rolling()`` on ``.min()`` and ``.max()`` (:issue:`19521`) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index d4b9545999bc7..8098f7bb7d246 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -1476,27 +1476,35 @@ def remove_unused_levels(self): changed = False for lev, lab in zip(self.levels, self.labels): - uniques = algos.unique(lab) - na_idx = np.where(uniques == -1)[0] - - # nothing unused - if len(uniques) != len(lev) + len(na_idx): + # Since few levels are typically unused, bincount() is more + # 
efficient than unique() - however it only accepts values >= 0 + # (and drops order): + uniques = np.where(np.bincount(lab + 1) > 0)[0] - 1 + has_na = int(len(uniques) and (uniques[0] == -1)) + + if len(uniques) != len(lev) + has_na: + # We have unused levels changed = True - if len(na_idx): + # Recalculate uniques, now preserving order. + # Can easily be cythonized by exploiting the already existing + # "uniques" and stop parsing "lab" when all items are found: + uniques = algos.unique(lab) + if has_na: + na_idx = np.where(uniques == -1)[0] # Just ensure that -1 is in first position: uniques[[0, na_idx[0]]] = uniques[[na_idx[0], 0]] # labels get mapped from uniques to 0:len(uniques) # -1 (if present) is mapped to last position - label_mapping = np.zeros(len(lev) + len(na_idx)) + label_mapping = np.zeros(len(lev) + has_na) # ... and reassigned value -1: - label_mapping[uniques] = np.arange(len(uniques)) - len(na_idx) + label_mapping[uniques] = np.arange(len(uniques)) - has_na lab = label_mapping[lab] # new levels are simple - lev = lev.take(uniques[len(na_idx):]) + lev = lev.take(uniques[has_na:]) new_levels.append(lev) new_labels.append(lab)