PERF: improves performance in remove_unused_categories

behzadnouri · jreback · commit 7031e5f11155 · 2015-11-18T22:12:27.000-05:00
diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt
@@ -133,6 +133,7 @@ Performance Improvements
 - Improved performance of ``rolling_median`` (:issue:`11450`)
 - Improved performance to ``to_excel`` (:issue:`11352`)
 - Performance bug in repr of ``Categorical`` categories, which was rendering the strings before chopping them for display (:issue:`11305`)
+- Performance improvement in ``Categorical.remove_unused_categories`` (:issue:`11643`)
 - Improved performance of ``Series`` constructor with no data and ``DatetimeIndex`` (:issue:`11433`)
 
 - Improved performance of ``shift``, ``cumprod``, and ``cumsum`` with groupby (:issue:`4095`)
@@ -188,5 +189,5 @@ Bug Fixes
 - Bug in ``DataFrame.join()`` with ``how='right'`` producing a ``TypeError`` (:issue:`11519`)
 - Bug in ``Series.quantile`` with empty list results has ``Index`` with ``object`` dtype (:issue:`11588`)
 - Bug in ``pd.merge`` results in empty ``Int64Index`` rather than ``Index(dtype=object)`` when the merge result is empty (:issue:`11588`)
-- Bug in ``remove_unused_categories`` when having ``NaN`` values (:issue:`11599`).
+- Bug in ``Categorical.remove_unused_categories`` when having ``NaN`` values (:issue:`11599`)
 - Bug in ``DataFrame.to_sparse()`` loses column names for MultiIndexes (:issue:`11600`)
diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py
@@ -816,16 +816,14 @@ def remove_unused_categories(self, inplace=False):
         set_categories
         """
         cat = self if inplace else self.copy()
-        _used = sorted(np.unique(cat._codes))
-        if _used[0] == -1:
-            _used = _used[1:]
+        idx, inv = np.unique(cat._codes, return_inverse=True)
 
-        new_categories = cat.categories.take(_ensure_platform_int(_used))
+        if idx.size != 0 and idx[0] == -1:  # na sentinel
+            idx, inv = idx[1:], inv - 1
+
+        cat._codes = inv
+        cat._categories = cat.categories.take(idx)
 
-        from pandas.core.index import _ensure_index
-        new_categories = _ensure_index(new_categories)
-        cat._codes = _get_codes_for_values(cat.__array__(), new_categories)
-        cat._categories = new_categories
         if not inplace:
             return cat
 
diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py
@@ -850,6 +850,21 @@ def test_remove_unused_categories(self):
         self.assert_numpy_array_equal(res.categories, np.array(["a","b","c"]))
         self.assert_numpy_array_equal(c.categories, exp_categories_all)
 
+        val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan]
+        cat = pd.Categorical(values=val, categories=list('ABCDEFG'))
+        out = cat.remove_unused_categories()
+        self.assert_numpy_array_equal(out.categories, ['B', 'D', 'F'])
+        self.assert_numpy_array_equal(out.codes, [ 2, -1,  1,  0,  1,  2, -1])
+        self.assertEqual(out.get_values().tolist(), val)
+
+        alpha = list('abcdefghijklmnopqrstuvwxyz')
+        val = np.random.choice(alpha[::2], 10000).astype('object')
+        val[np.random.choice(len(val), 100)] = np.nan
+
+        cat = pd.Categorical(values=val, categories=alpha)
+        out = cat.remove_unused_categories()
+        self.assertEqual(out.get_values().tolist(), val.tolist())
+
     def test_nan_handling(self):
 
         # Nans are represented as -1 in codes