Skip to content

Commit 7031e5f

Browse files
behzadnouri authored and jreback committed
PERF: improves performance in remove_unused_categories
1 parent 207e0ce commit 7031e5f

File tree

3 files changed

+23
-9
lines changed

3 files changed

+23
-9
lines changed

doc/source/whatsnew/v0.17.1.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,7 @@ Performance Improvements
133133
- Improved performance of ``rolling_median`` (:issue:`11450`)
134134
- Improved performance of ``to_excel`` (:issue:`11352`)
135135
- Performance bug in repr of ``Categorical`` categories, which was rendering the strings before chopping them for display (:issue:`11305`)
136+
- Performance improvement in ``Categorical.remove_unused_categories`` (:issue:`11643`)
136137
- Improved performance of ``Series`` constructor with no data and ``DatetimeIndex`` (:issue:`11433`)
137138

138139
- Improved performance of ``shift``, ``cumprod``, and ``cumsum`` with groupby (:issue:`4095`)
@@ -188,5 +189,5 @@ Bug Fixes
188189
- Bug in ``DataFrame.join()`` with ``how='right'`` producing a ``TypeError`` (:issue:`11519`)
189190
- Bug in ``Series.quantile`` where an empty list results in an ``Index`` with ``object`` dtype (:issue:`11588`)
190191
- Bug in ``pd.merge`` results in empty ``Int64Index`` rather than ``Index(dtype=object)`` when the merge result is empty (:issue:`11588`)
191-
- Bug in ``remove_unused_categories`` when having ``NaN`` values (:issue:`11599`).
192+
- Bug in ``Categorical.remove_unused_categories`` when having ``NaN`` values (:issue:`11599`)
192193
- Bug in ``DataFrame.to_sparse()`` loses column names for MultiIndexes (:issue:`11600`)

pandas/core/categorical.py

+6-8
Original file line numberDiff line numberDiff line change
@@ -816,16 +816,14 @@ def remove_unused_categories(self, inplace=False):
816816
set_categories
817817
"""
818818
cat = self if inplace else self.copy()
819-
_used = sorted(np.unique(cat._codes))
820-
if _used[0] == -1:
821-
_used = _used[1:]
819+
idx, inv = np.unique(cat._codes, return_inverse=True)
822820

823-
new_categories = cat.categories.take(_ensure_platform_int(_used))
821+
if idx.size != 0 and idx[0] == -1: # na sentinel
822+
idx, inv = idx[1:], inv - 1
823+
824+
cat._codes = inv
825+
cat._categories = cat.categories.take(idx)
824826

825-
from pandas.core.index import _ensure_index
826-
new_categories = _ensure_index(new_categories)
827-
cat._codes = _get_codes_for_values(cat.__array__(), new_categories)
828-
cat._categories = new_categories
829827
if not inplace:
830828
return cat
831829

pandas/tests/test_categorical.py

+15
Original file line numberDiff line numberDiff line change
@@ -850,6 +850,21 @@ def test_remove_unused_categories(self):
850850
self.assert_numpy_array_equal(res.categories, np.array(["a","b","c"]))
851851
self.assert_numpy_array_equal(c.categories, exp_categories_all)
852852

853+
val = ['F', np.nan, 'D', 'B', 'D', 'F', np.nan]
854+
cat = pd.Categorical(values=val, categories=list('ABCDEFG'))
855+
out = cat.remove_unused_categories()
856+
self.assert_numpy_array_equal(out.categories, ['B', 'D', 'F'])
857+
self.assert_numpy_array_equal(out.codes, [ 2, -1, 1, 0, 1, 2, -1])
858+
self.assertEqual(out.get_values().tolist(), val)
859+
860+
alpha = list('abcdefghijklmnopqrstuvwxyz')
861+
val = np.random.choice(alpha[::2], 10000).astype('object')
862+
val[np.random.choice(len(val), 100)] = np.nan
863+
864+
cat = pd.Categorical(values=val, categories=alpha)
865+
out = cat.remove_unused_categories()
866+
self.assertEqual(out.get_values().tolist(), val.tolist())
867+
853868
def test_nan_handling(self):
854869

855870
# Nans are represented as -1 in codes

0 commit comments

Comments
 (0)