Skip to content

Commit 9b311f4

Browse files
committed
PERF: Avoid values in Categorical.set_categories
Master: ```python In [1]: import pandas as pd; import numpy as np In [2]: arr = ['s%04d' % i for i in np.random.randint(0, 500000 // 10, size=500000)]; s = pd.Series(arr).astype('category') In [3]: %timeit s.cat.set_categories(s.cat.categories) 68.5 ms ± 846 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` HEAD: ```python In [1]: import pandas as pd; import numpy as np In [2]: arr = ['s%04d' % i for i in np.random.randint(0, 500000 // 10, size=500000)]; s = pd.Series(arr).astype('category') In [3]: %timeit s.cat.set_categories(s.cat.categories) 7.43 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` Closes #17508
1 parent f11bbf2 commit 9b311f4

File tree

4 files changed

+101
-2
lines changed

4 files changed

+101
-2
lines changed

asv_bench/benchmarks/categoricals.py

+3
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ def time_value_counts_dropna(self):
6767
def time_rendering(self):
6868
str(self.sel)
6969

70+
def time_set_categories(self):
    """Time ``set_categories`` when keeping every second existing category."""
    every_other = self.ts.cat.categories[::2]
    self.ts.cat.set_categories(every_other)
72+
7073

7174
class Categoricals3(object):
7275
goal_time = 0.2

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -414,6 +414,7 @@ Performance Improvements
414414

415415
- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`)
416416
- :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`)
417+
- Improved performance of :meth:`Categorical.set_categories` by not materializing the values (:issue:`17508`)
417418
- :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`)
418419

419420
.. _whatsnew_0210.bug_fixes:

pandas/core/categorical.py

+35-2
Original file line numberDiff line numberDiff line change
@@ -771,8 +771,9 @@ def set_categories(self, new_categories, ordered=None, rename=False,
771771
# remove all _codes which are larger and set to -1/NaN
772772
self._codes[self._codes >= len(new_categories)] = -1
773773
else:
774-
values = cat.__array__()
775-
cat._codes = _get_codes_for_values(values, new_categories)
774+
codes = _recode_for_categories(self.codes, self.categories,
775+
new_categories)
776+
cat._codes = codes
776777
cat._categories = new_categories
777778

778779
if ordered is None:
@@ -2107,6 +2108,38 @@ def _get_codes_for_values(values, categories):
21072108
return coerce_indexer_dtype(t.lookup(vals), cats)
21082109

21092110

2111+
def _recode_for_categories(codes, old_categories, new_categories):
    """
    Convert a set of codes to a new set of categories

    Parameters
    ----------
    codes : array
    old_categories, new_categories : Index

    Returns
    -------
    new_codes : array
        Always a newly allocated array; ``codes`` itself is neither
        modified nor returned.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> _recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    if len(old_categories) == 0:
        # All null anyway, so just retain the nulls. Hand back a copy so the
        # result never aliases the caller's array, matching the other path.
        return codes.copy()
    # mapping[i] is the position of old category ``i`` inside
    # ``new_categories``, or -1 when that category no longer exists.
    mapping = new_categories.get_indexer_for(old_categories)
    mapping = coerce_indexer_dtype(mapping, new_categories)
    new_codes = mapping[codes]
    # A code of -1 indexed ``mapping`` from the end; restore the
    # missing-value marker for those positions.
    new_codes[codes == -1] = -1
    return new_codes
2141+
2142+
21102143
def _convert_to_list_like(list_like):
21112144
if hasattr(list_like, "dtype"):
21122145
return list_like

pandas/tests/test_categorical.py

+62
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
Interval, IntervalIndex)
2727
from pandas.compat import range, lrange, u, PY3, PYPY
2828
from pandas.core.config import option_context
29+
from pandas.core.categorical import _recode_for_categories
2930

3031

3132
class TestCategorical(object):
@@ -963,6 +964,67 @@ def test_rename_categories(self):
963964
with pytest.raises(ValueError):
964965
cat.rename_categories([1, 2])
965966

967+
@pytest.mark.parametrize('codes, old, new, expected', [
    ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]),
    ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]),
    ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]),
    ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]),
    ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]),
    ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]),
    ([-1, -1], [], ['a', 'b'], [-1, -1]),
    ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]),
])
def test_recode_to_categories(self, codes, old, new, expected):
    """Check int8 codes are remapped from ``old`` to ``new`` categories."""
    result = _recode_for_categories(
        np.asanyarray(codes, dtype=np.int8), Index(old), Index(new))
    tm.assert_numpy_array_equal(
        result, np.asanyarray(expected, dtype=np.int8))
988+
989+
def test_recode_to_categories_large(self):
    """Recode 1000 categories into reversed order; int16 codes expected."""
    size = 1000
    ascending = np.arange(size)
    descending = np.arange(size - 1, -1, -1, dtype=np.int16)
    result = _recode_for_categories(ascending, Index(ascending),
                                    Index(descending))
    tm.assert_numpy_array_equal(result, descending)
997+
998+
@pytest.mark.parametrize('values, categories, new_categories', [
    # No NaNs, same cats, same order
    (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
    # No NaNs, same cats, different order
    (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
    # Same, unsorted
    (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
    # Unsorted, different order
    (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
    # NaNs
    (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
    (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
    (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
    (['b', 'a', 'c'], ['a', 'b'], ['b', 'a']),
    # Introduce NaNs
    (['a', 'b', 'c'], ['a', 'b'], ['a']),
    (['a', 'b', 'c'], ['a', 'b'], ['b']),
    (['b', 'a', 'c'], ['a', 'b'], ['a']),
    (['b', 'a', 'c'], ['a', 'b'], ['b']),
    # No overlap
    (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
])
@pytest.mark.parametrize('ordered', [True, False])
def test_set_categories_many(self, values, categories, new_categories,
                             ordered):
    """
    ``set_categories`` must match building the Categorical directly.

    Each case builds a Categorical with ``categories``, switches it to
    ``new_categories`` and compares against a Categorical constructed
    from the same values with ``new_categories`` from the start.
    """
    c = Categorical(values, categories)
    expected = Categorical(values, new_categories, ordered=ordered)
    result = c.set_categories(new_categories, ordered=ordered)
    tm.assert_categorical_equal(result, expected)
1027+
9661028
def test_reorder_categories(self):
9671029
cat = Categorical(["a", "b", "c", "a"], ordered=True)
9681030
old = cat.copy()

0 commit comments

Comments
 (0)