Skip to content

Commit d238e3e

Browse files
committed
PERF: Avoid values in Categorical.set_categories
Master: ```python In [1]: import pandas as pd; import numpy as np In [2]: arr = ['s%04d' % i for i in np.random.randint(0, 500000 // 10, size=500000)]; s = pd.Series(arr).astype('category') In [3]: %timeit s.cat.set_categories(s.cat.categories) 68.5 ms ± 846 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` HEAD: ```python In [1]: import pandas as pd; import numpy as np In [2]: arr = ['s%04d' % i for i in np.random.randint(0, 500000 // 10, size=500000)]; s = pd.Series(arr).astype('category') In [3]: %timeit s.cat.set_categories(s.cat.categories) 7.43 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` Closes #17508
1 parent 97abd2c commit d238e3e

File tree

5 files changed

+104
-10
lines changed

5 files changed

+104
-10
lines changed

asv_bench/benchmarks/categoricals.py

+3
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ def time_value_counts_dropna(self):
6767
def time_rendering(self):
6868
str(self.sel)
6969

70+
def time_set_categories(self):
    # Benchmark set_categories when keeping only every other category.
    every_other = self.ts.cat.categories[::2]
    self.ts.cat.set_categories(every_other)
72+
7073

7174
class Categoricals3(object):
7275
goal_time = 0.2

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -467,6 +467,7 @@ Performance Improvements
467467

468468
- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`)
469469
- :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`)
470+
- Improved performance of :meth:`Categorical.set_categories` by not materializing the values (:issue:`17508`)
470471
- :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`)
471472

472473
.. _whatsnew_0210.bug_fixes:

pandas/core/categorical.py

+35-2
Original file line numberDiff line numberDiff line change
@@ -777,8 +777,9 @@ def set_categories(self, new_categories, ordered=None, rename=False,
777777
# remove all _codes which are larger and set to -1/NaN
778778
self._codes[self._codes >= len(new_categories)] = -1
779779
else:
780-
values = cat.__array__()
781-
cat._codes = _get_codes_for_values(values, new_categories)
780+
codes = _recode_for_categories(self.codes, self.categories,
781+
new_categories)
782+
cat._codes = codes
782783
cat._categories = new_categories
783784

784785
if ordered is None:
@@ -2113,6 +2114,38 @@ def _get_codes_for_values(values, categories):
21132114
return coerce_indexer_dtype(t.lookup(vals), cats)
21142115

21152116

2117+
def _recode_for_categories(codes, old_categories, new_categories):
    """
    Convert a set of codes to a new set of categories.

    Each code is an integer position into ``old_categories``; the result
    holds, for every element, the position of the same category inside
    ``new_categories``.  Codes whose category is absent from
    ``new_categories`` (and codes that were already ``-1``) become ``-1``.

    Parameters
    ----------
    codes : array
    old_categories, new_categories : Index

    Returns
    -------
    new_codes : array
        A new array; the input ``codes`` object is never returned as-is.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> _recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1], dtype=int8)
    """
    from pandas.core.algorithms import take_1d

    if len(old_categories) == 0:
        # All null anyway, so just retain the nulls.  Hand back a copy:
        # set_categories stores the result as the ``_codes`` of the new
        # Categorical, which must not share a buffer with the input.
        return codes.copy()
    # indexer[i] is the position of old category ``i`` in new_categories,
    # or -1 when that category has been dropped.
    indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),
                                   new_categories)
    # ``codes`` is only used as the positions to take, so no defensive
    # copy is needed here; -1 codes are filled with -1.
    new_codes = take_1d(indexer, codes, fill_value=-1)
    return new_codes
2147+
2148+
21162149
def _convert_to_list_like(list_like):
21172150
if hasattr(list_like, "dtype"):
21182151
return list_like

pandas/core/dtypes/concat.py

+3-8
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False):
314314
Categories (3, object): [b, c, a]
315315
"""
316316
from pandas import Index, Categorical, CategoricalIndex, Series
317+
from pandas.core.categorical import _recode_for_categories
317318

318319
if len(to_union) == 0:
319320
raise ValueError('No Categoricals to union')
@@ -359,14 +360,8 @@ def _maybe_unwrap(x):
359360

360361
new_codes = []
361362
for c in to_union:
362-
if len(c.categories) > 0:
363-
indexer = categories.get_indexer(c.categories)
364-
365-
from pandas.core.algorithms import take_1d
366-
new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
367-
else:
368-
# must be all NaN
369-
new_codes.append(c.codes)
363+
new_codes.append(_recode_for_categories(c.codes, c.categories,
364+
categories))
370365
new_codes = np.concatenate(new_codes)
371366
else:
372367
# ordered - to show a proper error message

pandas/tests/test_categorical.py

+62
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
Interval, IntervalIndex)
2727
from pandas.compat import range, lrange, u, PY3, PYPY
2828
from pandas.core.config import option_context
29+
from pandas.core.categorical import _recode_for_categories
2930

3031

3132
class TestCategorical(object):
@@ -963,6 +964,67 @@ def test_rename_categories(self):
963964
with pytest.raises(ValueError):
964965
cat.rename_categories([1, 2])
965966

967+
@pytest.mark.parametrize('codes, old, new, expected', [
    ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]),
    ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]),
    ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]),
    ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]),
    ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]),
    ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]),
    ([-1, -1], [], ['a', 'b'], [-1, -1]),
    ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]),
])
def test_recode_to_categories(self, codes, old, new, expected):
    # Codes and expectations are compared as int8 arrays; the category
    # lists are wrapped in an Index before recoding.
    old_index = Index(old)
    new_index = Index(new)
    int8_codes = np.asanyarray(codes, dtype=np.int8)
    int8_expected = np.asanyarray(expected, dtype=np.int8)
    recoded = _recode_for_categories(int8_codes, old_index, new_index)
    tm.assert_numpy_array_equal(recoded, int8_expected)
988+
989+
def test_recode_to_categories_large(self):
    # 1000 categories do not fit in int8, so the reversed mapping is
    # expected back as int16.
    size = 1000
    forward = np.arange(size)
    backward = np.arange(size - 1, -1, -1, dtype=np.int16)
    old_index = Index(forward)
    new_index = Index(backward)
    recoded = _recode_for_categories(forward, old_index, new_index)
    tm.assert_numpy_array_equal(recoded, backward)
997+
998+
@pytest.mark.parametrize('values, categories, new_categories', [
    # No NaNs, same cats, same order
    (['a', 'b', 'a'], ['a', 'b'], ['a', 'b']),
    # No NaNs, same cats, different order
    (['a', 'b', 'a'], ['a', 'b'], ['b', 'a']),
    # Unsorted values, same cats, same order
    (['b', 'a', 'a'], ['a', 'b'], ['a', 'b']),
    # Unsorted values, same cats, different order
    (['b', 'a', 'a'], ['a', 'b'], ['b', 'a']),
    # NaNs ('c' is not a category)
    (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
    (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
    (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
    # was an exact duplicate of the previous case; cover reversed order
    (['b', 'a', 'c'], ['a', 'b'], ['b', 'a']),
    # Introduce NaNs
    (['a', 'b', 'c'], ['a', 'b'], ['a']),
    (['a', 'b', 'c'], ['a', 'b'], ['b']),
    (['b', 'a', 'c'], ['a', 'b'], ['a']),
    # was an exact duplicate of the previous case; cover keeping only 'b'
    (['b', 'a', 'c'], ['a', 'b'], ['b']),
    # No overlap
    (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
])
@pytest.mark.parametrize('ordered', [True, False])
def test_set_categories_many(self, values, categories, new_categories,
                             ordered):
    # set_categories on an existing Categorical must match building a
    # Categorical from the raw values with the new categories directly.
    c = Categorical(values, categories)
    expected = Categorical(values, new_categories, ordered)
    result = c.set_categories(new_categories, ordered=ordered)
    tm.assert_categorical_equal(result, expected)
1027+
9661028
def test_reorder_categories(self):
9671029
cat = Categorical(["a", "b", "c", "a"], ordered=True)
9681030
old = cat.copy()

0 commit comments

Comments
 (0)