Skip to content

Commit 8c740fd

Browse files
committed
PERF: Avoid values in Categorical.set_categories
Master: ```python In [1]: import pandas as pd; import numpy as np In [2]: arr = ['s%04d' % i for i in np.random.randint(0, 500000 // 10, size=500000)]; s = pd.Series(arr).astype('category') In [3]: %timeit s.cat.set_categories(s.cat.categories) 68.5 ms ± 846 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) ``` HEAD: ```python In [1]: import pandas as pd; import numpy as np In [2]: arr = ['s%04d' % i for i in np.random.randint(0, 500000 // 10, size=500000)]; s = pd.Series(arr).astype('category') In [3]: %timeit s.cat.set_categories(s.cat.categories) 7.43 ms ± 110 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) ``` Closes pandas-dev#17508 (cherry picked from commit 9b311f4)
1 parent 35ccad0 commit 8c740fd

File tree

5 files changed

+104
-3
lines changed

5 files changed

+104
-3
lines changed

asv_bench/benchmarks/categoricals.py

+3
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@ def time_value_counts_dropna(self):
6767
def time_rendering(self):
6868
str(self.sel)
6969

70+
def time_set_categories(self):
    """Benchmark ``set_categories`` using every second existing category."""
    every_other = self.ts.cat.categories[::2]
    self.ts.cat.set_categories(every_other)
72+
7073

7174
class Categoricals3(object):
7275
goal_time = 0.2

doc/source/merging.rst

+3-1
Original file line numberDiff line numberDiff line change
@@ -842,9 +842,11 @@ The right frame.
842842

843843
.. ipython:: python
844844
845+
from pandas.api.types import CategoricalDtype
846+
845847
right = pd.DataFrame({
846848
'X': pd.Series(['foo', 'bar'],
847-
dtype=pd.api.types.CategoricalDtype(['foo', 'bar'])),
849+
dtype=CategoricalDtype(['foo', 'bar'])),
848850
'Z': [1, 2]
849851
})
850852
right

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,7 @@ Performance Improvements
440440

441441
- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`)
442442
- :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`)
443+
- Improved performance of :meth:`Categorical.set_categories` by not materializing the values (:issue:`17508`)
443444
- :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`)
444445

445446
.. _whatsnew_0210.bug_fixes:

pandas/core/categorical.py

+35-2
Original file line numberDiff line numberDiff line change
@@ -752,8 +752,9 @@ def set_categories(self, new_categories, ordered=None, rename=False,
752752
# remove all _codes which are larger and set to -1/NaN
753753
self._codes[self._codes >= len(new_dtype.categories)] = -1
754754
else:
755-
values = cat.__array__()
756-
cat._codes = _get_codes_for_values(values, new_dtype.categories)
755+
codes = _recode_for_categories(self.codes, self.categories,
756+
new_dtype.categories)
757+
cat._codes = codes
757758
cat._dtype = new_dtype
758759

759760
if not inplace:
@@ -2095,6 +2096,38 @@ def _get_codes_for_values(values, categories):
20952096
return coerce_indexer_dtype(t.lookup(vals), cats)
20962097

20972098

2099+
def _recode_for_categories(codes, old_categories, new_categories):
    """
    Convert a set of codes to a new set of categories.

    Parameters
    ----------
    codes : array
        Integer codes pointing into ``old_categories``; -1 marks a
        missing value.
    old_categories, new_categories : Index

    Returns
    -------
    new_codes : array
        Codes pointing into ``new_categories``. Values whose old category
        is absent from ``new_categories`` become -1. The result is always
        a newly allocated array; ``codes`` is neither modified nor
        returned directly.

    Examples
    --------
    >>> old_cat = pd.Index(['b', 'a', 'c'])
    >>> new_cat = pd.Index(['a', 'b'])
    >>> codes = np.array([0, 1, 1, 2])
    >>> _recode_for_categories(codes, old_cat, new_cat)
    array([ 1,  0,  0, -1])
    """
    if len(old_categories) == 0:
        # All null anyway, so just retain the nulls. Return a copy so the
        # result never shares its buffer with the caller's codes.
        return codes.copy()
    # mapping[i] is the position of old category i inside new_categories,
    # or -1 when that category does not exist there.
    mapping = new_categories.get_indexer_for(old_categories)
    mapping = coerce_indexer_dtype(mapping, new_categories)
    # Integer-array indexing allocates a fresh array, so no explicit copy
    # of ``codes`` is required beforehand.
    new_codes = mapping[codes]
    # A -1 code wrapped around to the last mapping entry in the lookup
    # above; put the missing marker back.
    new_codes[codes == -1] = -1
    return new_codes
2129+
2130+
20982131
def _convert_to_list_like(list_like):
20992132
if hasattr(list_like, "dtype"):
21002133
return list_like

pandas/tests/test_categorical.py

+62
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626
Interval, IntervalIndex)
2727
from pandas.compat import range, lrange, u, PY3, PYPY
2828
from pandas.core.config import option_context
29+
from pandas.core.categorical import _recode_for_categories
2930

3031

3132
class TestCategorical(object):
@@ -1063,6 +1064,67 @@ def test_rename_categories(self):
10631064
with pytest.raises(ValueError):
10641065
cat.rename_categories([1, 2])
10651066

1067+
@pytest.mark.parametrize('codes, old, new, expected', [
    ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]),
    ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]),
    ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]),
    ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]),
    ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]),
    ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]),
    ([-1, -1], [], ['a', 'b'], [-1, -1]),
    ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]),
])
def test_recode_to_categories(self, codes, old, new, expected):
    """Check ``_recode_for_categories`` against hand-written int8 cases."""
    old_index, new_index = Index(old), Index(new)
    input_codes = np.asanyarray(codes, dtype=np.int8)
    wanted = np.asanyarray(expected, dtype=np.int8)
    recoded = _recode_for_categories(input_codes, old_index, new_index)
    tm.assert_numpy_array_equal(recoded, wanted)
1088+
1089+
def test_recode_to_categories_large(self):
    """Recode 1000 integer categories into their reversed order."""
    size = 1000
    ascending = np.arange(size)
    # The expected codes are built as int16 and double as the new categories.
    descending = np.arange(size - 1, -1, -1, dtype=np.int16)
    recoded = _recode_for_categories(ascending,
                                     Index(ascending),
                                     Index(descending))
    tm.assert_numpy_array_equal(recoded, descending)
1097+
1098+
@pytest.mark.parametrize('values, categories, new_categories', [
    # No NaNs, same cats, same order
    (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
    # No NaNs, same cats, different order
    (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
    # Same, unsorted
    (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
    # Same, unsorted, different order
    (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
    # NaNs
    (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
    (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
    (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
    # Previously an exact duplicate of the row above; cover the reordered
    # categories for the unsorted values instead.
    (['b', 'a', 'c'], ['a', 'b'], ['b', 'a']),
    # Introduce NaNs
    (['a', 'b', 'c'], ['a', 'b'], ['a']),
    (['a', 'b', 'c'], ['a', 'b'], ['b']),
    (['b', 'a', 'c'], ['a', 'b'], ['a']),
    # Previously an exact duplicate of the row above; cover keeping only
    # 'b' for the unsorted values instead.
    (['b', 'a', 'c'], ['a', 'b'], ['b']),
    # No overlap
    (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
])
@pytest.mark.parametrize('ordered', [True, False])
def test_set_categories_many(self, values, categories, new_categories,
                             ordered):
    """
    ``set_categories`` must match building the Categorical directly.

    The result of calling ``set_categories(new_categories, ordered=...)``
    on a Categorical built from ``values`` and ``categories`` is compared
    with a Categorical constructed from ``values``, ``new_categories`` and
    ``ordered`` in one step.
    """
    c = Categorical(values, categories)
    expected = Categorical(values, new_categories, ordered)
    result = c.set_categories(new_categories, ordered=ordered)
    tm.assert_categorical_equal(result, expected)
1127+
10661128
def test_reorder_categories(self):
10671129
cat = Categorical(["a", "b", "c", "a"], ordered=True)
10681130
old = cat.copy()

0 commit comments

Comments
 (0)