Skip to content

PERF: Avoid materializing values in Categorical.set_categories #17515

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 14, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions asv_bench/benchmarks/categoricals.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,9 @@ def time_value_counts_dropna(self):
def time_rendering(self):
    """Benchmark building the string form of ``self.sel``."""
    target = self.sel
    # The rendered text is discarded; only the cost of producing it matters.
    str(target)

def time_set_categories(self):
    """Benchmark ``set_categories`` using every other existing category."""
    series = self.ts
    # Keep only the categories at even positions of the current index.
    kept = series.cat.categories[::2]
    series.cat.set_categories(kept)


class Categoricals3(object):
goal_time = 0.2
Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.21.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -467,6 +467,7 @@ Performance Improvements

- Improved performance of instantiating :class:`SparseDataFrame` (:issue:`16773`)
- :attr:`Series.dt` no longer performs frequency inference, yielding a large speedup when accessing the attribute (:issue:`17210`)
- Improved performance of :meth:`Categorical.set_categories` by not materializing the values (:issue:`17508`)
- :attr:`Timestamp.microsecond` no longer re-computes on attribute access (:issue:`17331`)

.. _whatsnew_0210.bug_fixes:
Expand Down
37 changes: 35 additions & 2 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -777,8 +777,9 @@ def set_categories(self, new_categories, ordered=None, rename=False,
# remove all _codes which are larger and set to -1/NaN
self._codes[self._codes >= len(new_categories)] = -1
else:
values = cat.__array__()
cat._codes = _get_codes_for_values(values, new_categories)
codes = _recode_for_categories(self.codes, self.categories,
new_categories)
cat._codes = codes
cat._categories = new_categories

if ordered is None:
Expand Down Expand Up @@ -2113,6 +2114,38 @@ def _get_codes_for_values(values, categories):
return coerce_indexer_dtype(t.lookup(vals), cats)


def _recode_for_categories(codes, old_categories, new_categories):
"""
Convert a set of codes for to a new set of categories
Parameters
----------
codes : array
old_categories, new_categories : Index
Returns
-------
new_codes : array
Examples
--------
>>> old_cat = pd.Index(['b', 'a', 'c'])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

isn't this just a special case of union_categoricals?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

these both remap codes; seems like they've should share

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. There was a little bit I could extract from union_categorical. Simplified things a bit too by using take_1d. See my latest commit.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

looks nice.

>>> new_cat = pd.Index(['a', 'b'])
>>> codes = np.array([0, 1, 1, 2])
>>> _recode_for_categories(codes, old_cat, new_cat)
array([ 1, 0, 0, -1])
"""
from pandas.core.algorithms import take_1d

if len(old_categories) == 0:
# All null anyway, so just retain the nulls
return codes
indexer = coerce_indexer_dtype(new_categories.get_indexer(old_categories),
new_categories)
new_codes = take_1d(indexer, codes.copy(), fill_value=-1)
return new_codes


def _convert_to_list_like(list_like):
if hasattr(list_like, "dtype"):
return list_like
Expand Down
11 changes: 3 additions & 8 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,7 @@ def union_categoricals(to_union, sort_categories=False, ignore_order=False):
Categories (3, object): [b, c, a]
"""
from pandas import Index, Categorical, CategoricalIndex, Series
from pandas.core.categorical import _recode_for_categories

if len(to_union) == 0:
raise ValueError('No Categoricals to union')
Expand Down Expand Up @@ -359,14 +360,8 @@ def _maybe_unwrap(x):

new_codes = []
for c in to_union:
if len(c.categories) > 0:
indexer = categories.get_indexer(c.categories)

from pandas.core.algorithms import take_1d
new_codes.append(take_1d(indexer, c.codes, fill_value=-1))
else:
# must be all NaN
new_codes.append(c.codes)
new_codes.append(_recode_for_categories(c.codes, c.categories,
categories))
new_codes = np.concatenate(new_codes)
else:
# ordered - to show a proper error message
Expand Down
62 changes: 62 additions & 0 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
Interval, IntervalIndex)
from pandas.compat import range, lrange, u, PY3, PYPY
from pandas.core.config import option_context
from pandas.core.categorical import _recode_for_categories


class TestCategorical(object):
Expand Down Expand Up @@ -963,6 +964,67 @@ def test_rename_categories(self):
with pytest.raises(ValueError):
cat.rename_categories([1, 2])

@pytest.mark.parametrize('codes, old, new, expected', [
    ([0, 1], ['a', 'b'], ['a', 'b'], [0, 1]),
    ([0, 1], ['b', 'a'], ['b', 'a'], [0, 1]),
    ([0, 1], ['a', 'b'], ['b', 'a'], [1, 0]),
    ([0, 1], ['b', 'a'], ['a', 'b'], [1, 0]),
    ([0, 1, 0, 1], ['a', 'b'], ['a', 'b', 'c'], [0, 1, 0, 1]),
    ([0, 1, 2, 2], ['a', 'b', 'c'], ['a', 'b'], [0, 1, -1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['a', 'b', 'c'], [0, 1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['b'], [-1, 0, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], ['d'], [-1, -1, -1]),
    ([0, 1, -1], ['a', 'b', 'c'], [], [-1, -1, -1]),
    ([-1, -1], [], ['a', 'b'], [-1, -1]),
    ([1, 0], ['b', 'a'], ['a', 'b'], [0, 1]),
])
def test_recode_to_categories(self, codes, old, new, expected):
    """Recode small int8 code arrays between category sets.

    The cases cover identical, reordered, grown, shrunk, disjoint and
    empty category sets, with and without -1 (missing) codes.
    """
    old_index, new_index = Index(old), Index(new)
    int8_codes = np.asanyarray(codes, dtype=np.int8)
    int8_expected = np.asanyarray(expected, dtype=np.int8)

    recoded = _recode_for_categories(int8_codes, old_index, new_index)

    tm.assert_numpy_array_equal(recoded, int8_expected)

def test_recode_to_categories_large(self):
    """Recode 1000 codes onto the same categories in reversed order.

    The expected result is the reversed range as an int16 array, so the
    check also pins the dtype of the returned codes.
    """
    size = 1000
    ascending = np.arange(size)
    descending = np.arange(size - 1, -1, -1, dtype=np.int16)

    result = _recode_for_categories(ascending, Index(ascending),
                                    Index(descending))

    tm.assert_numpy_array_equal(result, descending)

@pytest.mark.parametrize('values, categories, new_categories', [
    # No NaNs, same cats, same order
    (['a', 'b', 'a'], ['a', 'b'], ['a', 'b'],),
    # No NaNs, same cats, different order
    (['a', 'b', 'a'], ['a', 'b'], ['b', 'a'],),
    # Same, unsorted
    (['b', 'a', 'a'], ['a', 'b'], ['a', 'b'],),
    # Unsorted, same cats, different order
    (['b', 'a', 'a'], ['a', 'b'], ['b', 'a'],),
    # NaNs
    (['a', 'b', 'c'], ['a', 'b'], ['a', 'b']),
    (['a', 'b', 'c'], ['a', 'b'], ['b', 'a']),
    (['b', 'a', 'c'], ['a', 'b'], ['a', 'b']),
    # Previously an exact duplicate of the case above; now reordered cats
    (['b', 'a', 'c'], ['a', 'b'], ['b', 'a']),
    # Introduce NaNs
    (['a', 'b', 'c'], ['a', 'b'], ['a']),
    (['a', 'b', 'c'], ['a', 'b'], ['b']),
    (['b', 'a', 'c'], ['a', 'b'], ['a']),
    # Previously an exact duplicate of the case above; now keeps only 'b'
    (['b', 'a', 'c'], ['a', 'b'], ['b']),
    # No overlap
    (['a', 'b', 'c'], ['a', 'b'], ['d', 'e']),
])
@pytest.mark.parametrize('ordered', [True, False])
def test_set_categories_many(self, values, categories, new_categories,
                             ordered):
    """Check ``set_categories`` against direct construction.

    Building a Categorical with ``categories`` and then calling
    ``set_categories(new_categories, ordered=ordered)`` must equal a
    Categorical built directly from ``values`` with ``new_categories``
    and ``ordered``.
    """
    c = Categorical(values, categories)
    expected = Categorical(values, new_categories, ordered)
    result = c.set_categories(new_categories, ordered=ordered)
    tm.assert_categorical_equal(result, expected)

def test_reorder_categories(self):
cat = Categorical(["a", "b", "c", "a"], ordered=True)
old = cat.copy()
Expand Down