Skip to content

Commit 80be9b5

Browse files
rtlee9 authored and jreback committed
Improve dictionary map performance on category series, fixes #23785 (#26015)
1 parent 7eff627 commit 80be9b5

File tree

3 files changed

+27
-11
lines changed

3 files changed

+27
-11
lines changed

asv_bench/benchmarks/series_methods.py

+18-7
Original file line number | Diff line number | Diff line change
@@ -145,16 +145,27 @@ def time_searchsorted(self, dtype):
145145

146146
class Map:
147147

148-
params = ['dict', 'Series']
148+
params = (['dict', 'Series', 'lambda'], ['object', 'category', 'int'])
149149
param_names = 'mapper'
150150

151-
def setup(self, mapper):
151+
def setup(self, mapper, dtype):
152152
map_size = 1000
153-
map_data = Series(map_size - np.arange(map_size))
154-
self.map_data = map_data if mapper == 'Series' else map_data.to_dict()
155-
self.s = Series(np.random.randint(0, map_size, 10000))
156-
157-
def time_map(self, mapper):
153+
map_data = Series(map_size - np.arange(map_size), dtype=dtype)
154+
155+
# construct mapper
156+
if mapper == 'Series':
157+
self.map_data = map_data
158+
elif mapper == 'dict':
159+
self.map_data = map_data.to_dict()
160+
elif mapper == 'lambda':
161+
map_dict = map_data.to_dict()
162+
self.map_data = lambda x: map_dict[x]
163+
else:
164+
raise NotImplementedError
165+
166+
self.s = Series(np.random.randint(0, map_size, 10000), dtype=dtype)
167+
168+
def time_map(self, mapper, *args, **kwargs):
158169
self.s.map(self.map_data)
159170

160171

doc/source/whatsnew/v0.25.0.rst

+2-1
Original file line number | Diff line number | Diff line change
@@ -245,10 +245,11 @@ Performance Improvements
245245
- Improved performance of :meth:`pandas.core.groupby.GroupBy.quantile` (:issue:`20405`)
246246
- Improved performance of :meth:`read_csv` by faster tokenizing and faster parsing of small float numbers (:issue:`25784`)
247247
- Improved performance of :meth:`read_csv` by faster parsing of N/A and boolean values (:issue:`25804`)
248-
- Imporved performance of :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.is_monotonic_increasing` and :meth:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`)
248+
- Improved performance of :meth:`IntervalIndex.is_monotonic`, :meth:`IntervalIndex.is_monotonic_increasing` and :meth:`IntervalIndex.is_monotonic_decreasing` by removing conversion to :class:`MultiIndex` (:issue:`24813`)
249249
- Improved performance of :meth:`DataFrame.to_csv` when writing datetime dtypes (:issue:`25708`)
250250
- Improved performance of :meth:`read_csv` by much faster parsing of ``MM/YYYY`` and ``DD/MM/YYYY`` datetime formats (:issue:`25922`)
251251
- Improved performance of nanops for dtypes that cannot store NaNs. Speedup is particularly prominent for :meth:`Series.all` and :meth:`Series.any` (:issue:`25070`)
252+
- Improved performance of :meth:`Series.map` for dictionary mappers on categorical series by mapping the categories instead of mapping all values (:issue:`23785`)
252253

253254
.. _whatsnew_0250.bug_fixes:
254255

pandas/core/base.py

+7-3
Original file line number | Diff line number | Diff line change
@@ -16,9 +16,9 @@
1616
from pandas.util._validators import validate_bool_kwarg
1717

1818
from pandas.core.dtypes.common import (
19-
is_datetime64_ns_dtype, is_datetime64tz_dtype, is_datetimelike,
20-
is_extension_array_dtype, is_extension_type, is_list_like, is_object_dtype,
21-
is_scalar, is_timedelta64_ns_dtype)
19+
is_categorical_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype,
20+
is_datetimelike, is_extension_array_dtype, is_extension_type, is_list_like,
21+
is_object_dtype, is_scalar, is_timedelta64_ns_dtype)
2222
from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries
2323
from pandas.core.dtypes.missing import isna
2424

@@ -1183,6 +1183,10 @@ def _map_values(self, mapper, na_action=None):
11831183
if isinstance(mapper, ABCSeries):
11841184
# Since values were input this means we came from either
11851185
# a dict or a series and mapper should be an index
1186+
if is_categorical_dtype(self._values):
1187+
# use the built in categorical series mapper which saves
1188+
# time by mapping the categories instead of all values
1189+
return self._values.map(mapper)
11861190
if is_extension_type(self.dtype):
11871191
values = self._values
11881192
else:

0 commit comments

Comments
 (0)