Skip to content

Commit 467ee2b

Browse files
nateyoder authored and jreback committed
Allow indices to be mapped through dictionaries or series (#15081)
1 parent 412988e commit 467ee2b

File tree

18 files changed

+386
-93
lines changed

18 files changed

+386
-93
lines changed

asv_bench/benchmarks/series_methods.py

+24
Original file line numberDiff line numberDiff line change
@@ -123,6 +123,30 @@ def time_series_dropna_datetime(self):
123123
self.s.dropna()
124124

125125

126+
class series_map_dict(object):
    """Benchmark ``Series.map`` when the mapper is a plain ``dict``."""

    goal_time = 0.2

    def setup(self):
        # 10000 random integers, every one of which is a key of the dict.
        n_keys = 1000
        self.s = Series(np.random.randint(0, n_keys, 10000))
        # Key ``k`` maps to ``n_keys - k`` (0 -> 1000, ..., 999 -> 1).
        self.map_dict = dict(zip(range(n_keys), range(n_keys, 0, -1)))

    def time_series_map_dict(self):
        self.s.map(self.map_dict)
136+
137+
138+
class series_map_series(object):
    """Benchmark ``Series.map`` when the mapper is itself a ``Series``."""

    goal_time = 0.2

    def setup(self):
        # 10000 random integers, every one of which is a label of the
        # mapping Series' default integer index.
        n_keys = 1000
        self.s = Series(np.random.randint(0, n_keys, 10000))
        # Label ``k`` maps to ``n_keys - k`` (0 -> 1000, ..., 999 -> 1).
        self.map_series = Series(np.arange(n_keys, 0, -1))

    def time_series_map_series(self):
        self.s.map(self.map_series)
148+
149+
126150
class series_clip(object):
127151
goal_time = 0.2
128152

doc/source/whatsnew/v0.22.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ Other API Changes
7575
- :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`)
7676
- ``tseries.frequencies.get_freq_group()`` and ``tseries.frequencies.DAYS`` are removed from the public API (:issue:`18034`)
7777
- :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`)
78+
- :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`).
7879
- :func:`DataFrame.unstack` will now default to filling with ``np.nan`` for ``object`` columns. (:issue:`12815`)
7980
- :class:`IntervalIndex` constructor will raise if the ``closed`` parameter conflicts with how the input data is inferred to be closed (:issue:`18421`)
8081

@@ -108,6 +109,7 @@ Performance Improvements
108109
- Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`)
109110
- :class:`DateOffset` arithmetic performance is improved (:issue:`18218`)
110111
- Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... sped up through vectorization of underlying methods (:issue:`18092`)
112+
- Improved performance of ``.map()`` with a ``Series/dict`` input (:issue:`15081`)
111113
- The overridden ``Timedelta`` properties of days, seconds and microseconds have been removed, leveraging their built-in Python versions instead (:issue:`18242`)
112114
- ``Series`` construction will reduce the number of copies made of the input data in certain cases (:issue:`17449`)
113115
- Improved performance of :func:`Series.dt.date` and :func:`DatetimeIndex.date` (:issue:`18058`)

pandas/core/base.py

+75-2
Original file line numberDiff line numberDiff line change
@@ -12,11 +12,12 @@
1212
is_object_dtype,
1313
is_list_like,
1414
is_scalar,
15-
is_datetimelike)
15+
is_datetimelike,
16+
is_extension_type)
1617

1718
from pandas.util._validators import validate_bool_kwarg
1819

19-
from pandas.core import common as com
20+
from pandas.core import common as com, algorithms
2021
import pandas.core.nanops as nanops
2122
import pandas._libs.lib as lib
2223
from pandas.compat.numpy import function as nv
@@ -838,6 +839,78 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
838839
klass=self.__class__.__name__, op=name))
839840
return func(**kwds)
840841

842+
def _map_values(self, mapper, na_action=None):
    """Map the values of this object using the input correspondence.

    An internal helper shared by the public ``map`` methods. The
    correspondence can be a dict, a Series, or a function.

    Parameters
    ----------
    mapper : function, dict, or Series
        The input correspondence object.
    na_action : {None, 'ignore'}
        If 'ignore', propagate NA values, without passing them to the
        mapping function. Only consulted when ``mapper`` ends up being
        called as a function (a callable, or a dict defining
        ``__missing__``); plain dict / Series mappers are resolved by an
        index lookup and ``na_action`` is not used on that path.

    Returns
    -------
    new_values : array-like
        The mapped values, returned as produced by the lookup or by the
        mapping function. This method does not wrap them; building an
        Index / MultiIndex / Series from the result is left to the
        caller.

    Raises
    ------
    NotImplementedError
        If ``na_action`` is not None and the values are of an extension
        type.
    """

    # We can fastpath dict/Series to an efficient map, as we know that
    # we are not going to have to yield python types.
    if isinstance(mapper, dict):
        if hasattr(mapper, '__missing__'):
            # If a dictionary subclass defines a default value method,
            # convert mapper to a lookup function (GH #15999) so that
            # ``__missing__`` is honoured for absent keys.
            dict_with_default = mapper

            def mapper(x):
                return dict_with_default[x]
        else:
            # Dictionary does not have a default. Thus it's safe to
            # convert to a Series for efficiency.
            # We specify the keys here to handle the possibility that
            # they are tuples.
            from pandas import Series
            mapper = Series(mapper, index=mapper.keys())

    if isinstance(mapper, ABCSeries):
        # The mapper is a Series (given directly, or built from a dict
        # above): look every value up in the mapper's index and take
        # the corresponding mapped values.
        if is_extension_type(self.dtype):
            values = self._values
        else:
            values = self.values

        indexer = mapper.index.get_indexer(values)
        return algorithms.take_1d(mapper._values, indexer)

    # From here on the mapper is a function.
    if is_extension_type(self.dtype):
        # Extension-type values are mapped through their own ``map``.
        values = self._values
        if na_action is not None:
            raise NotImplementedError(
                "na_action is not supported when mapping "
                "extension-type values")
        return values.map(mapper)

    # We must convert to python types before calling the function.
    values = self.astype(object)
    values = getattr(values, 'values', values)
    if na_action == 'ignore':
        # NA positions are passed as a uint8 mask alongside the values.
        return lib.map_infer_mask(values, mapper,
                                  isna(values).view(np.uint8))
    return lib.map_infer(values, mapper)
913+
841914
def value_counts(self, normalize=False, sort=True, ascending=False,
842915
bins=None, dropna=True):
843916
"""

pandas/core/dtypes/cast.py

+35
Original file line numberDiff line numberDiff line change
@@ -1127,3 +1127,38 @@ def cast_scalar_to_array(shape, value, dtype=None):
11271127
values.fill(fill_value)
11281128

11291129
return values
1130+
1131+
1132+
def construct_1d_arraylike_from_scalar(value, length, dtype):
    """
    Create a 1-d np.ndarray / pandas type of the specified length and
    dtype, with every element set to ``value``.

    Parameters
    ----------
    value : scalar value
    length : int
    dtype : pandas_dtype / np.dtype

    Returns
    -------
    np.ndarray / pandas type of length, filled with value

    Notes
    -----
    For an integer ``dtype`` and a missing ``value`` the result is
    float64, since the requested integer dtype is replaced before the
    array is allocated.
    """
    if is_datetimetz(dtype):
        from pandas import DatetimeIndex
        subarr = DatetimeIndex([value] * length, dtype=dtype)
    elif is_categorical_dtype(dtype):
        from pandas import Categorical
        # Pass the requested dtype through so that its categories and
        # orderedness are kept, rather than being re-inferred from the
        # single repeated value.
        subarr = Categorical([value] * length, dtype=dtype)
    else:
        # NOTE(review): anything that is neither an np.dtype instance nor
        # an instance of np.dtype's metaclass is assumed to carry the
        # real dtype on a ``.dtype`` attribute - confirm against callers.
        if not isinstance(dtype, (np.dtype, type(np.dtype))):
            dtype = dtype.dtype

        # coerce if we have nan for an integer dtype
        if is_integer_dtype(dtype) and isna(value):
            dtype = np.float64
        subarr = np.empty(length, dtype=dtype)
        subarr.fill(value)

    return subarr

pandas/core/dtypes/missing.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -369,13 +369,14 @@ def _maybe_fill(arr, fill_value=np.nan):
369369
return arr
370370

371371

372-
def na_value_for_dtype(dtype):
372+
def na_value_for_dtype(dtype, compat=True):
373373
"""
374374
Return a dtype compat na value
375375
376376
Parameters
377377
----------
378378
dtype : string / dtype
379+
compat : boolean, default True
379380
380381
Returns
381382
-------
@@ -389,7 +390,9 @@ def na_value_for_dtype(dtype):
389390
elif is_float_dtype(dtype):
390391
return np.nan
391392
elif is_integer_dtype(dtype):
392-
return 0
393+
if compat:
394+
return 0
395+
return np.nan
393396
elif is_bool_dtype(dtype):
394397
return False
395398
return np.nan

pandas/core/indexes/base.py

+43-10
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@
1313
from pandas.compat.numpy import function as nv
1414
from pandas import compat
1515

16-
1716
from pandas.core.dtypes.generic import (
1817
ABCSeries,
1918
ABCMultiIndex,
@@ -2827,6 +2826,27 @@ def get_indexer_for(self, target, **kwargs):
28272826
indexer, _ = self.get_indexer_non_unique(target, **kwargs)
28282827
return indexer
28292828

2829+
_index_shared_docs['_get_values_from_dict'] = """
2830+
Return the values of the input dictionary in the order the keys are
2831+
in the index. np.nan is returned for index values not in the
2832+
dictionary.
2833+
2834+
Parameters
2835+
----------
2836+
data : dict
2837+
The dictionary from which to extract the values
2838+
2839+
Returns
2840+
-------
2841+
np.array
2842+
2843+
"""
2844+
2845+
@Appender(_index_shared_docs['_get_values_from_dict'])
def _get_values_from_dict(self, data):
    # Look up every index value in ``data``; np.nan is the default used
    # for values that have no entry.
    index_values = self.values
    return lib.fast_multiget(data, index_values, default=np.nan)
2849+
28302850
def _maybe_promote(self, other):
28312851
# A hack, but it works
28322852
from pandas.core.indexes.datetimes import DatetimeIndex
@@ -2865,13 +2885,15 @@ def groupby(self, values):
28652885

28662886
return result
28672887

2868-
def map(self, mapper):
2869-
"""Apply mapper function to an index.
2888+
def map(self, mapper, na_action=None):
2889+
"""Map values of Series using input correspondence
28702890
28712891
Parameters
28722892
----------
2873-
mapper : callable
2874-
Function to be applied.
2893+
mapper : function, dict, or Series
2894+
na_action : {None, 'ignore'}
2895+
If 'ignore', propagate NA values, without passing them to the
2896+
mapping function
28752897
28762898
Returns
28772899
-------
@@ -2881,15 +2903,26 @@ def map(self, mapper):
28812903
a MultiIndex will be returned.
28822904
28832905
"""
2906+
28842907
from .multi import MultiIndex
2885-
mapped_values = self._arrmap(self.values, mapper)
2908+
new_values = super(Index, self)._map_values(
2909+
mapper, na_action=na_action)
28862910
attributes = self._get_attributes_dict()
2887-
if mapped_values.size and isinstance(mapped_values[0], tuple):
2888-
return MultiIndex.from_tuples(mapped_values,
2889-
names=attributes.get('name'))
2911+
if new_values.size and isinstance(new_values[0], tuple):
2912+
if isinstance(self, MultiIndex):
2913+
names = self.names
2914+
elif attributes.get('name'):
2915+
names = [attributes.get('name')] * len(new_values[0])
2916+
else:
2917+
names = None
2918+
return MultiIndex.from_tuples(new_values,
2919+
names=names)
28902920

28912921
attributes['copy'] = False
2892-
return Index(mapped_values, **attributes)
2922+
2923+
# we infer the result types based on the
2924+
# returned values
2925+
return Index(new_values, **attributes)
28932926

28942927
def isin(self, values, level=None):
28952928
"""

pandas/core/indexes/datetimelike.py

+10-2
Original file line numberDiff line numberDiff line change
@@ -136,7 +136,7 @@ def equals(self, other):
136136
elif not isinstance(other, type(self)):
137137
try:
138138
other = type(self)(other)
139-
except:
139+
except Exception:
140140
return False
141141

142142
if not is_dtype_equal(self.dtype, other.dtype):
@@ -352,7 +352,7 @@ def map(self, f):
352352

353353
# Try to use this result if we can
354354
if isinstance(result, np.ndarray):
355-
self._shallow_copy(result)
355+
result = Index(result)
356356

357357
if not isinstance(result, Index):
358358
raise TypeError('The map function must return an Index object')
@@ -698,6 +698,14 @@ def __rsub__(self, other):
698698
def _add_delta(self, other):
699699
return NotImplemented
700700

701+
@Appender(_index_shared_docs['_get_values_from_dict'])
def _get_values_from_dict(self, data):
    # An empty mapping yields a single-element NaN array.
    if not len(data):
        return np.array([np.nan])

    # Look up each element of the object-dtype view of the index,
    # falling back to np.nan for keys missing from ``data``.
    boxed_keys = self.asobject.values
    return np.array([data.get(key, np.nan) for key in boxed_keys])
708+
701709
def _add_delta_td(self, other):
702710
# add a delta of a timedeltalike
703711
# return the i8 result view

pandas/core/indexes/datetimes.py

+11
Original file line numberDiff line numberDiff line change
@@ -1457,6 +1457,17 @@ def get_value_maybe_box(self, series, key):
14571457
key, tz=self.tz)
14581458
return _maybe_box(self, values, series, key)
14591459

1460+
@Appender(_index_shared_docs['_get_values_from_dict'])
def _get_values_from_dict(self, data):
    # An empty mapping yields a single-element NaN array.
    if not len(data):
        return np.array([np.nan])

    # coerce back to datetime objects for lookup
    compat_data = com._dict_compat(data)
    lookup_keys = self.asobject.values
    return lib.fast_multiget(compat_data, lookup_keys, default=np.nan)
1470+
14601471
def get_loc(self, key, method=None, tolerance=None):
14611472
"""
14621473
Get integer location for requested label

0 commit comments

Comments
 (0)