Skip to content

Allow indices to be mapped through dictionaries or series #15081

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 22 commits into from
Nov 25, 2017
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
c8d2a80
Allow index.map() to accept series and dictionary inputs in addition …
nateyoder Oct 27, 2016
9a67ffd
add test to make sure dictionaries with missing keys work
nateyoder Oct 27, 2016
2b70597
Refactor the code to work with period time time delta indices
nateyoder Jan 7, 2017
80ca2e2
Makes changes based on feedback from @jreback
nateyoder Jan 22, 2017
00165c4
Using numpy array from return if possible
nateyoder Apr 22, 2017
2f019c5
Update tests to include empty maps and NaT values
nateyoder Apr 22, 2017
f6a2404
Refactor map to use common code for series and index when possible an…
nateyoder Apr 22, 2017
0c72a38
Fix bug related to converting a dictionary to a MultiIndex instead of…
nateyoder Apr 23, 2017
73c276b
pep8 fixes
nateyoder Apr 24, 2017
a858467
Address comments from @jreback add new tests; skip tests on IntervalI…
nateyoder May 21, 2017
68479a3
Fix issues from merge
nateyoder May 21, 2017
80a4c9c
Fix issues from merge
nateyoder Oct 31, 2017
30e7e7a
replace _constructor
jreback Nov 1, 2017
25bba86
Merge branch 'master' into PR_TOOL_MERGE_PR_15081
jreback Nov 19, 2017
51c5b2b
move whatsnew
jreback Nov 19, 2017
8f0198e
unify _map_values a bit more
jreback Nov 19, 2017
80fad28
use an efficient path for mapping
jreback Nov 19, 2017
5006b24
restore name of mapper in .map
jreback Nov 19, 2017
4134641
more renaming
jreback Nov 19, 2017
096b0e6
Merge branch 'master' into PR_TOOL_MERGE_PR_15081
jreback Nov 24, 2017
c0f3b76
review comments
jreback Nov 24, 2017
dd0b7e9
handle empty maps
jreback Nov 24, 2017
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions asv_bench/benchmarks/series_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -123,6 +123,30 @@ def time_series_dropna_datetime(self):
self.s.dropna()


class series_map_dict(object):
    """Benchmark ``Series.map`` when the mapper is a plain ``dict``."""

    goal_time = 0.2

    def setup(self):
        """Build a 10000-element integer Series and a 1000-key dict."""
        n_keys = 1000
        # Every value drawn here is in [0, n_keys), i.e. a key of the dict.
        self.s = Series(np.random.randint(0, n_keys, 10000))
        # Key i maps to n_keys - i.
        self.map_dict = dict(zip(range(n_keys), range(n_keys, 0, -1)))

    def time_series_map_dict(self):
        """Timed body: map the Series through the dict."""
        self.s.map(self.map_dict)


class series_map_series(object):
    """Benchmark ``Series.map`` when the mapper is another ``Series``."""

    goal_time = 0.2

    def setup(self):
        """Build a 10000-element integer Series and a 1000-row mapper."""
        n_keys = 1000
        # Every value drawn here is in [0, n_keys), i.e. a label of the
        # mapper's default integer index.
        self.s = Series(np.random.randint(0, n_keys, 10000))
        # Position i holds n_keys - i.
        self.map_series = Series(np.arange(n_keys, 0, -1))

    def time_series_map_series(self):
        """Timed body: map the Series through the mapper Series."""
        self.s.map(self.map_series)


class series_clip(object):
goal_time = 0.2

Expand Down
3 changes: 2 additions & 1 deletion doc/source/whatsnew/v0.22.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ Other API Changes
- :class:`CacheableOffset` and :class:`WeekDay` are no longer available in the ``pandas.tseries.offsets`` module (:issue:`17830`)
- `tseries.frequencies.get_freq_group()` and `tseries.frequencies.DAYS` are removed from the public API (:issue:`18034`)
- :func:`Series.truncate` and :func:`DataFrame.truncate` will raise a ``ValueError`` if the index is not sorted instead of an unhelpful ``KeyError`` (:issue:`17935`)
- :func:`Index.map` can now accept ``Series`` and dictionary input objects (:issue:`12756`).


.. _whatsnew_0220.deprecations:
Expand Down Expand Up @@ -77,7 +78,7 @@ Performance Improvements
- Added a keyword argument, ``cache``, to :func:`to_datetime` that improved the performance of converting duplicate datetime arguments (:issue:`11665`)
- :class:`DateOffset` arithmetic performance is improved (:issue:`18218`)
- Converting a ``Series`` of ``Timedelta`` objects to days, seconds, etc... sped up through vectorization of underlying methods (:issue:`18092`)
-
- Improved performance of ``.map()`` with a ``Series/dict`` input (:issue:`15081`)

.. _whatsnew_0220.docs:

Expand Down
74 changes: 72 additions & 2 deletions pandas/core/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,12 @@
is_object_dtype,
is_list_like,
is_scalar,
is_datetimelike)
is_datetimelike,
is_extension_type)

from pandas.util._validators import validate_bool_kwarg

from pandas.core import common as com
from pandas.core import common as com, algorithms
import pandas.core.nanops as nanops
import pandas._libs.lib as lib
from pandas.compat.numpy import function as nv
Expand Down Expand Up @@ -838,6 +839,75 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None,
klass=self.__class__.__name__, op=name))
return func(**kwds)

def _map_values(self, arg, na_action=None):
    """Map the values of this object through ``arg``.

    Internal helper. A Series (or a plain dict, which is first turned
    into a Series) is resolved with an index lookup on the whole array;
    anything else is applied to the values as a function.

    Parameters
    ----------
    arg : function, dict, or Series
        The input correspondence object.
    na_action : {None, 'ignore'}
        If 'ignore', propagate NA values, without passing them to the
        mapping function. Only read when ``arg`` is applied as a
        function, i.e. not for a Series or a plain dict.

    Returns
    -------
    new_values
        The mapped values, exactly as produced by the lookup or by the
        mapping function. They are not wrapped in an Index or Series
        here.

    Raises
    ------
    NotImplementedError
        If ``na_action`` is not None, ``arg`` is applied as a function
        and the values are an extension type.
    """
    if isinstance(arg, dict):
        if hasattr(arg, '__missing__'):
            # A dict subclass with a default-value hook must keep being
            # queried through ``__getitem__``, so it is wrapped in a
            # lookup function (GH #15999).
            lookup = arg
            arg = lambda key: lookup[key]
        else:
            # No default-value hook, so it is safe to convert to a
            # Series and take the lookup path below.
            from pandas import Series
            arg = Series(arg, index=arg.keys())

    # Extension-typed data is read through ``_values``; everything else
    # through ``values`` (lookup path) or an object cast (function path).
    extension = is_extension_type(self.dtype)

    if isinstance(arg, ABCSeries):
        # Came from a Series or a converted dict: locate every value in
        # the mapper's index, then take the matching mapper values.
        values = self._values if extension else self.values
        indexer = arg.index.get_indexer(values)
        return algorithms.take_1d(arg._values, indexer)

    # From here on ``arg`` is applied as a function.
    if extension:
        values = self._values
        if na_action is not None:
            raise NotImplementedError
        return values.map(arg)

    # The function has to see python objects, so cast to object first and
    # unwrap to the underlying ``values`` when the result exposes them.
    values = self.astype(object)
    values = getattr(values, 'values', values)
    if na_action == 'ignore':
        mask = isna(values).view(np.uint8)
        return lib.map_infer_mask(values, arg, mask)
    return lib.map_infer(values, arg)

def value_counts(self, normalize=False, sort=True, ascending=False,
bins=None, dropna=True):
"""
Expand Down
51 changes: 41 additions & 10 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from pandas.compat.numpy import function as nv
from pandas import compat


from pandas.core.dtypes.generic import (
ABCSeries,
ABCMultiIndex,
Expand Down Expand Up @@ -2821,6 +2820,27 @@ def get_indexer_for(self, target, **kwargs):
indexer, _ = self.get_indexer_non_unique(target, **kwargs)
return indexer

_index_shared_docs['_get_values_from_dict'] = """
Return the values of the input dictionary in the order the keys are
in the index. np.nan is returned for index values not in the
dictionary.

Parameters
----------
data : dict
The dictionary from which to extract the values

Returns
-------
np.array

"""

@Appender(_index_shared_docs['_get_values_from_dict'])
def _get_values_from_dict(self, data):
    # No docstring here on purpose: the shared text is attached by the
    # ``Appender`` decorator above.
    # Look up every label of this index in ``data``, in index order,
    # using np.nan as the default for labels the dict does not contain.
    lookup_keys = self.values
    return lib.fast_multiget(data, lookup_keys, default=np.nan)

def _maybe_promote(self, other):
# A hack, but it works
from pandas.core.indexes.datetimes import DatetimeIndex
Expand Down Expand Up @@ -2859,13 +2879,16 @@ def groupby(self, values):

return result

def map(self, mapper):
"""Apply mapper function to an index.
def map(self, arg, na_action=None):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

technically need to deprecate and rename mapper -> arg add a versionadded (0.21.0) for na_action.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this something I need to do? If so can you show me how you've done this in the past? Otherwise should I just change arg -> mapper?

"""Map values of Series using input correspondence (which can be a
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The first sentence should be a single line so that it looks OK in the autosummary table.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

dict, Series, or function)

Parameters
----------
mapper : callable
Function to be applied.
arg : function, dict, or Series
na_action : {None, 'ignore'}
If 'ignore', propagate NA values, without passing them to the
mapping function

Returns
-------
Expand All @@ -2875,15 +2898,23 @@ def map(self, mapper):
a MultiIndex will be returned.

"""

from .multi import MultiIndex
mapped_values = self._arrmap(self.values, mapper)
new_values = super(Index, self)._map_values(
arg, na_action=na_action)
attributes = self._get_attributes_dict()
if mapped_values.size and isinstance(mapped_values[0], tuple):
return MultiIndex.from_tuples(mapped_values,
names=attributes.get('name'))
if new_values.size and isinstance(new_values[0], tuple):
if isinstance(self, MultiIndex):
names = self.names
elif attributes.get('name'):
names = [attributes.get('name')] * len(new_values[0])
else:
names = None
return MultiIndex.from_tuples(new_values,
names=names)

attributes['copy'] = False
return Index(mapped_values, **attributes)
return Index(new_values, **attributes)

def isin(self, values, level=None):
"""
Expand Down
10 changes: 9 additions & 1 deletion pandas/core/indexes/datetimelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,7 +352,7 @@ def map(self, f):

# Try to use this result if we can
if isinstance(result, np.ndarray):
self._shallow_copy(result)
result = Index(result)

if not isinstance(result, Index):
raise TypeError('The map function must return an Index object')
Expand Down Expand Up @@ -698,6 +698,14 @@ def __rsub__(self, other):
def _add_delta(self, other):
return NotImplemented

@Appender(_index_shared_docs['_get_values_from_dict'])
def _get_values_from_dict(self, data):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this seems odd that you are returning np.nan for a datetimelike. do tests hit this?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I would assume it does get hit indirectly, as it was essentially pulled into the subclasses from the pd.Series instantiation. It is essentially replicating the behavior that was previously in lines 188-203 of pandas/core/series.py and allowing each object to handle it as seen fit rather than doing it in an if conditional. Personally, this way makes more sense to me, but since I ended up not making use of this functionality outside of the Series instantiation, I'm happy to revert the changes if you'd prefer.

if len(data):
return np.array([data.get(i, np.nan)
for i in self.asobject.values])

return np.array([np.nan])

def _add_delta_td(self, other):
# add a delta of a timedeltalike
# return the i8 result view
Expand Down
11 changes: 11 additions & 0 deletions pandas/core/indexes/datetimes.py
Original file line number Diff line number Diff line change
Expand Up @@ -1451,6 +1451,17 @@ def get_value_maybe_box(self, series, key):
key, tz=self.tz)
return _maybe_box(self, values, series, key)

@Appender(_index_shared_docs['_get_values_from_dict'])
def _get_values_from_dict(self, data):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

if this is needed for datetimelike then push this to the super class rather than having 2 impls.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See the above comment.

if len(data):
# coerce back to datetime objects for lookup
data = com._dict_compat(data)
return lib.fast_multiget(data,
self.asobject.values,
default=np.nan)

return np.array([np.nan])

def get_loc(self, key, method=None, tolerance=None):
"""
Get integer location for requested label
Expand Down
56 changes: 4 additions & 52 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
_maybe_match_name,
SettingWithCopyError,
_maybe_box_datetimelike,
_dict_compat,
standardize_mapping,
_any_none)
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
Expand Down Expand Up @@ -202,23 +201,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None,
index = Index(data)
else:
index = Index(_try_sort(data))

try:
if isinstance(index, DatetimeIndex):
if len(data):
# coerce back to datetime objects for lookup
data = _dict_compat(data)
data = lib.fast_multiget(data,
index.asobject.values,
default=np.nan)
else:
data = np.nan
# GH #12169
elif isinstance(index, (PeriodIndex, TimedeltaIndex)):
data = ([data.get(i, np.nan) for i in index]
if data else np.nan)
else:
data = lib.fast_multiget(data, index.values,
default=np.nan)
data = index._get_values_from_dict(data)
except TypeError:
data = ([data.get(i, np.nan) for i in index]
if data else np.nan)
Expand Down Expand Up @@ -2337,41 +2322,8 @@ def map(self, arg, na_action=None):
3 0
dtype: int64
"""

if is_extension_type(self.dtype):
values = self._values
if na_action is not None:
raise NotImplementedError
map_f = lambda values, f: values.map(f)
else:
values = self.asobject

if na_action == 'ignore':
def map_f(values, f):
return lib.map_infer_mask(values, f,
isna(values).view(np.uint8))
else:
map_f = lib.map_infer

if isinstance(arg, dict):
if hasattr(arg, '__missing__'):
# If a dictionary subclass defines a default value method,
# convert arg to a lookup function (GH #15999).
dict_with_default = arg
arg = lambda x: dict_with_default[x]
else:
# Dictionary does not have a default. Thus it's safe to
# convert to an indexed series for efficiency.
arg = self._constructor(arg, index=arg.keys())

if isinstance(arg, Series):
# arg is a Series
indexer = arg.index.get_indexer(values)
new_values = algorithms.take_1d(arg._values, indexer)
else:
# arg is a function
new_values = map_f(values, arg)

new_values = super(Series, self)._map_values(
arg, na_action=na_action)
return self._constructor(new_values,
index=self.index).__finalize__(self)

Expand Down
26 changes: 26 additions & 0 deletions pandas/tests/indexes/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -996,3 +996,29 @@ def test_searchsorted_monotonic(self, indices):
# non-monotonic should raise.
with pytest.raises(ValueError):
indices._searchsorted_monotonic(value, side='left')

def test_map(self):
    # Identity mappings given as a function, a dict and a Series should
    # reproduce the index; empty mappers should yield an all-NaN index.
    idx = self.create_index()

    # From the output of a UInt64Index mapping we can't infer that we
    # shouldn't default to Int64, so compare against an index rebuilt
    # from a plain python list instead.
    expected = (Index(idx.values.tolist())
                if isinstance(idx, UInt64Index) else idx)

    # function mapper
    tm.assert_index_equal(idx.map(lambda x: x), expected)

    # dict mapper
    as_dict = dict(zip(idx, idx))
    tm.assert_index_equal(idx.map(as_dict), expected)

    # Series mapper; use values to work around MultiIndex
    # instantiation of series
    as_series = Series(expected.values, index=idx)
    tm.assert_index_equal(idx.map(as_series), expected)

    # empty mappable: first an empty Series, then an empty dict
    all_nan = pd.Index([np.nan] * len(idx))
    for empty_mapper in (pd.Series(), {}):
        tm.assert_index_equal(idx.map(empty_mapper), all_nan)
Loading