-
-
Notifications
You must be signed in to change notification settings - Fork 18.5k
Allow indices to be mapped through dictionaries or series #15081
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 19 commits
c8d2a80
9a67ffd
2b70597
80ca2e2
00165c4
2f019c5
f6a2404
0c72a38
73c276b
a858467
68479a3
80a4c9c
30e7e7a
25bba86
51c5b2b
8f0198e
80fad28
5006b24
4134641
096b0e6
c0f3b76
dd0b7e9
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,11 +12,12 @@ | |
is_object_dtype, | ||
is_list_like, | ||
is_scalar, | ||
is_datetimelike) | ||
is_datetimelike, | ||
is_extension_type) | ||
|
||
from pandas.util._validators import validate_bool_kwarg | ||
|
||
from pandas.core import common as com | ||
from pandas.core import common as com, algorithms | ||
import pandas.core.nanops as nanops | ||
import pandas._libs.lib as lib | ||
from pandas.compat.numpy import function as nv | ||
|
@@ -838,6 +839,75 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, | |
klass=self.__class__.__name__, op=name)) | ||
return func(**kwds) | ||
|
||
def _map_values(self, mapper, na_action=None): | ||
"""An internal function that maps values using the input | ||
correspondence (which can be a dict, Series, or function). | ||
|
||
Parameters | ||
---------- | ||
mapper : function, dict, or Series | ||
The input correspondence object | ||
na_action : {None, 'ignore'} | ||
If 'ignore', propagate NA values, without passing them to the | ||
mapping function | ||
|
||
Returns | ||
------- | ||
applied : Union[Index, MultiIndex], inferred | ||
The output of the mapping function applied to the index. | ||
If the function returns a tuple with more than one element | ||
a MultiIndex will be returned. | ||
|
||
""" | ||
|
||
# we can fastpath dict/Series to an efficient map | ||
# as we know that we are not going to have to yield | ||
# python types | ||
if isinstance(mapper, dict): | ||
if hasattr(mapper, '__missing__'): | ||
# If a dictionary subclass defines a default value method, | ||
# convert mapper to a lookup function (GH #15999). | ||
dict_with_default = mapper | ||
mapper = lambda x: dict_with_default[x] | ||
else: | ||
# Dictionary does not have a default. Thus it's safe to | ||
# convert to a Series for efficiency. | ||
from pandas import Series | ||
mapper = Series(mapper, index=mapper.keys()) | ||
|
||
if isinstance(mapper, ABCSeries): | ||
# Since values were input this means we came from either | ||
# a dict or a series and mapper should be an index | ||
if is_extension_type(self.dtype): | ||
values = self._values | ||
else: | ||
values = self.values | ||
|
||
indexer = mapper.index.get_indexer(values) | ||
new_values = algorithms.take_1d(mapper._values, indexer) | ||
return new_values | ||
|
||
# we must convert to python types | ||
if is_extension_type(self.dtype): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you add a doc-string There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 👍 |
||
values = self._values | ||
if na_action is not None: | ||
raise NotImplementedError | ||
map_f = lambda values, f: values.map(f) | ||
else: | ||
values = self.astype(object) | ||
values = getattr(values, 'values', values) | ||
if na_action == 'ignore': | ||
def map_f(values, f): | ||
return lib.map_infer_mask(values, f, | ||
isna(values).view(np.uint8)) | ||
else: | ||
map_f = lib.map_infer | ||
|
||
# mapper is a function | ||
new_values = map_f(values, mapper) | ||
|
||
return new_values | ||
|
||
def value_counts(self, normalize=False, sort=True, ascending=False, | ||
bins=None, dropna=True): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,6 @@ | |
from pandas.compat.numpy import function as nv | ||
from pandas import compat | ||
|
||
|
||
from pandas.core.dtypes.generic import ( | ||
ABCSeries, | ||
ABCMultiIndex, | ||
|
@@ -2821,6 +2820,27 @@ def get_indexer_for(self, target, **kwargs): | |
indexer, _ = self.get_indexer_non_unique(target, **kwargs) | ||
return indexer | ||
|
||
_index_shared_docs['_get_values_from_dict'] = """ | ||
Return the values of the input dictionary in the order the keys are | ||
in the index. np.nan is returned for index values not in the | ||
dictionary. | ||
|
||
Parameters | ||
---------- | ||
data : dict | ||
The dictionary from which to extract the values | ||
|
||
Returns | ||
------- | ||
np.array | ||
|
||
""" | ||
|
||
@Appender(_index_shared_docs['_get_values_from_dict']) | ||
def _get_values_from_dict(self, data): | ||
return lib.fast_multiget(data, self.values, | ||
default=np.nan) | ||
|
||
def _maybe_promote(self, other): | ||
# A hack, but it works | ||
from pandas.core.indexes.datetimes import DatetimeIndex | ||
|
@@ -2859,13 +2879,16 @@ def groupby(self, values): | |
|
||
return result | ||
|
||
def map(self, mapper): | ||
"""Apply mapper function to an index. | ||
def map(self, mapper, na_action=None): | ||
"""Map values of Series using input correspondence (which can be a | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The first sentence should be a single line so that it looks OK in the autosummary table. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
||
dict, Series, or function) | ||
|
||
Parameters | ||
---------- | ||
mapper : callable | ||
Function to be applied. | ||
mapper : function, dict, or Series | ||
na_action : {None, 'ignore'} | ||
If 'ignore', propagate NA values, without passing them to the | ||
mapping function | ||
|
||
Returns | ||
------- | ||
|
@@ -2875,15 +2898,23 @@ def map(self, mapper): | |
a MultiIndex will be returned. | ||
|
||
""" | ||
|
||
from .multi import MultiIndex | ||
mapped_values = self._arrmap(self.values, mapper) | ||
new_values = super(Index, self)._map_values( | ||
mapper, na_action=na_action) | ||
attributes = self._get_attributes_dict() | ||
if mapped_values.size and isinstance(mapped_values[0], tuple): | ||
return MultiIndex.from_tuples(mapped_values, | ||
names=attributes.get('name')) | ||
if new_values.size and isinstance(new_values[0], tuple): | ||
if isinstance(self, MultiIndex): | ||
names = self.names | ||
elif attributes.get('name'): | ||
names = [attributes.get('name')] * len(new_values[0]) | ||
else: | ||
names = None | ||
return MultiIndex.from_tuples(new_values, | ||
names=names) | ||
|
||
attributes['copy'] = False | ||
return Index(mapped_values, **attributes) | ||
return Index(new_values, **attributes) | ||
|
||
def isin(self, values, level=None): | ||
""" | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -352,7 +352,7 @@ def map(self, f): | |
|
||
# Try to use this result if we can | ||
if isinstance(result, np.ndarray): | ||
self._shallow_copy(result) | ||
result = Index(result) | ||
|
||
if not isinstance(result, Index): | ||
raise TypeError('The map function must return an Index object') | ||
|
@@ -698,6 +698,14 @@ def __rsub__(self, other): | |
def _add_delta(self, other): | ||
return NotImplemented | ||
|
||
@Appender(_index_shared_docs['_get_values_from_dict']) | ||
def _get_values_from_dict(self, data): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this seems odd that you are returning np.nan for a datetimelike. do tests hit this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I would assume it does get hit indirectly as it was essentially pulled into the subclasses from the pd.Series instantiation. It is essentially replicating the behavior that was previously in lines 188-203 of pandas/core/series.py and allowing each object to handle it as seen fit rather than doing it in an if conditional. Personally this way makes more sense to me, but since I ended up not making use of this functionality outside of the Series instantiation I'm happy to revert the changes if you'd prefer. |
||
if len(data): | ||
return np.array([data.get(i, np.nan) | ||
for i in self.asobject.values]) | ||
|
||
return np.array([np.nan]) | ||
|
||
def _add_delta_td(self, other): | ||
# add a delta of a timedeltalike | ||
# return the i8 result view | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1451,6 +1451,17 @@ def get_value_maybe_box(self, series, key): | |
key, tz=self.tz) | ||
return _maybe_box(self, values, series, key) | ||
|
||
@Appender(_index_shared_docs['_get_values_from_dict']) | ||
def _get_values_from_dict(self, data): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. if this is needed for datetimelike then push this to the super class rather than having 2 impls. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. See the above comment. |
||
if len(data): | ||
# coerce back to datetime objects for lookup | ||
data = com._dict_compat(data) | ||
return lib.fast_multiget(data, | ||
self.asobject.values, | ||
default=np.nan) | ||
|
||
return np.array([np.nan]) | ||
|
||
def get_loc(self, key, method=None, tolerance=None): | ||
""" | ||
Get integer location for requested label | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,7 +44,6 @@ | |
_maybe_match_name, | ||
SettingWithCopyError, | ||
_maybe_box_datetimelike, | ||
_dict_compat, | ||
standardize_mapping, | ||
_any_none) | ||
from pandas.core.index import (Index, MultiIndex, InvalidIndexError, | ||
|
@@ -202,23 +201,9 @@ def __init__(self, data=None, index=None, dtype=None, name=None, | |
index = Index(data) | ||
else: | ||
index = Index(_try_sort(data)) | ||
|
||
try: | ||
if isinstance(index, DatetimeIndex): | ||
if len(data): | ||
# coerce back to datetime objects for lookup | ||
data = _dict_compat(data) | ||
data = lib.fast_multiget(data, | ||
index.asobject.values, | ||
default=np.nan) | ||
else: | ||
data = np.nan | ||
# GH #12169 | ||
elif isinstance(index, (PeriodIndex, TimedeltaIndex)): | ||
data = ([data.get(i, np.nan) for i in index] | ||
if data else np.nan) | ||
else: | ||
data = lib.fast_multiget(data, index.values, | ||
default=np.nan) | ||
data = index._get_values_from_dict(data) | ||
except TypeError: | ||
data = ([data.get(i, np.nan) for i in index] | ||
if data else np.nan) | ||
|
@@ -2245,14 +2230,14 @@ def unstack(self, level=-1, fill_value=None): | |
# ---------------------------------------------------------------------- | ||
# function application | ||
|
||
def map(self, arg, na_action=None): | ||
def map(self, mapper, na_action=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add a release note for this in API-breaking changes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. reverted (was an error) |
||
""" | ||
Map values of Series using input correspondence (which can be | ||
a dict, Series, or function) | ||
|
||
Parameters | ||
---------- | ||
arg : function, dict, or Series | ||
mapper : function, dict, or Series | ||
na_action : {None, 'ignore'} | ||
If 'ignore', propagate NA values, without passing them to the | ||
mapping function | ||
|
@@ -2285,7 +2270,7 @@ def map(self, arg, na_action=None): | |
two bar | ||
three baz | ||
|
||
If `arg` is a dictionary, return a new Series with values converted | ||
If `mapper` is a dictionary, return a new Series with values converted | ||
according to the dictionary's mapping: | ||
|
||
>>> z = {1: 'A', 2: 'B', 3: 'C'} | ||
|
@@ -2322,7 +2307,7 @@ def map(self, arg, na_action=None): | |
|
||
Notes | ||
----- | ||
When `arg` is a dictionary, values in Series that are not in the | ||
When `mapper` is a dictionary, values in Series that are not in the | ||
dictionary (as keys) are converted to ``NaN``. However, if the | ||
dictionary is a ``dict`` subclass that defines ``__missing__`` (i.e. | ||
provides a method for default values), then this default is used | ||
|
@@ -2337,41 +2322,8 @@ def map(self, arg, na_action=None): | |
3 0 | ||
dtype: int64 | ||
""" | ||
|
||
if is_extension_type(self.dtype): | ||
values = self._values | ||
if na_action is not None: | ||
raise NotImplementedError | ||
map_f = lambda values, f: values.map(f) | ||
else: | ||
values = self.asobject | ||
|
||
if na_action == 'ignore': | ||
def map_f(values, f): | ||
return lib.map_infer_mask(values, f, | ||
isna(values).view(np.uint8)) | ||
else: | ||
map_f = lib.map_infer | ||
|
||
if isinstance(arg, dict): | ||
if hasattr(arg, '__missing__'): | ||
# If a dictionary subclass defines a default value method, | ||
# convert arg to a lookup function (GH #15999). | ||
dict_with_default = arg | ||
arg = lambda x: dict_with_default[x] | ||
else: | ||
# Dictionary does not have a default. Thus it's safe to | ||
# convert to an indexed series for efficiency. | ||
arg = self._constructor(arg, index=arg.keys()) | ||
|
||
if isinstance(arg, Series): | ||
# arg is a Series | ||
indexer = arg.index.get_indexer(values) | ||
new_values = algorithms.take_1d(arg._values, indexer) | ||
else: | ||
# arg is a function | ||
new_values = map_f(values, arg) | ||
|
||
new_values = super(Series, self)._map_values( | ||
mapper, na_action=na_action) | ||
return self._constructor(new_values, | ||
index=self.index).__finalize__(self) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is passing
index=mapper.keys()
necessary? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nope
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
actually this is to handle some odd-ball cases where you have tuples as keys in the dictionary