Skip to content

Commit ecc6ead

Browse files
authored
REF: Add ExtensionArray.map (#51809)
* REF: Add ExtensionArray.map
* add gh number
* add tests
* update after comments
* add back pylint
1 parent b7a6133 commit ecc6ead

File tree

13 files changed

+169
-89
lines changed

13 files changed

+169
-89
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ Other enhancements
3232
- :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`)
3333
- Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)
3434
- Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`)
35+
- :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`).
3536

3637
.. ---------------------------------------------------------------------------
3738
.. _whatsnew_210.notable_bug_fixes:

pandas/core/algorithms.py

+73
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@
4646
is_categorical_dtype,
4747
is_complex_dtype,
4848
is_datetime64_dtype,
49+
is_dict_like,
4950
is_extension_array_dtype,
5051
is_float_dtype,
5152
is_integer,
@@ -1678,3 +1679,75 @@ def union_with_duplicates(
16781679
unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
16791680
repeats = final_count.reindex(unique_vals).values
16801681
return np.repeat(unique_vals, repeats)
1682+
1683+
1684+
def map_array(
1685+
arr: ArrayLike, mapper, na_action: Literal["ignore"] | None = None
1686+
) -> np.ndarray | ExtensionArray | Index:
1687+
"""
1688+
Map values using an input mapping or function.
1689+
1690+
Parameters
1691+
----------
1692+
mapper : function, dict, or Series
1693+
Mapping correspondence.
1694+
na_action : {None, 'ignore'}, default None
1695+
If 'ignore', propagate NA values, without passing them to the
1696+
mapping correspondence.
1697+
1698+
Returns
1699+
-------
1700+
Union[ndarray, Index, ExtensionArray]
1701+
The output of the mapping function applied to the array.
1702+
If the function returns a tuple with more than one element
1703+
a MultiIndex will be returned.
1704+
"""
1705+
if na_action not in (None, "ignore"):
1706+
msg = f"na_action must either be 'ignore' or None, {na_action} was passed"
1707+
raise ValueError(msg)
1708+
1709+
# we can fastpath dict/Series to an efficient map
1710+
# as we know that we are not going to have to yield
1711+
# python types
1712+
if is_dict_like(mapper):
1713+
if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
1714+
# If a dictionary subclass defines a default value method,
1715+
# convert mapper to a lookup function (GH #15999).
1716+
dict_with_default = mapper
1717+
mapper = lambda x: dict_with_default[
1718+
np.nan if isinstance(x, float) and np.isnan(x) else x
1719+
]
1720+
else:
1721+
# Dictionary does not have a default. Thus it's safe to
1722+
convert to a Series for efficiency.
1723+
# we specify the keys here to handle the
1724+
# possibility that they are tuples
1725+
1726+
# The return value of mapping with an empty mapper is
1727+
# expected to be pd.Series(np.nan, ...). As np.nan is
1728+
# of dtype float64 the return value of this method should
1729+
# be float64 as well
1730+
from pandas import Series
1731+
1732+
if len(mapper) == 0:
1733+
mapper = Series(mapper, dtype=np.float64)
1734+
else:
1735+
mapper = Series(mapper)
1736+
1737+
if isinstance(mapper, ABCSeries):
1738+
if na_action == "ignore":
1739+
mapper = mapper[mapper.index.notna()]
1740+
1741+
# mapper is a Series at this point (passed in directly or built from a
1742+
# dict above), so each value of arr is looked up in mapper's index
1743+
indexer = mapper.index.get_indexer(arr)
1744+
new_values = take_nd(mapper._values, indexer)
1745+
1746+
return new_values
1747+
1748+
# we must convert to python types
1749+
values = arr.astype(object, copy=False)
1750+
if na_action is None:
1751+
return lib.map_infer(values, mapper)
1752+
else:
1753+
return lib.map_infer_mask(values, mapper, isna(values).view(np.uint8))

pandas/core/apply.py

+1-2
Original file line numberDiff line numberDiff line change
@@ -1068,8 +1068,7 @@ def apply_standard(self) -> DataFrame | Series:
10681068
return f(obj)
10691069

10701070
# row-wise access
1071-
if is_extension_array_dtype(obj.dtype) and hasattr(obj._values, "map"):
1072-
# GH#23179 some EAs do not have `map`
1071+
if is_extension_array_dtype(obj.dtype):
10731072
mapped = obj._values.map(f)
10741073
else:
10751074
values = obj.astype(object)._values

pandas/core/arrays/base.py

+24
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
from pandas.core.algorithms import (
6565
factorize_array,
6666
isin,
67+
map_array,
6768
mode,
6869
rank,
6970
unique,
@@ -178,6 +179,7 @@ class ExtensionArray:
178179
* factorize / _values_for_factorize
179180
* argsort, argmax, argmin / _values_for_argsort
180181
* searchsorted
182+
* map
181183
182184
The remaining methods implemented on this class should be performant,
183185
as they only compose abstract methods. Still, a more efficient
@@ -1704,6 +1706,28 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
17041706

17051707
return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs)
17061708

1709+
def map(self, mapper, na_action=None):
1710+
"""
1711+
Map values using an input mapping or function.
1712+
1713+
Parameters
1714+
----------
1715+
mapper : function, dict, or Series
1716+
Mapping correspondence.
1717+
na_action : {None, 'ignore'}, default None
1718+
If 'ignore', propagate NA values, without passing them to the
1719+
mapping correspondence. If 'ignore' is not supported, a
1720+
``NotImplementedError`` should be raised.
1721+
1722+
Returns
1723+
-------
1724+
Union[ndarray, Index, ExtensionArray]
1725+
The output of the mapping function applied to the array.
1726+
If the function returns a tuple with more than one element
1727+
a MultiIndex will be returned.
1728+
"""
1729+
return map_array(self, mapper, na_action=na_action)
1730+
17071731

17081732
class ExtensionArraySupportsAnyAll(ExtensionArray):
17091733
def any(self, *, skipna: bool = True) -> bool:

pandas/core/arrays/categorical.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -1199,7 +1199,7 @@ def remove_unused_categories(self) -> Categorical:
11991199

12001200
# ------------------------------------------------------------------
12011201

1202-
def map(self, mapper):
1202+
def map(self, mapper, na_action=None):
12031203
"""
12041204
Map categories using an input mapping or function.
12051205
@@ -1268,6 +1268,9 @@ def map(self, mapper):
12681268
>>> cat.map({'a': 'first', 'b': 'second'})
12691269
Index(['first', 'second', nan], dtype='object')
12701270
"""
1271+
if na_action is not None:
1272+
raise NotImplementedError
1273+
12711274
new_categories = self.categories.map(mapper)
12721275
try:
12731276
return self.from_codes(

pandas/core/arrays/datetimelike.py

+4-1
Original file line numberDiff line numberDiff line change
@@ -750,7 +750,10 @@ def _unbox(self, other) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarra
750750
# pandas assumes they're there.
751751

752752
@ravel_compat
753-
def map(self, mapper):
753+
def map(self, mapper, na_action=None):
754+
if na_action is not None:
755+
raise NotImplementedError
756+
754757
# TODO(GH-23179): Add ExtensionArray.map
755758
# Need to figure out if we want ExtensionArray.map first.
756759
# If so, then we can refactor IndexOpsMixin._map_values to

pandas/core/arrays/sparse/array.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -1271,14 +1271,17 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
12711271

12721272
return self._simple_new(sp_values, self.sp_index, dtype)
12731273

1274-
def map(self: SparseArrayT, mapper) -> SparseArrayT:
1274+
def map(self: SparseArrayT, mapper, na_action=None) -> SparseArrayT:
12751275
"""
12761276
Map values using an input mapping or function.
12771277
12781278
Parameters
12791279
----------
12801280
mapper : dict, Series, callable
12811281
The correspondence from old values to new.
1282+
na_action : {None, 'ignore'}, default None
1283+
If 'ignore', propagate NA values, without passing them to the
1284+
mapping correspondence.
12821285
12831286
Returns
12841287
-------
@@ -1308,6 +1311,9 @@ def map(self: SparseArrayT, mapper) -> SparseArrayT:
13081311
IntIndex
13091312
Indices: array([1, 2], dtype=int32)
13101313
"""
1314+
if na_action is not None:
1315+
raise NotImplementedError
1316+
13111317
# this is used in apply.
13121318
# We get hit since we're an "is_extension_array_dtype" but regular extension
13131319
# types are not hit. This may be worth adding to the interface.

pandas/core/base.py

+13-84
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,6 @@
4040

4141
from pandas.core.dtypes.cast import can_hold_element
4242
from pandas.core.dtypes.common import (
43-
is_categorical_dtype,
44-
is_dict_like,
4543
is_extension_array_dtype,
4644
is_object_dtype,
4745
is_scalar,
@@ -78,7 +76,6 @@
7876
)
7977

8078
from pandas import (
81-
Categorical,
8279
Index,
8380
Series,
8481
)
@@ -882,87 +879,19 @@ def _map_values(self, mapper, na_action=None):
882879
If the function returns a tuple with more than one element
883880
a MultiIndex will be returned.
884881
"""
885-
# we can fastpath dict/Series to an efficient map
886-
# as we know that we are not going to have to yield
887-
# python types
888-
if is_dict_like(mapper):
889-
if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
890-
# If a dictionary subclass defines a default value method,
891-
# convert mapper to a lookup function (GH #15999).
892-
dict_with_default = mapper
893-
mapper = lambda x: dict_with_default[
894-
np.nan if isinstance(x, float) and np.isnan(x) else x
895-
]
896-
else:
897-
# Dictionary does not have a default. Thus it's safe to
898-
# convert to an Series for efficiency.
899-
# we specify the keys here to handle the
900-
# possibility that they are tuples
901-
902-
# The return value of mapping with an empty mapper is
903-
# expected to be pd.Series(np.nan, ...). As np.nan is
904-
# of dtype float64 the return value of this method should
905-
# be float64 as well
906-
from pandas import Series
907-
908-
if len(mapper) == 0:
909-
mapper = Series(mapper, dtype=np.float64)
910-
else:
911-
mapper = Series(mapper)
912-
913-
if isinstance(mapper, ABCSeries):
914-
if na_action not in (None, "ignore"):
915-
msg = (
916-
"na_action must either be 'ignore' or None, "
917-
f"{na_action} was passed"
918-
)
919-
raise ValueError(msg)
920-
921-
if na_action == "ignore":
922-
mapper = mapper[mapper.index.notna()]
923-
924-
# Since values were input this means we came from either
925-
# a dict or a series and mapper should be an index
926-
if is_categorical_dtype(self.dtype):
927-
# use the built in categorical series mapper which saves
928-
# time by mapping the categories instead of all values
929-
930-
cat = cast("Categorical", self._values)
931-
return cat.map(mapper)
932-
933-
values = self._values
934-
935-
indexer = mapper.index.get_indexer(values)
936-
new_values = algorithms.take_nd(mapper._values, indexer)
937-
938-
return new_values
939-
940-
# we must convert to python types
941-
if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
942-
# GH#23179 some EAs do not have `map`
943-
values = self._values
944-
if na_action is not None:
945-
raise NotImplementedError
946-
map_f = lambda values, f: values.map(f)
947-
else:
948-
values = self._values.astype(object)
949-
if na_action == "ignore":
950-
map_f = lambda values, f: lib.map_infer_mask(
951-
values, f, isna(values).view(np.uint8)
952-
)
953-
elif na_action is None:
954-
map_f = lib.map_infer
955-
else:
956-
msg = (
957-
"na_action must either be 'ignore' or None, "
958-
f"{na_action} was passed"
959-
)
960-
raise ValueError(msg)
961-
962-
# mapper is a function
963-
new_values = map_f(values, mapper)
964-
965-
return new_values
882+
arr = extract_array(self, extract_numpy=True, extract_range=True)
883+
884+
if is_extension_array_dtype(arr.dtype):
885+
# Item "IndexOpsMixin" of "Union[IndexOpsMixin, ExtensionArray,
886+
# ndarray[Any, Any]]" has no attribute "map"
887+
return arr.map(mapper, na_action=na_action) # type: ignore[union-attr]
888+
889+
# Argument 1 to "map_array" has incompatible type
890+
# "Union[IndexOpsMixin, ExtensionArray, ndarray[Any, Any]]";
891+
# expected "Union[ExtensionArray, ndarray[Any, Any]]"
892+
return algorithms.map_array(
893+
arr, mapper, na_action=na_action # type: ignore[arg-type]
894+
)
966895

967896
@final
968897
def value_counts(

pandas/tests/extension/base/methods.py

+6
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,12 @@ def test_apply_simple_series(self, data):
8888
result = pd.Series(data).apply(id)
8989
assert isinstance(result, pd.Series)
9090

91+
@pytest.mark.parametrize("na_action", [None, "ignore"])
92+
def test_map(self, data, na_action):
93+
result = data.map(lambda x: x, na_action=na_action)
94+
expected = data.to_numpy()
95+
tm.assert_numpy_array_equal(result, expected)
96+
9197
def test_argsort(self, data_for_sorting):
9298
result = pd.Series(data_for_sorting).argsort()
9399
# argsort result gets passed to take, so should be np.intp

pandas/tests/extension/test_categorical.py

+9
Original file line numberDiff line numberDiff line change
@@ -184,6 +184,15 @@ def test_combine_add(self, data_repeated):
184184
expected = pd.Series([a + val for a in list(orig_data1)])
185185
self.assert_series_equal(result, expected)
186186

187+
@pytest.mark.parametrize("na_action", [None, "ignore"])
188+
def test_map(self, data, na_action):
189+
if na_action is not None:
190+
with pytest.raises(NotImplementedError, match=""):
191+
data.map(lambda x: x, na_action=na_action)
192+
else:
193+
result = data.map(lambda x: x, na_action=na_action)
194+
self.assert_extension_array_equal(result, data)
195+
187196

188197
class TestCasting(base.BaseCastingTests):
189198
@pytest.mark.parametrize("cls", [Categorical, CategoricalIndex])

pandas/tests/extension/test_datetime.py

+9
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,15 @@ def test_combine_add(self, data_repeated):
116116
# Timestamp.__add__(Timestamp) not defined
117117
pass
118118

119+
@pytest.mark.parametrize("na_action", [None, "ignore"])
120+
def test_map(self, data, na_action):
121+
if na_action is not None:
122+
with pytest.raises(NotImplementedError, match=""):
123+
data.map(lambda x: x, na_action=na_action)
124+
else:
125+
result = data.map(lambda x: x, na_action=na_action)
126+
self.assert_extension_array_equal(result, data)
127+
119128

120129
class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests):
121130
pass

pandas/tests/extension/test_period.py

+9
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,15 @@ def test_diff(self, data, periods):
105105
else:
106106
super().test_diff(data, periods)
107107

108+
@pytest.mark.parametrize("na_action", [None, "ignore"])
109+
def test_map(self, data, na_action):
110+
if na_action is not None:
111+
with pytest.raises(NotImplementedError, match=""):
112+
data.map(lambda x: x, na_action=na_action)
113+
else:
114+
result = data.map(lambda x: x, na_action=na_action)
115+
self.assert_extension_array_equal(result, data)
116+
108117

109118
class TestInterface(BasePeriodTests, base.BaseInterfaceTests):
110119
pass

0 commit comments

Comments
 (0)