Skip to content

Commit 6b9bc30

Browse files
committed
ENH: add na_action to Categorical.map
1 parent c71645f commit 6b9bc30

File tree

8 files changed

+236
-42
lines changed

8 files changed

+236
-42
lines changed

doc/source/whatsnew/v2.1.0.rst

+2-1
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,7 @@ enhancement2
2929
Other enhancements
3030
^^^^^^^^^^^^^^^^^^
3131
- Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)
32+
- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter (:issue:``)
3233
-
3334
3435
.. ---------------------------------------------------------------------------
@@ -118,7 +119,7 @@ Bug fixes
118119

119120
Categorical
120121
^^^^^^^^^^^
121-
-
122+
- Bug in :meth:`Series.map`, where the value of the ``na_action`` parameter was not used if the series held a :class:`Categorical` (:issue:``).
122123
-
123124
124125
Datetimelike

pandas/core/arrays/categorical.py

+19-10
Original file line numberDiff line numberDiff line change
@@ -1198,7 +1198,7 @@ def remove_unused_categories(self) -> Categorical:
11981198

11991199
# ------------------------------------------------------------------
12001200

1201-
def map(self, mapper):
1201+
def map(self, mapper, na_action: Literal["ignore"] | None = "ignore"):
12021202
"""
12031203
Map categories using an input mapping or function.
12041204
@@ -1215,6 +1215,9 @@ def map(self, mapper):
12151215
----------
12161216
mapper : function, dict, or Series
12171217
Mapping correspondence.
1218+
na_action : {None, 'ignore'}, default 'ignore'
1219+
If 'ignore', propagate NaN values, without passing them to the
1220+
mapping correspondence.
12181221
12191222
Returns
12201223
-------
@@ -1267,17 +1270,23 @@ def map(self, mapper):
12671270
>>> cat.map({'a': 'first', 'b': 'second'})
12681271
Index(['first', 'second', nan], dtype='object')
12691272
"""
1273+
assert callable(mapper) or is_dict_like(mapper)
1274+
12701275
new_categories = self.categories.map(mapper)
1271-
try:
1272-
return self.from_codes(
1273-
self._codes.copy(), categories=new_categories, ordered=self.ordered
1274-
)
1275-
except ValueError:
1276-
# NA values are represented in self._codes with -1
1277-
# np.take causes NA values to take final element in new_categories
1278-
if np.any(self._codes == -1):
1279-
new_categories = new_categories.insert(len(new_categories), np.nan)
1276+
1277+
not_dictlike_and_no_nans = not (is_dict_like(mapper) and np.nan not in mapper)
1278+
1279+
if na_action is None and not_dictlike_and_no_nans and np.any(self._codes == -1):
1280+
na_value = mapper(np.nan) if callable(mapper) else mapper[np.nan]
1281+
new_categories = new_categories.insert(len(new_categories), na_value)
12801282
return np.take(new_categories, self._codes)
1283+
elif new_categories.is_unique and not new_categories.hasnans:
1284+
new_dtype = CategoricalDtype(new_categories, ordered=self.ordered)
1285+
return self.from_codes(self._codes.copy(), dtype=new_dtype)
1286+
1287+
if np.any(self._codes == -1):
1288+
new_categories = new_categories.insert(len(new_categories), np.nan)
1289+
return np.take(new_categories, self._codes)
12811290

12821291
__eq__ = _cat_compare_op(operator.eq)
12831292
__ne__ = _cat_compare_op(operator.ne)

pandas/core/base.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -937,14 +937,17 @@ def _map_values(self, mapper, na_action=None):
937937

938938
return new_values
939939

940-
# we must convert to python types
941-
if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
940+
if is_categorical_dtype(self.dtype):
941+
values = self._values
942+
map_f = lambda values, f: values.map(f, na_action=na_action)
943+
elif is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
942944
# GH#23179 some EAs do not have `map`
943945
values = self._values
944946
if na_action is not None:
945947
raise NotImplementedError
946948
map_f = lambda values, f: values.map(f)
947949
else:
950+
# we must convert to python types
948951
values = self._values.astype(object)
949952
if na_action == "ignore":
950953
map_f = lambda values, f: lib.map_infer_mask(

pandas/core/indexes/category.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
TYPE_CHECKING,
55
Any,
66
Hashable,
7+
Literal,
78
)
89

910
import numpy as np
@@ -402,7 +403,7 @@ def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:
402403
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
403404
return self.categories._is_comparable_dtype(dtype)
404405

405-
def map(self, mapper):
406+
def map(self, mapper, na_action: Literal["ignore"] | None = None):
406407
"""
407408
Map values using an input mapping or function.
408409
@@ -469,7 +470,7 @@ def map(self, mapper):
469470
>>> idx.map({'a': 'first', 'b': 'second'})
470471
Index(['first', 'second', nan], dtype='object')
471472
"""
472-
mapped = self._values.map(mapper)
473+
mapped = self._values.map(mapper, na_action=na_action)
473474
return Index(mapped, name=self.name)
474475

475476
def _concat(self, to_concat: list[Index], name: Hashable) -> Index:

pandas/tests/apply/test_invalid_arg.py

-8
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
from pandas.errors import SpecificationError
1616

1717
from pandas import (
18-
Categorical,
1918
DataFrame,
2019
Series,
2120
date_range,
@@ -76,13 +75,6 @@ def test_map_arg_is_dict_with_invalid_na_action_raises(input_na_action):
7675
s.map({1: 2}, na_action=input_na_action)
7776

7877

79-
def test_map_categorical_na_action():
80-
values = Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
81-
s = Series(values, name="XX", index=list("abcdefg"))
82-
with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN):
83-
s.map(lambda x: x, na_action="ignore")
84-
85-
8678
def test_map_datetimetz_na_action():
8779
values = date_range("2011-01-01", "2011-01-02", freq="H").tz_localize("Asia/Tokyo")
8880
s = Series(values, name="XX")

pandas/tests/apply/test_series_apply.py

+26-3
Original file line numberDiff line numberDiff line change
@@ -748,22 +748,45 @@ def test_map_box():
748748
tm.assert_series_equal(res, exp)
749749

750750

751-
def test_map_categorical():
751+
@pytest.mark.parametrize("na_action", [None, "ignore"])
752+
def test_map_categorical(na_action):
752753
values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
753754
s = Series(values, name="XX", index=list("abcdefg"))
754755

755-
result = s.map(lambda x: x.lower())
756+
result = s.map(lambda x: x.lower(), na_action=na_action)
756757
exp_values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True)
757758
exp = Series(exp_values, name="XX", index=list("abcdefg"))
758759
tm.assert_series_equal(result, exp)
759760
tm.assert_categorical_equal(result.values, exp_values)
760761

761-
result = s.map(lambda x: "A")
762+
result = s.map(lambda x: "A", na_action=na_action)
762763
exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
763764
tm.assert_series_equal(result, exp)
764765
assert result.dtype == object
765766

766767

768+
@pytest.mark.parametrize(
769+
"na_action, expected",
770+
(
771+
[None, Series(["A", "B", "nan"], name="XX")],
772+
[
773+
"ignore",
774+
Series(
775+
["A", "B", np.nan],
776+
name="XX",
777+
dtype=pd.CategoricalDtype(list("DCBA"), True),
778+
),
779+
],
780+
),
781+
)
782+
def test_map_categorical_na_action(na_action, expected):
783+
dtype = pd.CategoricalDtype(list("DCBA"), ordered=True)
784+
values = pd.Categorical(list("AB") + [np.nan], dtype=dtype)
785+
s = Series(values, name="XX")
786+
result = s.map(str, na_action=na_action)
787+
tm.assert_series_equal(result, expected)
788+
789+
767790
def test_map_datetimetz():
768791
values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize(
769792
"Asia/Tokyo"
+138
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,138 @@
1+
import numpy as np
2+
import pytest
3+
4+
import pandas as pd
5+
from pandas import (
6+
Categorical,
7+
Index,
8+
Series,
9+
)
10+
import pandas._testing as tm
11+
12+
13+
@pytest.fixture(params=[None, "ignore"])
14+
def na_action(request):
15+
return request.param
16+
17+
18+
class TestMap:
19+
@pytest.mark.parametrize(
20+
"data, categories",
21+
[
22+
(list("abcbca"), list("cab")),
23+
(pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)),
24+
],
25+
ids=["string", "interval"],
26+
)
27+
def test_map_str(self, data, categories, ordered, na_action):
28+
# GH 31202 - override base class since we want to maintain categorical/ordered
29+
cat = Categorical(data, categories=categories, ordered=ordered)
30+
result = cat.map(str, na_action=na_action)
31+
expected = Categorical(
32+
map(str, data), categories=map(str, categories), ordered=ordered
33+
)
34+
tm.assert_categorical_equal(result, expected)
35+
36+
def test_map(self, na_action):
37+
cat = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
38+
result = cat.map(lambda x: x.lower(), na_action=na_action)
39+
exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
40+
tm.assert_categorical_equal(result, exp)
41+
42+
cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False)
43+
result = cat.map(lambda x: x.lower(), na_action=na_action)
44+
exp = Categorical(list("ababc"), categories=list("bac"), ordered=False)
45+
tm.assert_categorical_equal(result, exp)
46+
47+
# GH 12766: Return an index not an array
48+
result = cat.map(lambda x: 1, na_action=na_action)
49+
exp = Index(np.array([1] * 5, dtype=np.int64))
50+
tm.assert_index_equal(result, exp)
51+
52+
# change categories dtype
53+
cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False)
54+
55+
def f(x):
56+
return {"A": 10, "B": 20, "C": 30}.get(x)
57+
58+
result = cat.map(f, na_action=na_action)
59+
exp = Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False)
60+
tm.assert_categorical_equal(result, exp)
61+
62+
mapper = Series([10, 20, 30], index=["A", "B", "C"])
63+
result = cat.map(mapper, na_action=na_action)
64+
tm.assert_categorical_equal(result, exp)
65+
66+
result = cat.map({"A": 10, "B": 20, "C": 30}, na_action=na_action)
67+
tm.assert_categorical_equal(result, exp)
68+
69+
@pytest.mark.parametrize(
70+
("data", "f", "expected"),
71+
(
72+
([1, 1, np.nan], pd.isna, Index([False, False, True])),
73+
([1, 2, np.nan], pd.isna, Index([False, False, True])),
74+
([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])),
75+
([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
76+
(
77+
[1, 1, np.nan],
78+
Series([False, False]),
79+
Categorical([False, False, np.nan]),
80+
),
81+
(
82+
[1, 2, np.nan],
83+
Series([False] * 3),
84+
Index([False, False, np.nan]),
85+
),
86+
),
87+
)
88+
def test_map_with_nan_none(self, data, f, expected): # GH 24241
89+
values = Categorical(data)
90+
result = values.map(f, na_action=None)
91+
if isinstance(expected, Categorical):
92+
tm.assert_categorical_equal(result, expected)
93+
else:
94+
tm.assert_index_equal(result, expected)
95+
96+
@pytest.mark.parametrize(
97+
("data", "f", "expected"),
98+
(
99+
([1, 1, np.nan], pd.isna, Categorical([False, False, np.nan])),
100+
([1, 2, np.nan], pd.isna, Index([False, False, np.nan])),
101+
([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])),
102+
([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
103+
(
104+
[1, 1, np.nan],
105+
Series([False, False]),
106+
Categorical([False, False, np.nan]),
107+
),
108+
(
109+
[1, 2, np.nan],
110+
Series([False, False, False]),
111+
Index([False, False, np.nan]),
112+
),
113+
),
114+
)
115+
def test_map_with_nan_ignore(self, data, f, expected): # GH 24241
116+
values = Categorical(data)
117+
result = values.map(f, na_action="ignore")
118+
if data[1] == 1:
119+
tm.assert_categorical_equal(result, expected)
120+
else:
121+
tm.assert_index_equal(result, expected)
122+
123+
def test_map_with_dict_or_series(self, na_action):
124+
orig_values = ["a", "B", 1, "a"]
125+
new_values = ["one", 2, 3.0, "one"]
126+
cat = Categorical(orig_values)
127+
128+
mapper = Series(new_values[:-1], index=orig_values[:-1])
129+
result = cat.map(mapper, na_action=na_action)
130+
131+
# Order of categories in result can be different
132+
expected = Categorical(new_values, categories=[3.0, 2, "one"])
133+
tm.assert_categorical_equal(result, expected)
134+
135+
mapper = dict(zip(orig_values[:-1], new_values[:-1]))
136+
result = cat.map(mapper, na_action=na_action)
137+
# Order of categories in result can be different
138+
tm.assert_categorical_equal(result, expected)

pandas/tests/indexes/categorical/test_map.py

+43-16
Original file line numberDiff line numberDiff line change
@@ -78,25 +78,52 @@ def test_map_with_categorical_series(self):
7878
tm.assert_index_equal(a.map(c), exp)
7979

8080
@pytest.mark.parametrize(
81-
("data", "f"),
81+
("data", "f", "expected"),
8282
(
83-
([1, 1, np.nan], pd.isna),
84-
([1, 2, np.nan], pd.isna),
85-
([1, 1, np.nan], {1: False}),
86-
([1, 2, np.nan], {1: False, 2: False}),
87-
([1, 1, np.nan], Series([False, False])),
88-
([1, 2, np.nan], Series([False, False, False])),
83+
([1, 1, np.nan], pd.isna, CategoricalIndex([False, False, np.nan])),
84+
([1, 2, np.nan], pd.isna, Index([False, False, np.nan])),
85+
([1, 1, np.nan], {1: False}, CategoricalIndex([False, False, np.nan])),
86+
([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
87+
(
88+
[1, 1, np.nan],
89+
Series([False, False]),
90+
CategoricalIndex([False, False, np.nan]),
91+
),
92+
(
93+
[1, 2, np.nan],
94+
Series([False, False, False]),
95+
Index([False, False, np.nan]),
96+
),
8997
),
9098
)
91-
def test_map_with_nan(self, data, f): # GH 24241
92-
values = pd.Categorical(data)
93-
result = values.map(f)
94-
if data[1] == 1:
95-
expected = pd.Categorical([False, False, np.nan])
96-
tm.assert_categorical_equal(result, expected)
97-
else:
98-
expected = Index([False, False, np.nan])
99-
tm.assert_index_equal(result, expected)
99+
def test_map_with_nan_ignore(self, data, f, expected): # GH 24241
100+
values = CategoricalIndex(data)
101+
result = values.map(f, na_action="ignore")
102+
tm.assert_index_equal(result, expected)
103+
104+
@pytest.mark.parametrize(
105+
("data", "f", "expected"),
106+
(
107+
([1, 1, np.nan], pd.isna, Index([False, False, True])),
108+
([1, 2, np.nan], pd.isna, Index([False, False, True])),
109+
([1, 1, np.nan], {1: False}, CategoricalIndex([False, False, np.nan])),
110+
([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
111+
(
112+
[1, 1, np.nan],
113+
Series([False, False]),
114+
CategoricalIndex([False, False, np.nan]),
115+
),
116+
(
117+
[1, 2, np.nan],
118+
Series([False, False, False]),
119+
Index([False, False, np.nan]),
120+
),
121+
),
122+
)
123+
def test_map_with_nan_none(self, data, f, expected): # GH 24241
124+
values = CategoricalIndex(data)
125+
result = values.map(f, na_action=None)
126+
tm.assert_index_equal(result, expected)
100127

101128
def test_map_with_dict_or_series(self):
102129
orig_values = ["a", "B", 1, "a"]

0 commit comments

Comments
 (0)