Skip to content

Commit 8cdef8a

Browse files
committed
ENH: add na_action to Categorical.map & CategoricalIndex.map (pandas-dev#51645)
* ENH: add na_action to Categorical.map * add GH numbers * pre-commit issues * map Categorical with Series * REF: simplify .map * pass test_map * fix whatsnew * cleanups * pre-commit * deprecate Categorical.map(na_action=ignore) * fix docstrings * fix rebase * simplify implementation * fix warn * fix comments
1 parent 40c48b7 commit 8cdef8a

File tree

11 files changed

+383
-147
lines changed

11 files changed

+383
-147
lines changed

doc/source/whatsnew/v2.1.0.rst

+6-1
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,9 @@ enhancement2
2828

2929
Other enhancements
3030
^^^^^^^^^^^^^^^^^^
31+
- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter.
32+
:meth:`Categorical.map` implicitly had a default value of ``"ignore"`` for ``na_action``. This has formally been deprecated and will be changed to ``None`` in the future.
33+
Also note that :meth:`Series.map` has default ``na_action=None``, and calling it on a :class:`Series` with categorical data will now use ``na_action=None`` unless explicitly set otherwise (:issue:`44279`)
3134
- Implemented ``__pandas_priority__`` to allow custom types to take precedence over :class:`DataFrame`, :class:`Series`, :class:`Index`, or :class:`ExtensionArray` for arithmetic operations, :ref:`see the developer guide <extending.pandas_priority>` (:issue:`48347`)
3235
- :meth:`MultiIndex.sort_values` now supports ``na_position`` (:issue:`51612`)
3336
- :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`)
@@ -38,7 +41,9 @@ Other enhancements
3841
- Improved error message when creating a DataFrame with empty data (0 rows), no index and an incorrect number of columns. (:issue:`52084`)
3942
- :meth:`DataFrame.applymap` now uses the :meth:`~api.extensions.ExtensionArray.map` method of underlying :class:`api.extensions.ExtensionArray` instances (:issue:`52219`)
4043
- :meth:`arrays.SparseArray.map` now supports ``na_action`` (:issue:`52096`).
44+
- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter (:issue:`44279`)
4145
- Add dtype of categories to ``repr`` information of :class:`CategoricalDtype` (:issue:`52179`)
46+
-
4247

4348
.. ---------------------------------------------------------------------------
4449
.. _whatsnew_210.notable_bug_fixes:
@@ -151,7 +156,7 @@ Bug fixes
151156

152157
Categorical
153158
^^^^^^^^^^^
154-
-
159+
- Bug in :meth:`Series.map`, where the value of the ``na_action`` parameter was not used if the series held a :class:`Categorical` (:issue:`22527`).
155160
-
156161

157162
Datetimelike

pandas/core/apply.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141

4242
from pandas.core.dtypes.cast import is_nested_object
4343
from pandas.core.dtypes.common import (
44+
is_categorical_dtype,
4445
is_dict_like,
4546
is_list_like,
4647
is_sequence,
@@ -1084,7 +1085,12 @@ def apply_standard(self) -> DataFrame | Series:
10841085
return f(obj)
10851086

10861087
# row-wise access
1087-
mapped = obj._map_values(mapper=f, convert=self.convert_dtype)
1088+
# apply doesn't have a `na_action` keyword and for backward compat reasons
1089+
# we need to give `na_action="ignore"` for categorical data.
1090+
# TODO: remove the `na_action="ignore"` when that default has been changed in
1091+
# Categorical (GH51645).
1092+
action = "ignore" if is_categorical_dtype(obj) else None
1093+
mapped = obj._map_values(mapper=f, na_action=action, convert=self.convert_dtype)
10881094

10891095
if len(mapped) and isinstance(mapped[0], ABCSeries):
10901096
warnings.warn(

pandas/core/arrays/categorical.py

+45-18
Original file line numberDiff line numberDiff line change
@@ -1203,7 +1203,11 @@ def remove_unused_categories(self) -> Categorical:
12031203

12041204
# ------------------------------------------------------------------
12051205

1206-
def map(self, mapper, na_action=None):
1206+
def map(
1207+
self,
1208+
mapper,
1209+
na_action: Literal["ignore"] | None | lib.NoDefault = lib.no_default,
1210+
):
12071211
"""
12081212
Map categories using an input mapping or function.
12091213
@@ -1220,6 +1224,14 @@ def map(self, mapper, na_action=None):
12201224
----------
12211225
mapper : function, dict, or Series
12221226
Mapping correspondence.
1227+
na_action : {None, 'ignore'}, default 'ignore'
1228+
If 'ignore', propagate NaN values, without passing them to the
1229+
mapping correspondence.
1230+
1231+
.. deprecated:: 2.1.0
1232+
1233+
The default value of 'ignore' has been deprecated and will be changed to
1234+
None in the future.
12231235
12241236
Returns
12251237
-------
@@ -1243,10 +1255,10 @@ def map(self, mapper, na_action=None):
12431255
>>> cat
12441256
['a', 'b', 'c']
12451257
Categories (3, object): ['a', 'b', 'c']
1246-
>>> cat.map(lambda x: x.upper())
1258+
>>> cat.map(lambda x: x.upper(), na_action=None)
12471259
['A', 'B', 'C']
12481260
Categories (3, object): ['A', 'B', 'C']
1249-
>>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'})
1261+
>>> cat.map({'a': 'first', 'b': 'second', 'c': 'third'}, na_action=None)
12501262
['first', 'second', 'third']
12511263
Categories (3, object): ['first', 'second', 'third']
12521264
@@ -1257,35 +1269,50 @@ def map(self, mapper, na_action=None):
12571269
>>> cat
12581270
['a', 'b', 'c']
12591271
Categories (3, object): ['a' < 'b' < 'c']
1260-
>>> cat.map({'a': 3, 'b': 2, 'c': 1})
1272+
>>> cat.map({'a': 3, 'b': 2, 'c': 1}, na_action=None)
12611273
[3, 2, 1]
12621274
Categories (3, int64): [3 < 2 < 1]
12631275
12641276
If the mapping is not one-to-one an :class:`~pandas.Index` is returned:
12651277
1266-
>>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'})
1278+
>>> cat.map({'a': 'first', 'b': 'second', 'c': 'first'}, na_action=None)
12671279
Index(['first', 'second', 'first'], dtype='object')
12681280
12691281
If a `dict` is used, all unmapped categories are mapped to `NaN` and
12701282
the result is an :class:`~pandas.Index`:
12711283
1272-
>>> cat.map({'a': 'first', 'b': 'second'})
1284+
>>> cat.map({'a': 'first', 'b': 'second'}, na_action=None)
12731285
Index(['first', 'second', nan], dtype='object')
12741286
"""
1275-
if na_action is not None:
1276-
raise NotImplementedError
1287+
if na_action is lib.no_default:
1288+
warnings.warn(
1289+
"The default value of 'ignore' for the `na_action` parameter in "
1290+
"pandas.Categorical.map is deprecated and will be "
1291+
"changed to 'None' in a future version. Please set na_action to the "
1292+
"desired value to avoid seeing this warning",
1293+
FutureWarning,
1294+
stacklevel=find_stack_level(),
1295+
)
1296+
na_action = "ignore"
1297+
1298+
assert callable(mapper) or is_dict_like(mapper)
12771299

12781300
new_categories = self.categories.map(mapper)
1279-
try:
1280-
return self.from_codes(
1281-
self._codes.copy(), categories=new_categories, ordered=self.ordered
1282-
)
1283-
except ValueError:
1284-
# NA values are represented in self._codes with -1
1285-
# np.take causes NA values to take final element in new_categories
1286-
if np.any(self._codes == -1):
1287-
new_categories = new_categories.insert(len(new_categories), np.nan)
1288-
return np.take(new_categories, self._codes)
1301+
1302+
has_nans = np.any(self._codes == -1)
1303+
1304+
na_val = np.nan
1305+
if na_action is None and has_nans:
1306+
na_val = mapper(np.nan) if callable(mapper) else mapper.get(np.nan, np.nan)
1307+
1308+
if new_categories.is_unique and not new_categories.hasnans and na_val is np.nan:
1309+
new_dtype = CategoricalDtype(new_categories, ordered=self.ordered)
1310+
return self.from_codes(self._codes.copy(), dtype=new_dtype)
1311+
1312+
if has_nans:
1313+
new_categories = new_categories.insert(len(new_categories), na_val)
1314+
1315+
return np.take(new_categories, self._codes)
12891316

12901317
__eq__ = _cat_compare_op(operator.eq)
12911318
__ne__ = _cat_compare_op(operator.ne)

pandas/core/indexes/category.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
TYPE_CHECKING,
55
Any,
66
Hashable,
7+
Literal,
78
cast,
89
)
910

@@ -402,7 +403,7 @@ def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:
402403
def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
403404
return self.categories._is_comparable_dtype(dtype)
404405

405-
def map(self, mapper):
406+
def map(self, mapper, na_action: Literal["ignore"] | None = None):
406407
"""
407408
Map values using an input mapping or function.
408409
@@ -469,7 +470,7 @@ def map(self, mapper):
469470
>>> idx.map({'a': 'first', 'b': 'second'})
470471
Index(['first', 'second', nan], dtype='object')
471472
"""
472-
mapped = self._values.map(mapper)
473+
mapped = self._values.map(mapper, na_action=na_action)
473474
return Index(mapped, name=self.name)
474475

475476
def _concat(self, to_concat: list[Index], name: Hashable) -> Index:

pandas/tests/apply/test_invalid_arg.py

-8
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
from pandas.errors import SpecificationError
1616

1717
from pandas import (
18-
Categorical,
1918
DataFrame,
2019
Series,
2120
date_range,
@@ -76,13 +75,6 @@ def test_map_arg_is_dict_with_invalid_na_action_raises(input_na_action):
7675
s.map({1: 2}, na_action=input_na_action)
7776

7877

79-
def test_map_categorical_na_action():
80-
values = Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
81-
s = Series(values, name="XX", index=list("abcdefg"))
82-
with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN):
83-
s.map(lambda x: x, na_action="ignore")
84-
85-
8678
@pytest.mark.parametrize("method", ["apply", "agg", "transform"])
8779
@pytest.mark.parametrize("func", [{"A": {"B": "sum"}}, {"A": {"B": ["sum"]}}])
8880
def test_nested_renamer(frame_or_series, method, func):

pandas/tests/apply/test_series_apply.py

+33-7
Original file line numberDiff line numberDiff line change
@@ -656,12 +656,15 @@ def test_map_defaultdict_ignore_na():
656656
tm.assert_series_equal(result, expected)
657657

658658

659-
def test_map_categorical_na_ignore():
659+
@pytest.mark.parametrize(
660+
"na_action, expected",
661+
[(None, Series([10.0, 42.0, np.nan])), ("ignore", Series([10, np.nan, np.nan]))],
662+
)
663+
def test_map_categorical_na_ignore(na_action, expected):
660664
# GH#47527
661-
values = pd.Categorical([1, np.nan, 2], categories=[10, 1])
665+
values = pd.Categorical([1, np.nan, 2], categories=[10, 1, 2])
662666
ser = Series(values)
663-
result = ser.map({1: 10, np.nan: 42})
664-
expected = Series([10, np.nan, np.nan])
667+
result = ser.map({1: 10, np.nan: 42}, na_action=na_action)
665668
tm.assert_series_equal(result, expected)
666669

667670

@@ -755,22 +758,45 @@ def test_map_box():
755758
tm.assert_series_equal(res, exp)
756759

757760

758-
def test_map_categorical():
761+
@pytest.mark.parametrize("na_action", [None, "ignore"])
762+
def test_map_categorical(na_action):
759763
values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
760764
s = Series(values, name="XX", index=list("abcdefg"))
761765

762-
result = s.map(lambda x: x.lower())
766+
result = s.map(lambda x: x.lower(), na_action=na_action)
763767
exp_values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True)
764768
exp = Series(exp_values, name="XX", index=list("abcdefg"))
765769
tm.assert_series_equal(result, exp)
766770
tm.assert_categorical_equal(result.values, exp_values)
767771

768-
result = s.map(lambda x: "A")
772+
result = s.map(lambda x: "A", na_action=na_action)
769773
exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
770774
tm.assert_series_equal(result, exp)
771775
assert result.dtype == object
772776

773777

778+
@pytest.mark.parametrize(
779+
"na_action, expected",
780+
(
781+
[None, Series(["A", "B", "nan"], name="XX")],
782+
[
783+
"ignore",
784+
Series(
785+
["A", "B", np.nan],
786+
name="XX",
787+
dtype=pd.CategoricalDtype(list("DCBA"), True),
788+
),
789+
],
790+
),
791+
)
792+
def test_map_categorical_na_action(na_action, expected):
793+
dtype = pd.CategoricalDtype(list("DCBA"), ordered=True)
794+
values = pd.Categorical(list("AB") + [np.nan], dtype=dtype)
795+
s = Series(values, name="XX")
796+
result = s.map(str, na_action=na_action)
797+
tm.assert_series_equal(result, expected)
798+
799+
774800
def test_map_datetimetz():
775801
values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize(
776802
"Asia/Tokyo"

pandas/tests/arrays/categorical/test_analytics.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -300,16 +300,16 @@ def test_memory_usage(self):
300300

301301
def test_map(self):
302302
c = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
303-
result = c.map(lambda x: x.lower())
303+
result = c.map(lambda x: x.lower(), na_action=None)
304304
exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
305305
tm.assert_categorical_equal(result, exp)
306306

307307
c = Categorical(list("ABABC"), categories=list("ABC"), ordered=False)
308-
result = c.map(lambda x: x.lower())
308+
result = c.map(lambda x: x.lower(), na_action=None)
309309
exp = Categorical(list("ababc"), categories=list("abc"), ordered=False)
310310
tm.assert_categorical_equal(result, exp)
311311

312-
result = c.map(lambda x: 1)
312+
result = c.map(lambda x: 1, na_action=None)
313313
# GH 12766: Return an index not an array
314314
tm.assert_index_equal(result, Index(np.array([1] * 5, dtype=np.int64)))
315315

0 commit comments

Comments
 (0)