ENH: add na_action to Categorical.map

topper-123 · topper-123 · commit 6b9bc30c52cb · 2023-03-04T21:46:15.000Z
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -29,6 +29,7 @@ enhancement2
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 - Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)
+- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter (:issue:``)
 -
 
 .. ---------------------------------------------------------------------------
@@ -118,7 +119,7 @@ Bug fixes
 
 Categorical
 ^^^^^^^^^^^
--
+- Bug in :meth:`Series.map`, where the value of the ``na_action`` parameter was not used if the series held a :class:`Categorical` (:issue:``).
 -
 
 Datetimelike
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1198,7 +1198,7 @@ def remove_unused_categories(self) -> Categorical:
 
     # ------------------------------------------------------------------
 
-    def map(self, mapper):
+    def map(self, mapper, na_action: Literal["ignore"] | None = "ignore"):
         """
         Map categories using an input mapping or function.
 
@@ -1215,6 +1215,9 @@ def map(self, mapper):
         ----------
         mapper : function, dict, or Series
             Mapping correspondence.
+        na_action : {None, 'ignore'}, default 'ignore'
+            If 'ignore', propagate NaN values, without passing them to the
+            mapping correspondence.
 
         Returns
         -------
@@ -1267,17 +1270,23 @@ def map(self, mapper):
         >>> cat.map({'a': 'first', 'b': 'second'})
         Index(['first', 'second', nan], dtype='object')
         """
+        assert callable(mapper) or is_dict_like(mapper)
+
         new_categories = self.categories.map(mapper)
-        try:
-            return self.from_codes(
-                self._codes.copy(), categories=new_categories, ordered=self.ordered
-            )
-        except ValueError:
-            # NA values are represented in self._codes with -1
-            # np.take causes NA values to take final element in new_categories
-            if np.any(self._codes == -1):
-                new_categories = new_categories.insert(len(new_categories), np.nan)
+
+        not_dictlike_and_no_nans = not (is_dict_like(mapper) and np.nan not in mapper)
+
+        if na_action is None and not_dictlike_and_no_nans and np.any(self._codes == -1):
+            na_value = mapper(np.nan) if callable(mapper) else mapper[np.nan]
+            new_categories = new_categories.insert(len(new_categories), na_value)
             return np.take(new_categories, self._codes)
+        elif new_categories.is_unique and not new_categories.hasnans:
+            new_dtype = CategoricalDtype(new_categories, ordered=self.ordered)
+            return self.from_codes(self._codes.copy(), dtype=new_dtype)
+
+        if np.any(self._codes == -1):
+            new_categories = new_categories.insert(len(new_categories), np.nan)
+        return np.take(new_categories, self._codes)
 
     __eq__ = _cat_compare_op(operator.eq)
     __ne__ = _cat_compare_op(operator.ne)
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -937,14 +937,17 @@ def _map_values(self, mapper, na_action=None):
 
             return new_values
 
-        # we must convert to python types
-        if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
+        if is_categorical_dtype(self.dtype):
+            values = self._values
+            map_f = lambda values, f: values.map(f, na_action=na_action)
+        elif is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
             # GH#23179 some EAs do not have `map`
             values = self._values
             if na_action is not None:
                 raise NotImplementedError
             map_f = lambda values, f: values.map(f)
         else:
+            # we must convert to python types
             values = self._values.astype(object)
             if na_action == "ignore":
                 map_f = lambda values, f: lib.map_infer_mask(
diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py
@@ -4,6 +4,7 @@
     TYPE_CHECKING,
     Any,
     Hashable,
+    Literal,
 )
 
 import numpy as np
@@ -402,7 +403,7 @@ def _maybe_cast_listlike_indexer(self, values) -> CategoricalIndex:
     def _is_comparable_dtype(self, dtype: DtypeObj) -> bool:
         return self.categories._is_comparable_dtype(dtype)
 
-    def map(self, mapper):
+    def map(self, mapper, na_action: Literal["ignore"] | None = None):
         """
         Map values using input an input mapping or function.
 
@@ -469,7 +470,7 @@ def map(self, mapper):
         >>> idx.map({'a': 'first', 'b': 'second'})
         Index(['first', 'second', nan], dtype='object')
         """
-        mapped = self._values.map(mapper)
+        mapped = self._values.map(mapper, na_action=na_action)
         return Index(mapped, name=self.name)
 
     def _concat(self, to_concat: list[Index], name: Hashable) -> Index:
diff --git a/pandas/tests/apply/test_invalid_arg.py b/pandas/tests/apply/test_invalid_arg.py
@@ -15,7 +15,6 @@
 from pandas.errors import SpecificationError
 
 from pandas import (
-    Categorical,
     DataFrame,
     Series,
     date_range,
@@ -76,13 +75,6 @@ def test_map_arg_is_dict_with_invalid_na_action_raises(input_na_action):
         s.map({1: 2}, na_action=input_na_action)
 
 
-def test_map_categorical_na_action():
-    values = Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
-    s = Series(values, name="XX", index=list("abcdefg"))
-    with pytest.raises(NotImplementedError, match=tm.EMPTY_STRING_PATTERN):
-        s.map(lambda x: x, na_action="ignore")
-
-
 def test_map_datetimetz_na_action():
     values = date_range("2011-01-01", "2011-01-02", freq="H").tz_localize("Asia/Tokyo")
     s = Series(values, name="XX")
diff --git a/pandas/tests/apply/test_series_apply.py b/pandas/tests/apply/test_series_apply.py
@@ -748,22 +748,45 @@ def test_map_box():
     tm.assert_series_equal(res, exp)
 
 
-def test_map_categorical():
+@pytest.mark.parametrize("na_action", [None, "ignore"])
+def test_map_categorical(na_action):
     values = pd.Categorical(list("ABBABCD"), categories=list("DCBA"), ordered=True)
     s = Series(values, name="XX", index=list("abcdefg"))
 
-    result = s.map(lambda x: x.lower())
+    result = s.map(lambda x: x.lower(), na_action=na_action)
     exp_values = pd.Categorical(list("abbabcd"), categories=list("dcba"), ordered=True)
     exp = Series(exp_values, name="XX", index=list("abcdefg"))
     tm.assert_series_equal(result, exp)
     tm.assert_categorical_equal(result.values, exp_values)
 
-    result = s.map(lambda x: "A")
+    result = s.map(lambda x: "A", na_action=na_action)
     exp = Series(["A"] * 7, name="XX", index=list("abcdefg"))
     tm.assert_series_equal(result, exp)
     assert result.dtype == object
 
 
+@pytest.mark.parametrize(
+    "na_action, expected",
+    (
+        [None, Series(["A", "B", "nan"], name="XX")],
+        [
+            "ignore",
+            Series(
+                ["A", "B", np.nan],
+                name="XX",
+                dtype=pd.CategoricalDtype(list("DCBA"), True),
+            ),
+        ],
+    ),
+)
+def test_map_categorical_na_action(na_action, expected):
+    dtype = pd.CategoricalDtype(list("DCBA"), ordered=True)
+    values = pd.Categorical(list("AB") + [np.nan], dtype=dtype)
+    s = Series(values, name="XX")
+    result = s.map(str, na_action=na_action)
+    tm.assert_series_equal(result, expected)
+
+
 def test_map_datetimetz():
     values = pd.date_range("2011-01-01", "2011-01-02", freq="H").tz_localize(
         "Asia/Tokyo"
diff --git a/pandas/tests/arrays/categorical/test_map.py b/pandas/tests/arrays/categorical/test_map.py
@@ -0,0 +1,138 @@
+import numpy as np
+import pytest
+
+import pandas as pd
+from pandas import (
+    Categorical,
+    Index,
+    Series,
+)
+import pandas._testing as tm
+
+
+@pytest.fixture(params=[None, "ignore"])
+def na_action(request):
+    return request.param
+
+
+class TestMap:
+    @pytest.mark.parametrize(
+        "data, categories",
+        [
+            (list("abcbca"), list("cab")),
+            (pd.interval_range(0, 3).repeat(3), pd.interval_range(0, 3)),
+        ],
+        ids=["string", "interval"],
+    )
+    def test_map_str(self, data, categories, ordered, na_action):
+        # GH 31202 - mapping with str must keep the categorical dtype and ordered flag
+        cat = Categorical(data, categories=categories, ordered=ordered)
+        result = cat.map(str, na_action=na_action)
+        expected = Categorical(
+            map(str, data), categories=map(str, categories), ordered=ordered
+        )
+        tm.assert_categorical_equal(result, expected)
+
+    def test_map(self, na_action):
+        cat = Categorical(list("ABABC"), categories=list("CBA"), ordered=True)
+        result = cat.map(lambda x: x.lower(), na_action=na_action)
+        exp = Categorical(list("ababc"), categories=list("cba"), ordered=True)
+        tm.assert_categorical_equal(result, exp)
+
+        cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False)
+        result = cat.map(lambda x: x.lower(), na_action=na_action)
+        exp = Categorical(list("ababc"), categories=list("bac"), ordered=False)
+        tm.assert_categorical_equal(result, exp)
+
+        # GH 12766: Return an index not an array
+        result = cat.map(lambda x: 1, na_action=na_action)
+        exp = Index(np.array([1] * 5, dtype=np.int64))
+        tm.assert_index_equal(result, exp)
+
+        # change categories dtype
+        cat = Categorical(list("ABABC"), categories=list("BAC"), ordered=False)
+
+        def f(x):
+            return {"A": 10, "B": 20, "C": 30}.get(x)
+
+        result = cat.map(f, na_action=na_action)
+        exp = Categorical([10, 20, 10, 20, 30], categories=[20, 10, 30], ordered=False)
+        tm.assert_categorical_equal(result, exp)
+
+        mapper = Series([10, 20, 30], index=["A", "B", "C"])
+        result = cat.map(mapper, na_action=na_action)
+        tm.assert_categorical_equal(result, exp)
+
+        result = cat.map({"A": 10, "B": 20, "C": 30}, na_action=na_action)
+        tm.assert_categorical_equal(result, exp)
+
+    @pytest.mark.parametrize(
+        ("data", "f", "expected"),
+        (
+            ([1, 1, np.nan], pd.isna, Index([False, False, True])),
+            ([1, 2, np.nan], pd.isna, Index([False, False, True])),
+            ([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])),
+            ([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
+            (
+                [1, 1, np.nan],
+                Series([False, False]),
+                Categorical([False, False, np.nan]),
+            ),
+            (
+                [1, 2, np.nan],
+                Series([False] * 3),
+                Index([False, False, np.nan]),
+            ),
+        ),
+    )
+    def test_map_with_nan_none(self, data, f, expected):  # GH 24241
+        values = Categorical(data)
+        result = values.map(f, na_action=None)
+        if isinstance(expected, Categorical):
+            tm.assert_categorical_equal(result, expected)
+        else:
+            tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        ("data", "f", "expected"),
+        (
+            ([1, 1, np.nan], pd.isna, Categorical([False, False, np.nan])),
+            ([1, 2, np.nan], pd.isna, Index([False, False, np.nan])),
+            ([1, 1, np.nan], {1: False}, Categorical([False, False, np.nan])),
+            ([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
+            (
+                [1, 1, np.nan],
+                Series([False, False]),
+                Categorical([False, False, np.nan]),
+            ),
+            (
+                [1, 2, np.nan],
+                Series([False, False, False]),
+                Index([False, False, np.nan]),
+            ),
+        ),
+    )
+    def test_map_with_nan_ignore(self, data, f, expected):  # GH 24241
+        values = Categorical(data)
+        result = values.map(f, na_action="ignore")
+        if data[1] == 1:
+            tm.assert_categorical_equal(result, expected)
+        else:
+            tm.assert_index_equal(result, expected)
+
+    def test_map_with_dict_or_series(self, na_action):
+        orig_values = ["a", "B", 1, "a"]
+        new_values = ["one", 2, 3.0, "one"]
+        cat = Categorical(orig_values)
+
+        mapper = Series(new_values[:-1], index=orig_values[:-1])
+        result = cat.map(mapper, na_action=na_action)
+
+        # Order of categories in result can be different
+        expected = Categorical(new_values, categories=[3.0, 2, "one"])
+        tm.assert_categorical_equal(result, expected)
+
+        mapper = dict(zip(orig_values[:-1], new_values[:-1]))
+        result = cat.map(mapper, na_action=na_action)
+        # Order of categories in result can be different
+        tm.assert_categorical_equal(result, expected)
diff --git a/pandas/tests/indexes/categorical/test_map.py b/pandas/tests/indexes/categorical/test_map.py
@@ -78,25 +78,52 @@ def test_map_with_categorical_series(self):
         tm.assert_index_equal(a.map(c), exp)
 
     @pytest.mark.parametrize(
-        ("data", "f"),
+        ("data", "f", "expected"),
         (
-            ([1, 1, np.nan], pd.isna),
-            ([1, 2, np.nan], pd.isna),
-            ([1, 1, np.nan], {1: False}),
-            ([1, 2, np.nan], {1: False, 2: False}),
-            ([1, 1, np.nan], Series([False, False])),
-            ([1, 2, np.nan], Series([False, False, False])),
+            ([1, 1, np.nan], pd.isna, CategoricalIndex([False, False, np.nan])),
+            ([1, 2, np.nan], pd.isna, Index([False, False, np.nan])),
+            ([1, 1, np.nan], {1: False}, CategoricalIndex([False, False, np.nan])),
+            ([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
+            (
+                [1, 1, np.nan],
+                Series([False, False]),
+                CategoricalIndex([False, False, np.nan]),
+            ),
+            (
+                [1, 2, np.nan],
+                Series([False, False, False]),
+                Index([False, False, np.nan]),
+            ),
         ),
     )
-    def test_map_with_nan(self, data, f):  # GH 24241
-        values = pd.Categorical(data)
-        result = values.map(f)
-        if data[1] == 1:
-            expected = pd.Categorical([False, False, np.nan])
-            tm.assert_categorical_equal(result, expected)
-        else:
-            expected = Index([False, False, np.nan])
-            tm.assert_index_equal(result, expected)
+    def test_map_with_nan_ignore(self, data, f, expected):  # GH 24241
+        values = CategoricalIndex(data)
+        result = values.map(f, na_action="ignore")
+        tm.assert_index_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        ("data", "f", "expected"),
+        (
+            ([1, 1, np.nan], pd.isna, Index([False, False, True])),
+            ([1, 2, np.nan], pd.isna, Index([False, False, True])),
+            ([1, 1, np.nan], {1: False}, CategoricalIndex([False, False, np.nan])),
+            ([1, 2, np.nan], {1: False, 2: False}, Index([False, False, np.nan])),
+            (
+                [1, 1, np.nan],
+                Series([False, False]),
+                CategoricalIndex([False, False, np.nan]),
+            ),
+            (
+                [1, 2, np.nan],
+                Series([False, False, False]),
+                Index([False, False, np.nan]),
+            ),
+        ),
+    )
+    def test_map_with_nan_none(self, data, f, expected):  # GH 24241
+        values = CategoricalIndex(data)
+        result = values.map(f, na_action=None)
+        tm.assert_index_equal(result, expected)
 
     def test_map_with_dict_or_series(self):
         orig_values = ["a", "B", 1, "a"]

Original file line number	Diff line number	Diff line change
`@@ -29,6 +29,7 @@ enhancement2`
`29`	`29`	`Other enhancements`
`30`	`30`	`^^^^^^^^^^^^^^^^^^`
`31`	`31`	- Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)
	`32`	+- :meth:`Categorical.map` and :meth:`CategoricalIndex.map` now have a ``na_action`` parameter (:issue:``)
`32`	`33`	`-`
`33`	`34`
`34`	`35`	`.. ---------------------------------------------------------------------------`
`@@ -118,7 +119,7 @@ Bug fixes`
`118`	`119`
`119`	`120`	`Categorical`
`120`	`121`	`^^^^^^^^^^^`
`121`		`--`
	`122`	+- Bug in :meth:`Series.map`, where the value of the ``na_action`` parameter was not used if the series held a :class:`Categorical` (:issue:``).
`122`	`123`	`-`
`123`	`124`
`124`	`125`	`Datetimelike`