REF: Add ExtensionArray.map (#51809)

topper-123 · web-flow · commit ecc6ead06b54 · 2023-03-13T09:49:55.000-07:00
* REF: Add ExtensionArray.map

* add gh number

* add tests

* update after comments

* add back pylint
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -32,6 +32,7 @@ Other enhancements
 - :meth:`MultiIndex.sortlevel` and :meth:`Index.sortlevel` gained a new keyword ``na_position`` (:issue:`51612`)
 - Improve error message when setting :class:`DataFrame` with wrong number of columns through :meth:`DataFrame.isetitem` (:issue:`51701`)
 - Let :meth:`DataFrame.to_feather` accept a non-default :class:`Index` and non-string column names (:issue:`51787`)
+- :class:`api.extensions.ExtensionArray` now has a :meth:`~api.extensions.ExtensionArray.map` method (:issue:`51809`).
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_210.notable_bug_fixes:
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -46,6 +46,7 @@
     is_categorical_dtype,
     is_complex_dtype,
     is_datetime64_dtype,
+    is_dict_like,
     is_extension_array_dtype,
     is_float_dtype,
     is_integer,
@@ -1678,3 +1679,75 @@ def union_with_duplicates(
         unique_vals = ensure_wrapped_if_datetimelike(unique_vals)
     repeats = final_count.reindex(unique_vals).values
     return np.repeat(unique_vals, repeats)
+
+
+def map_array(
+    arr: ArrayLike, mapper, na_action: Literal["ignore"] | None = None
+) -> np.ndarray | ExtensionArray | Index:
+    """
+    Map values using an input mapping or function.
+
+    Parameters
+    ----------
+    mapper : function, dict, or Series
+        Mapping correspondence.
+    na_action : {None, 'ignore'}, default None
+        If 'ignore', propagate NA values, without passing them to the
+        mapping correspondence.
+
+    Returns
+    -------
+    Union[ndarray, Index, ExtensionArray]
+        The output of the mapping function applied to the array.
+        If the function returns a tuple with more than one element
+        a MultiIndex will be returned.
+    """
+    if na_action not in (None, "ignore"):
+        msg = f"na_action must either be 'ignore' or None, {na_action} was passed"
+        raise ValueError(msg)
+
+    # we can fastpath dict/Series to an efficient map
+    # as we know that we are not going to have to yield
+    # python types
+    if is_dict_like(mapper):
+        if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
+            # If a dictionary subclass defines a default value method,
+            # convert mapper to a lookup function (GH #15999).
+            dict_with_default = mapper
+            mapper = lambda x: dict_with_default[
+                np.nan if isinstance(x, float) and np.isnan(x) else x
+            ]
+        else:
+            # Dictionary does not have a default. Thus it's safe to
+            # convert to an Series for efficiency.
+            # we specify the keys here to handle the
+            # possibility that they are tuples
+
+            # The return value of mapping with an empty mapper is
+            # expected to be pd.Series(np.nan, ...). As np.nan is
+            # of dtype float64 the return value of this method should
+            # be float64 as well
+            from pandas import Series
+
+            if len(mapper) == 0:
+                mapper = Series(mapper, dtype=np.float64)
+            else:
+                mapper = Series(mapper)
+
+    if isinstance(mapper, ABCSeries):
+        if na_action == "ignore":
+            mapper = mapper[mapper.index.notna()]
+
+        # Since values were input this means we came from either
+        # a dict or a series and mapper should be an index
+        indexer = mapper.index.get_indexer(arr)
+        new_values = take_nd(mapper._values, indexer)
+
+        return new_values
+
+    # we must convert to python types
+    values = arr.astype(object, copy=False)
+    if na_action is None:
+        return lib.map_infer(values, mapper)
+    else:
+        return lib.map_infer_mask(values, mapper, isna(values).view(np.uint8))
diff --git a/pandas/core/apply.py b/pandas/core/apply.py
@@ -1068,8 +1068,7 @@ def apply_standard(self) -> DataFrame | Series:
                 return f(obj)
 
             # row-wise access
-            if is_extension_array_dtype(obj.dtype) and hasattr(obj._values, "map"):
-                # GH#23179 some EAs do not have `map`
+            if is_extension_array_dtype(obj.dtype):
                 mapped = obj._values.map(f)
             else:
                 values = obj.astype(object)._values
diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py
@@ -64,6 +64,7 @@
 from pandas.core.algorithms import (
     factorize_array,
     isin,
+    map_array,
     mode,
     rank,
     unique,
@@ -178,6 +179,7 @@ class ExtensionArray:
     * factorize / _values_for_factorize
     * argsort, argmax, argmin / _values_for_argsort
     * searchsorted
+    * map
 
     The remaining methods implemented on this class should be performant,
     as they only compose abstract methods. Still, a more efficient
@@ -1704,6 +1706,28 @@ def __array_ufunc__(self, ufunc: np.ufunc, method: str, *inputs, **kwargs):
 
         return arraylike.default_array_ufunc(self, ufunc, method, *inputs, **kwargs)
 
+    def map(self, mapper, na_action=None):
+        """
+        Map values using an input mapping or function.
+
+        Parameters
+        ----------
+        mapper : function, dict, or Series
+            Mapping correspondence.
+        na_action : {None, 'ignore'}, default None
+            If 'ignore', propagate NA values, without passing them to the
+            mapping correspondence. If 'ignore' is not supported, a
+            ``NotImplementedError`` should be raised.
+
+        Returns
+        -------
+        Union[ndarray, Index, ExtensionArray]
+            The output of the mapping function applied to the array.
+            If the function returns a tuple with more than one element
+            a MultiIndex will be returned.
+        """
+        return map_array(self, mapper, na_action=na_action)
+
 
 class ExtensionArraySupportsAnyAll(ExtensionArray):
     def any(self, *, skipna: bool = True) -> bool:
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py
@@ -1199,7 +1199,7 @@ def remove_unused_categories(self) -> Categorical:
 
     # ------------------------------------------------------------------
 
-    def map(self, mapper):
+    def map(self, mapper, na_action=None):
         """
         Map categories using an input mapping or function.
 
@@ -1268,6 +1268,9 @@ def map(self, mapper):
         >>> cat.map({'a': 'first', 'b': 'second'})
         Index(['first', 'second', nan], dtype='object')
         """
+        if na_action is not None:
+            raise NotImplementedError
+
         new_categories = self.categories.map(mapper)
         try:
             return self.from_codes(
diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py
@@ -750,7 +750,10 @@ def _unbox(self, other) -> np.int64 | np.datetime64 | np.timedelta64 | np.ndarra
     #  pandas assumes they're there.
 
     @ravel_compat
-    def map(self, mapper):
+    def map(self, mapper, na_action=None):
+        if na_action is not None:
+            raise NotImplementedError
+
         # TODO(GH-23179): Add ExtensionArray.map
         # Need to figure out if we want ExtensionArray.map first.
         # If so, then we can refactor IndexOpsMixin._map_values to
diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -1271,14 +1271,17 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
 
         return self._simple_new(sp_values, self.sp_index, dtype)
 
-    def map(self: SparseArrayT, mapper) -> SparseArrayT:
+    def map(self: SparseArrayT, mapper, na_action=None) -> SparseArrayT:
         """
         Map categories using an input mapping or function.
 
         Parameters
         ----------
         mapper : dict, Series, callable
             The correspondence from old values to new.
+        na_action : {None, 'ignore'}, default None
+            If 'ignore', propagate NA values, without passing them to the
+            mapping correspondence.
 
         Returns
         -------
@@ -1308,6 +1311,9 @@ def map(self: SparseArrayT, mapper) -> SparseArrayT:
         IntIndex
         Indices: array([1, 2], dtype=int32)
         """
+        if na_action is not None:
+            raise NotImplementedError
+
         # this is used in apply.
         # We get hit since we're an "is_extension_array_dtype" but regular extension
         # types are not hit. This may be worth adding to the interface.
diff --git a/pandas/core/base.py b/pandas/core/base.py
@@ -40,8 +40,6 @@
 
 from pandas.core.dtypes.cast import can_hold_element
 from pandas.core.dtypes.common import (
-    is_categorical_dtype,
-    is_dict_like,
     is_extension_array_dtype,
     is_object_dtype,
     is_scalar,
@@ -78,7 +76,6 @@
     )
 
     from pandas import (
-        Categorical,
         Index,
         Series,
     )
@@ -882,87 +879,19 @@ def _map_values(self, mapper, na_action=None):
             If the function returns a tuple with more than one element
             a MultiIndex will be returned.
         """
-        # we can fastpath dict/Series to an efficient map
-        # as we know that we are not going to have to yield
-        # python types
-        if is_dict_like(mapper):
-            if isinstance(mapper, dict) and hasattr(mapper, "__missing__"):
-                # If a dictionary subclass defines a default value method,
-                # convert mapper to a lookup function (GH #15999).
-                dict_with_default = mapper
-                mapper = lambda x: dict_with_default[
-                    np.nan if isinstance(x, float) and np.isnan(x) else x
-                ]
-            else:
-                # Dictionary does not have a default. Thus it's safe to
-                # convert to an Series for efficiency.
-                # we specify the keys here to handle the
-                # possibility that they are tuples
-
-                # The return value of mapping with an empty mapper is
-                # expected to be pd.Series(np.nan, ...). As np.nan is
-                # of dtype float64 the return value of this method should
-                # be float64 as well
-                from pandas import Series
-
-                if len(mapper) == 0:
-                    mapper = Series(mapper, dtype=np.float64)
-                else:
-                    mapper = Series(mapper)
-
-        if isinstance(mapper, ABCSeries):
-            if na_action not in (None, "ignore"):
-                msg = (
-                    "na_action must either be 'ignore' or None, "
-                    f"{na_action} was passed"
-                )
-                raise ValueError(msg)
-
-            if na_action == "ignore":
-                mapper = mapper[mapper.index.notna()]
-
-            # Since values were input this means we came from either
-            # a dict or a series and mapper should be an index
-            if is_categorical_dtype(self.dtype):
-                # use the built in categorical series mapper which saves
-                # time by mapping the categories instead of all values
-
-                cat = cast("Categorical", self._values)
-                return cat.map(mapper)
-
-            values = self._values
-
-            indexer = mapper.index.get_indexer(values)
-            new_values = algorithms.take_nd(mapper._values, indexer)
-
-            return new_values
-
-        # we must convert to python types
-        if is_extension_array_dtype(self.dtype) and hasattr(self._values, "map"):
-            # GH#23179 some EAs do not have `map`
-            values = self._values
-            if na_action is not None:
-                raise NotImplementedError
-            map_f = lambda values, f: values.map(f)
-        else:
-            values = self._values.astype(object)
-            if na_action == "ignore":
-                map_f = lambda values, f: lib.map_infer_mask(
-                    values, f, isna(values).view(np.uint8)
-                )
-            elif na_action is None:
-                map_f = lib.map_infer
-            else:
-                msg = (
-                    "na_action must either be 'ignore' or None, "
-                    f"{na_action} was passed"
-                )
-                raise ValueError(msg)
-
-        # mapper is a function
-        new_values = map_f(values, mapper)
-
-        return new_values
+        arr = extract_array(self, extract_numpy=True, extract_range=True)
+
+        if is_extension_array_dtype(arr.dtype):
+            # Item "IndexOpsMixin" of "Union[IndexOpsMixin, ExtensionArray,
+            # ndarray[Any, Any]]" has no attribute "map"
+            return arr.map(mapper, na_action=na_action)  # type: ignore[union-attr]
+
+        # Argument 1 to "map_array" has incompatible type
+        # "Union[IndexOpsMixin, ExtensionArray, ndarray[Any, Any]]";
+        # expected "Union[ExtensionArray, ndarray[Any, Any]]"
+        return algorithms.map_array(
+            arr, mapper, na_action=na_action  # type: ignore[arg-type]
+        )
 
     @final
     def value_counts(
diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py
@@ -88,6 +88,12 @@ def test_apply_simple_series(self, data):
         result = pd.Series(data).apply(id)
         assert isinstance(result, pd.Series)
 
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data, na_action):
+        result = data.map(lambda x: x, na_action=na_action)
+        expected = data.to_numpy()
+        tm.assert_numpy_array_equal(result, expected)
+
     def test_argsort(self, data_for_sorting):
         result = pd.Series(data_for_sorting).argsort()
         # argsort result gets passed to take, so should be np.intp
diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py
@@ -184,6 +184,15 @@ def test_combine_add(self, data_repeated):
         expected = pd.Series([a + val for a in list(orig_data1)])
         self.assert_series_equal(result, expected)
 
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data, na_action):
+        if na_action is not None:
+            with pytest.raises(NotImplementedError, match=""):
+                data.map(lambda x: x, na_action=na_action)
+        else:
+            result = data.map(lambda x: x, na_action=na_action)
+            self.assert_extension_array_equal(result, data)
+
 
 class TestCasting(base.BaseCastingTests):
     @pytest.mark.parametrize("cls", [Categorical, CategoricalIndex])
diff --git a/pandas/tests/extension/test_datetime.py b/pandas/tests/extension/test_datetime.py
@@ -116,6 +116,15 @@ def test_combine_add(self, data_repeated):
         # Timestamp.__add__(Timestamp) not defined
         pass
 
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data, na_action):
+        if na_action is not None:
+            with pytest.raises(NotImplementedError, match=""):
+                data.map(lambda x: x, na_action=na_action)
+        else:
+            result = data.map(lambda x: x, na_action=na_action)
+            self.assert_extension_array_equal(result, data)
+
 
 class TestInterface(BaseDatetimeTests, base.BaseInterfaceTests):
     pass
diff --git a/pandas/tests/extension/test_period.py b/pandas/tests/extension/test_period.py
@@ -105,6 +105,15 @@ def test_diff(self, data, periods):
         else:
             super().test_diff(data, periods)
 
+    @pytest.mark.parametrize("na_action", [None, "ignore"])
+    def test_map(self, data, na_action):
+        if na_action is not None:
+            with pytest.raises(NotImplementedError, match=""):
+                data.map(lambda x: x, na_action=na_action)
+        else:
+            result = data.map(lambda x: x, na_action=na_action)
+            self.assert_extension_array_equal(result, data)
+
 
 class TestInterface(BasePeriodTests, base.BaseInterfaceTests):
     pass
diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py