Commit eb36d8c

Author: MarcoGorelli
Merge remote-tracking branch 'upstream/main' into allow-mixed-iso

2 parents: 2f66f87 + 33f4f7b

File tree: 10 files changed, +612 -63 lines

ci/code_checks.sh (+494)

Large diffs are not rendered by default.

doc/source/reference/window.rst (+3, -3)

@@ -6,9 +6,9 @@
 Window
 ======
 
-Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.rolling`, :func:`pandas.Series.rolling`, etc.
-Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding`, :func:`pandas.Series.expanding`, etc.
-ExponentialMovingWindow objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm`, :func:`pandas.Series.ewm`, etc.
+Rolling objects are returned by ``.rolling`` calls: :func:`pandas.DataFrame.rolling` and :func:`pandas.Series.rolling`.
+Expanding objects are returned by ``.expanding`` calls: :func:`pandas.DataFrame.expanding` and :func:`pandas.Series.expanding`.
+ExponentialMovingWindow objects are returned by ``.ewm`` calls: :func:`pandas.DataFrame.ewm` and :func:`pandas.Series.ewm`.
 
 .. _api.functions_rolling:
 
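
For readers skimming the API reference, a minimal illustration of the three entry points named above (standard pandas usage, not part of the diff):

    import pandas as pd

    ser = pd.Series([1.0, 2.0, 3.0, 4.0])

    roll = ser.rolling(window=2)  # returns a Rolling object
    expd = ser.expanding()        # returns an Expanding object
    ewm = ser.ewm(alpha=0.5)      # returns an ExponentialMovingWindow object

    # Aggregating a window object yields a Series:
    print(roll.mean().tolist())  # [nan, 1.5, 2.5, 3.5]
    print(expd.sum().tolist())   # [1.0, 3.0, 6.0, 10.0]
    print(type(ewm).__name__)    # ExponentialMovingWindow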

doc/source/whatsnew/v2.0.0.rst (+4)

@@ -908,6 +908,7 @@ Performance improvements
 - Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
 - Performance improvement in :func:`to_datetime` when parsing strings with timezone offsets (:issue:`50107`)
 - Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
+- Performance improvement for :meth:`Series.replace` with categorical dtype (:issue:`49404`)
 - Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
 - Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`, :issue:`49178`)
 - Reduce memory usage of :meth:`DataFrame.to_pickle`/:meth:`Series.to_pickle` when using BZ2 or LZMA (:issue:`49068`)
@@ -952,6 +953,8 @@ Bug fixes
 Categorical
 ^^^^^^^^^^^
 - Bug in :meth:`Categorical.set_categories` losing dtype information (:issue:`48812`)
+- Bug in :meth:`Series.replace` with categorical dtype when ``to_replace`` values overlap with new values (:issue:`49404`)
+- Bug in :meth:`Series.replace` with categorical dtype losing nullable dtypes of underlying categories (:issue:`49404`)
 - Bug in :meth:`DataFrame.groupby` and :meth:`Series.groupby` would reorder categories when used as a grouper (:issue:`48749`)
 - Bug in :class:`Categorical` constructor when constructing from a :class:`Categorical` object and ``dtype="category"`` losing ordered-ness (:issue:`49309`)
 -
@@ -1132,6 +1135,7 @@ Groupby/resample/rolling
 - Bug in :class:`.DataFrameGroupBy` would raise when used with an empty DataFrame, categorical grouper, and ``dropna=False`` (:issue:`50634`)
 - Bug in :meth:`.SeriesGroupBy.value_counts` did not respect ``sort=False`` (:issue:`50482`)
 - Bug in :meth:`.DataFrameGroupBy.resample` raises ``KeyError`` when getting the result from a key list when resampling on time index (:issue:`50840`)
+- Bug in :meth:`.DataFrameGroupBy.transform` and :meth:`.SeriesGroupBy.transform` would raise incorrectly when grouper had ``axis=1`` for ``"ngroup"`` argument (:issue:`45986`)
 -
 
 Reshaping

pandas/core/arrays/arrow/array.py (+34, -2)

@@ -985,7 +985,18 @@ def _accumulate(
         pyarrow_meth = getattr(pc, pyarrow_name, None)
         if pyarrow_meth is None:
             return super()._accumulate(name, skipna=skipna, **kwargs)
-        result = pyarrow_meth(self._data, skip_nulls=skipna, **kwargs)
+
+        data_to_accum = self._data
+
+        pa_dtype = data_to_accum.type
+        if pa.types.is_duration(pa_dtype):
+            data_to_accum = data_to_accum.cast(pa.int64())
+
+        result = pyarrow_meth(data_to_accum, skip_nulls=skipna, **kwargs)
+
+        if pa.types.is_duration(pa_dtype):
+            result = result.cast(pa_dtype)
+
         return type(self)(result)
 
     def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
@@ -1012,6 +1023,26 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs):
         ------
         TypeError : subclass does not define reductions
         """
+        pa_type = self._data.type
+
+        data_to_reduce = self._data
+
+        if name in ["any", "all"] and (
+            pa.types.is_integer(pa_type)
+            or pa.types.is_floating(pa_type)
+            or pa.types.is_duration(pa_type)
+        ):
+            # pyarrow only supports any/all for boolean dtype, we allow
+            # for other dtypes, matching our non-pyarrow behavior
+
+            if pa.types.is_duration(pa_type):
+                data_to_cmp = self._data.cast(pa.int64())
+            else:
+                data_to_cmp = self._data
+
+            not_eq = pc.not_equal(data_to_cmp, 0)
+            data_to_reduce = not_eq
+
         if name == "sem":
 
             def pyarrow_meth(data, skip_nulls, **kwargs):
@@ -1033,8 +1064,9 @@ def pyarrow_meth(data, skip_nulls, **kwargs):
         if pyarrow_meth is None:
             # Let ExtensionArray._reduce raise the TypeError
             return super()._reduce(name, skipna=skipna, **kwargs)
+
         try:
-            result = pyarrow_meth(self._data, skip_nulls=skipna, **kwargs)
+            result = pyarrow_meth(data_to_reduce, skip_nulls=skipna, **kwargs)
         except (AttributeError, NotImplementedError, TypeError) as err:
             msg = (
                 f"'{type(self).__name__}' with dtype {self.dtype} "

pandas/core/arrays/categorical.py (+21, -40)

@@ -1137,14 +1137,9 @@ def remove_categories(self, removals):
         if not is_list_like(removals):
             removals = [removals]
 
-        removal_set = set(removals)
-        not_included = removal_set - set(self.dtype.categories)
-        new_categories = [c for c in self.dtype.categories if c not in removal_set]
-
-        # GH 10156
-        if any(isna(removals)):
-            not_included = {x for x in not_included if notna(x)}
-            new_categories = [x for x in new_categories if notna(x)]
+        removals = {x for x in set(removals) if notna(x)}
+        new_categories = self.dtype.categories.difference(removals)
+        not_included = removals.difference(self.dtype.categories)
 
         if len(not_included) != 0:
             raise ValueError(f"removals must all be in old categories: {not_included}")
@@ -2273,42 +2268,28 @@ def isin(self, values) -> npt.NDArray[np.bool_]:
         return algorithms.isin(self.codes, code_values)
 
     def _replace(self, *, to_replace, value, inplace: bool = False):
+        from pandas import Index
+
         inplace = validate_bool_kwarg(inplace, "inplace")
         cat = self if inplace else self.copy()
 
-        # other cases, like if both to_replace and value are list-like or if
-        # to_replace is a dict, are handled separately in NDFrame
-        if not is_list_like(to_replace):
-            to_replace = [to_replace]
-
-        categories = cat.categories.tolist()
-        removals = set()
-        for replace_value in to_replace:
-            if value == replace_value:
-                continue
-            if replace_value not in cat.categories:
-                continue
-            if isna(value):
-                removals.add(replace_value)
-                continue
-
-            index = categories.index(replace_value)
-
-            if value in cat.categories:
-                value_index = categories.index(value)
-                cat._codes[cat._codes == index] = value_index
-                removals.add(replace_value)
-            else:
-                categories[index] = value
-                cat._set_categories(categories)
+        mask = isna(np.asarray(value))
+        if mask.any():
+            removals = np.asarray(to_replace)[mask]
+            removals = cat.categories[cat.categories.isin(removals)]
+            new_cat = cat.remove_categories(removals)
+            NDArrayBacked.__init__(cat, new_cat.codes, new_cat.dtype)
 
-        if len(removals):
-            new_categories = [c for c in categories if c not in removals]
-            new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)
-            codes = recode_for_categories(
-                cat.codes, cat.categories, new_dtype.categories
-            )
-            NDArrayBacked.__init__(cat, codes, new_dtype)
+        ser = cat.categories.to_series()
+        ser = ser.replace(to_replace=to_replace, value=value)
+
+        all_values = Index(ser)
+        new_categories = Index(ser.drop_duplicates(keep="first"))
+        new_codes = recode_for_categories(
+            cat._codes, all_values, new_categories, copy=False
+        )
+        new_dtype = CategoricalDtype(new_categories, ordered=self.dtype.ordered)
+        NDArrayBacked.__init__(cat, new_codes, new_dtype)
 
         if not inplace:
             return cat
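
Net effect of the rewrite, which the new tests below also exercise: replacement is now a single pass over the categories, so overlapping ``to_replace``/``value`` pairs no longer chain, and replacing with a null drops the category. A quick sketch:

    import pandas as pd

    ser = pd.Series([1, 2, 3], dtype="category")

    # Overlapping to_replace and value (GH 49404): each original value is
    # mapped once; 1 -> 2 is not then re-mapped by 2 -> 3.
    print(ser.replace([1, 2, 3], [2, 3, 4]).tolist())  # [2, 3, 4]

    # Replacing with a null removes the category (GH 50872, GH 46884).
    print(ser.replace(1, None).cat.categories.tolist())  # [2, 3]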

pandas/core/groupby/groupby.py (+1, -1)

@@ -3394,7 +3394,7 @@ def ngroup(self, ascending: bool = True):
         dtype: int64
         """
         with self._group_selection_context():
-            index = self._selected_obj.index
+            index = self._selected_obj._get_axis(self.axis)
             comp_ids = self.grouper.group_info[0]
 
             dtype: type
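
The one-line fix makes ``ngroup`` look up the grouped axis instead of always using the row index. A sketch of the fixed behavior, mirroring the test un-xfailed below:

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])

    # With axis=1 the result is indexed by the columns (GH 45986);
    # previously this raised because ngroup always used df.index.
    print(df.groupby([0, 0, 1], axis=1).ngroup())
    # a    0
    # b    0
    # c    1
    # dtype: int64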

pandas/core/internals/blocks.py (+11, -5)

@@ -536,12 +536,10 @@ def replace(
 
         if isinstance(values, Categorical):
             # TODO: avoid special-casing
+            # GH49404
             blk = self if inplace else self.copy()
-            # error: Item "ExtensionArray" of "Union[ndarray[Any, Any],
-            # ExtensionArray]" has no attribute "_replace"
-            blk.values._replace(  # type: ignore[union-attr]
-                to_replace=to_replace, value=value, inplace=True
-            )
+            values = cast(Categorical, blk.values)
+            values._replace(to_replace=to_replace, value=value, inplace=True)
             return [blk]
 
         if not self._can_hold_element(to_replace):
@@ -651,6 +649,14 @@ def replace_list(
         """
         values = self.values
 
+        if isinstance(values, Categorical):
+            # TODO: avoid special-casing
+            # GH49404
+            blk = self if inplace else self.copy()
+            values = cast(Categorical, blk.values)
+            values._replace(to_replace=src_list, value=dest_list, inplace=True)
+            return [blk]
+
         # Exclude anything that we know we won't contain
         pairs = [
             (x, y) for x, y in zip(src_list, dest_list) if self._can_hold_element(x)
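
The second hunk routes list-to-list replacements through the same ``Categorical._replace`` fast path. A small sketch (toy values, not from the diff):

    import pandas as pd

    # List-to-list replacement on a categorical Series now dispatches to
    # Categorical._replace in one pass instead of element-wise block logic.
    ser = pd.Series(["a", "b", "c"], dtype="category")
    print(ser.replace(["a", "b"], ["b", "c"]).tolist())  # ['b', 'c', 'c']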

pandas/tests/arrays/categorical/test_replace.py (+13)

@@ -21,6 +21,11 @@
         ((5, 6), 2, [1, 2, 3], False),
         ([1], [2], [2, 2, 3], False),
         ([1, 4], [5, 2], [5, 2, 3], False),
+        # GH49404: overlap between to_replace and value
+        ([1, 2, 3], [2, 3, 4], [2, 3, 4], False),
+        # GH50872, GH46884: replace with null
+        (1, None, [None, 2, 3], False),
+        (1, pd.NA, [None, 2, 3], False),
         # check_categorical sorts categories, which crashes on mixed dtypes
         (3, "4", [1, 2, "4"], False),
         ([1, 2, "3"], "5", ["5", "5", 3], True),
@@ -65,3 +70,11 @@ def test_replace_categorical(to_replace, value, result, expected_error_msg):
 
     pd.Series(cat).replace(to_replace, value, inplace=True)
     tm.assert_categorical_equal(cat, expected)
+
+
+def test_replace_categorical_ea_dtype():
+    # GH49404
+    cat = Categorical(pd.array(["a", "b"], dtype="string"))
+    result = pd.Series(cat).replace(["a", "b"], ["c", pd.NA])._values
+    expected = Categorical(pd.array(["c", pd.NA], dtype="string"))
+    tm.assert_categorical_equal(result, expected)

pandas/tests/extension/test_arrow.py (+31, -8)

@@ -372,16 +372,27 @@ def test_getitem_scalar(self, data):
 
 
 class TestBaseAccumulateTests(base.BaseAccumulateTests):
-    def check_accumulate(self, s, op_name, skipna):
-        result = getattr(s, op_name)(skipna=skipna).astype("Float64")
-        expected = getattr(s.astype("Float64"), op_name)(skipna=skipna)
+    def check_accumulate(self, ser, op_name, skipna):
+        result = getattr(ser, op_name)(skipna=skipna)
+
+        if ser.dtype.kind == "m":
+            # Just check that we match the integer behavior.
+            ser = ser.astype("int64[pyarrow]")
+            result = result.astype("int64[pyarrow]")
+
+        result = result.astype("Float64")
+        expected = getattr(ser.astype("Float64"), op_name)(skipna=skipna)
         self.assert_series_equal(result, expected, check_dtype=False)
 
     @pytest.mark.parametrize("skipna", [True, False])
     def test_accumulate_series_raises(self, data, all_numeric_accumulations, skipna):
         pa_type = data.dtype.pyarrow_dtype
         if (
-            (pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type))
+            (
+                pa.types.is_integer(pa_type)
+                or pa.types.is_floating(pa_type)
+                or pa.types.is_duration(pa_type)
+            )
             and all_numeric_accumulations == "cumsum"
             and not pa_version_under9p0
         ):
@@ -423,9 +434,7 @@ def test_accumulate_series(self, data, all_numeric_accumulations, skipna, request):
                     raises=NotImplementedError,
                 )
             )
-        elif all_numeric_accumulations == "cumsum" and (
-            pa.types.is_duration(pa_type) or pa.types.is_boolean(pa_type)
-        ):
+        elif all_numeric_accumulations == "cumsum" and (pa.types.is_boolean(pa_type)):
            request.node.add_marker(
                pytest.mark.xfail(
                    reason=f"{all_numeric_accumulations} not implemented for {pa_type}",
@@ -566,10 +575,24 @@ def test_reduce_series(
                     f"pyarrow={pa.__version__} for {pa_dtype}"
                 ),
             )
-        if not pa.types.is_boolean(pa_dtype):
+        if pa.types.is_string(pa_dtype) or pa.types.is_binary(pa_dtype):
+            # We *might* want to make this behave like the non-pyarrow cases,
+            # but have not yet decided.
             request.node.add_marker(xfail_mark)
+
         op_name = all_boolean_reductions
         ser = pd.Series(data)
+
+        if pa.types.is_temporal(pa_dtype) and not pa.types.is_duration(pa_dtype):
+            # xref GH#34479 we support this in our non-pyarrow datetime64 dtypes,
+            # but it isn't obvious we _should_. For now, we keep the pyarrow
+            # behavior which does not support this.
+
+            with pytest.raises(TypeError, match="does not support reduction"):
+                getattr(ser, op_name)(skipna=skipna)
+
+            return
+
         result = getattr(ser, op_name)(skipna=skipna)
         assert result is (op_name == "any")

pandas/tests/groupby/transform/test_transform.py (-4)

@@ -164,10 +164,6 @@ def test_transform_broadcast(tsframe, ts):
 def test_transform_axis_1(request, transformation_func):
     # GH 36308
 
-    if transformation_func == "ngroup":
-        msg = "ngroup fails with axis=1: #45986"
-        request.node.add_marker(pytest.mark.xfail(reason=msg))
-
     df = DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])
     args = get_groupby_method_args(transformation_func, df)
     result = df.groupby([0, 0, 1], axis=1).transform(transformation_func, *args)
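
With the xfail removed, ``"ngroup"`` runs through the same parametrized test as the other transformation funcs. The equivalent standalone check would look roughly like this (a sketch; the expected frame is my reading of the broadcast semantics, not output copied from the test):

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3, 4], "c": [5, 6]}, index=["x", "y"])

    # transform("ngroup") with axis=1 broadcasts each column's group number
    # across the frame instead of raising (GH 45986).
    result = df.groupby([0, 0, 1], axis=1).transform("ngroup")
    print(result)
    #    a  b  c
    # x  0  0  1
    # y  0  0  1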
