pandas-dev · jreback · Jan 17, 2022 · Jan 13, 2022 · Jan 13, 2022 · Jan 15, 2022
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -140,6 +140,7 @@ Slicing on a :class:`DataFrame` will not be affected.
 Other Deprecations
 ^^^^^^^^^^^^^^^^^^
 - Deprecated the keyword ``line_terminator`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv`, use ``lineterminator`` instead; this is for consistency with :func:`read_csv` and the standard library 'csv' module (:issue:`9568`)
+- Deprecated behavior of :meth:`SparseArray.astype`, :meth:`Series.astype`, and :meth:`DataFrame.astype` with :class:`SparseDtype` when passing a non-sparse ``dtype``. In a future version, this will cast to that non-sparse dtype instead of wrapping it in a :class:`SparseDtype`; to retain the old behavior, use ``obj.astype(SparseDtype(dtype))`` (:issue:`34457`)
 - Deprecated behavior of :meth:`DatetimeIndex.intersection` and :meth:`DatetimeIndex.symmetric_difference` (``union`` behavior was already deprecated in version 1.3.0) with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`, :issue:`45357`)
 - Deprecated :meth:`DataFrame.iteritems`, :meth:`Series.iteritems`, :meth:`HDFStore.iteritems` in favor of :meth:`DataFrame.items`, :meth:`Series.items`, :meth:`HDFStore.items`  (:issue:`45321`)
 -

diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py
@@ -848,8 +848,8 @@ def assert_extension_array_equal(
         left_na, right_na, obj="ExtensionArray NA mask", index_values=index_values
     )
 
-    left_valid = np.asarray(left[~left_na].astype(object))
-    right_valid = np.asarray(right[~right_na].astype(object))
+    left_valid = left[~left_na].to_numpy(dtype=object)
+    right_valid = right[~right_na].to_numpy(dtype=object)
     if check_exact:
         assert_numpy_array_equal(
             left_valid, right_valid, obj="ExtensionArray", index_values=index_values

diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
@@ -1264,6 +1264,19 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
                 return self
             else:
                 return self.copy()
+
+        future_dtype = pandas_dtype(dtype)
+        if not isinstance(future_dtype, SparseDtype):
+            # GH#34457
+            warnings.warn(
+                "The behavior of .astype from SparseDtype to a non-sparse dtype "
+                "is deprecated. In a future version, this will return a non-sparse "
+                "array with the requested dtype. To retain the old behavior, use "
+                "`obj.astype(SparseDtype(dtype))`",
+                FutureWarning,
+                stacklevel=find_stack_level(),
+            )
+
         dtype = self.dtype.update_dtype(dtype)
         subtype = pandas_dtype(dtype._subtype_with_str)
         sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)

diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py
@@ -44,6 +44,8 @@ def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
         return arr
 
     if is_sparse(arr) and not is_sparse(dtype):
+        # TODO(2.0): remove special case once SparseArray.astype deprecation
+        #  is enforced.
         # problem case: SparseArray.astype(dtype) doesn't follow the specified
         # dtype exactly, but converts this to Sparse[dtype] -> first manually
         # convert to dense array

diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
@@ -529,7 +529,8 @@ def test_astype(self):
 
     def test_astype_bool(self):
         a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
-        result = a.astype(bool)
+        with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
+            result = a.astype(bool)
         expected = SparseArray(
             [True, False, False, True], dtype=SparseDtype(bool, False)
         )
@@ -546,7 +547,8 @@ def test_astype_all(self, any_real_numpy_dtype):
         vals = np.array([1, 2, 3])
         arr = SparseArray(vals, fill_value=1)
         typ = np.dtype(any_real_numpy_dtype)
-        res = arr.astype(typ)
+        with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
+            res = arr.astype(typ)
         assert res.dtype == SparseDtype(typ, 1)
         assert res.sp_values.dtype == typ
 
@@ -589,19 +591,30 @@ def test_astype_all(self, any_real_numpy_dtype):
         ],
     )
     def test_astype_more(self, arr, dtype, expected):
-        result = arr.astype(dtype)
+
+        if isinstance(dtype, SparseDtype):
+            warn = None
+        else:
+            warn = FutureWarning
+
+        with tm.assert_produces_warning(warn, match="astype from SparseDtype"):
+            result = arr.astype(dtype)
         tm.assert_sp_array_equal(result, expected)
 
     def test_astype_nan_raises(self):
         arr = SparseArray([1.0, np.nan])
         with pytest.raises(ValueError, match="Cannot convert non-finite"):
-            arr.astype(int)
+            msg = "astype from SparseDtype"
+            with tm.assert_produces_warning(FutureWarning, match=msg):
+                arr.astype(int)
 
     def test_astype_copy_false(self):
         # GH#34456 bug caused by using .view instead of .astype in astype_nansafe
         arr = SparseArray([1, 2, 3])
 
-        result = arr.astype(float, copy=False)
+        dtype = SparseDtype(float, 0)
+
+        result = arr.astype(dtype, copy=False)
         expected = SparseArray([1.0, 2.0, 3.0], fill_value=0.0)
         tm.assert_sp_array_equal(result, expected)
 

diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py
@@ -4,6 +4,7 @@
 import pytest
 
 import pandas as pd
+import pandas._testing as tm
 from pandas.api.extensions import ExtensionArray
 from pandas.core.internals.blocks import EABackedBlock
 from pandas.tests.extension.base.base import BaseExtensionTests
@@ -318,15 +319,22 @@ def test_unstack(self, data, index, obj):
                 alt = df.unstack(level=level).droplevel(0, axis=1)
                 self.assert_frame_equal(result, alt)
 
-            expected = ser.astype(object).unstack(
-                level=level, fill_value=data.dtype.na_value
-            )
-            if obj == "series" and not isinstance(ser.dtype, pd.SparseDtype):
+            if obj == "series":
+                is_sparse = isinstance(ser.dtype, pd.SparseDtype)
+            else:
+                is_sparse = isinstance(ser.dtypes.iat[0], pd.SparseDtype)
+            warn = None if not is_sparse else FutureWarning
+            with tm.assert_produces_warning(warn, match="astype from Sparse"):
+                obj_ser = ser.astype(object)
+
+            expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
+            if obj == "series" and not is_sparse:
                 # GH#34457 SparseArray.astype(object) gives Sparse[object]
                 #  instead of np.dtype(object)
                 assert (expected.dtypes == object).all()
 
-            result = result.astype(object)
+            with tm.assert_produces_warning(warn, match="astype from Sparse"):
+                result = result.astype(object)
 
             self.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py
@@ -150,6 +150,21 @@ def test_concat_mixed_dtypes(self, data):
         )
         self.assert_frame_equal(result, expected)
 
+    @pytest.mark.parametrize(
+        "columns",
+        [
+            ["A", "B"],
+            pd.MultiIndex.from_tuples(
+                [("A", "a"), ("A", "b")], names=["outer", "inner"]
+            ),
+        ],
+    )
+    def test_stack(self, data, columns):
+        with tm.assert_produces_warning(
+            FutureWarning, check_stacklevel=False, match="astype from Sparse"
+        ):
+            super().test_stack(data, columns)
+
     def test_concat_columns(self, data, na_value):
         self._check_unsupported(data)
         super().test_concat_columns(data, na_value)
@@ -394,7 +409,8 @@ def test_astype_object_series(self, all_data):
         # Unlike the base class, we do not expect the resulting Block
         #  to be ObjectBlock / resulting array to be np.dtype("object")
         ser = pd.Series(all_data, name="A")
-        result = ser.astype(object)
+        with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
+            result = ser.astype(object)
         assert is_object_dtype(result.dtype)
         assert is_object_dtype(result._mgr.array.dtype)
 
@@ -403,7 +419,8 @@ def test_astype_object_frame(self, all_data):
         #  to be ObjectBlock / resulting array to be np.dtype("object")
         df = pd.DataFrame({"A": all_data})
 
-        result = df.astype(object)
+        with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
+            result = df.astype(object)
         assert is_object_dtype(result._mgr.arrays[0].dtype)
 
         # earlier numpy raises TypeError on e.g. np.dtype(np.int64) == "Int64"
@@ -414,7 +431,8 @@ def test_astype_object_frame(self, all_data):
             assert not comp.any()
 
     def test_astype_str(self, data):
-        result = pd.Series(data[:5]).astype(str)
+        with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
+            result = pd.Series(data[:5]).astype(str)
         expected_dtype = SparseDtype(str, str(data.fill_value))
         expected = pd.Series([str(x) for x in data[:5]], dtype=expected_dtype)
         self.assert_series_equal(result, expected)
@@ -518,4 +536,5 @@ class TestParsing(BaseSparseTests, base.BaseParsingTests):
     def test_EA_types(self, engine, data):
         expected_msg = r".*must implement _from_sequence_of_strings.*"
         with pytest.raises(NotImplementedError, match=expected_msg):
-            super().test_EA_types(engine, data)
+            with tm.assert_produces_warning(FutureWarning, match="astype from"):
+                super().test_EA_types(engine, data)
diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py
@@ -231,9 +231,16 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp):
             ([1], pd.SparseDtype()),
         ],
     )
-    def test_other_dtypes(self, data, dtype):
+    def test_other_dtypes(self, data, dtype, using_array_manager):
         df = DataFrame(data, dtype=dtype)
-        result = df._append(df.iloc[0]).iloc[-1]
+
+        warn = None
+        if using_array_manager and isinstance(dtype, pd.SparseDtype):
+            warn = FutureWarning
+
+        with tm.assert_produces_warning(warn, match="astype from SparseDtype"):
+            result = df._append(df.iloc[0]).iloc[-1]
+
         expected = Series(data, name=0, dtype=dtype)
         tm.assert_series_equal(result, expected)
 

diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py
@@ -541,11 +541,10 @@ def test_concat_sparse():
 
 def test_concat_dense_sparse():
     # GH 30668
-    a = Series(pd.arrays.SparseArray([1, None]), dtype=float)
+    dtype = pd.SparseDtype(np.float64, None)
+    a = Series(pd.arrays.SparseArray([1, None]), dtype=dtype)
     b = Series([1], dtype=float)
-    expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(
-        pd.SparseDtype(np.float64, None)
-    )
+    expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(dtype)
     result = concat([a, b], axis=0)
     tm.assert_series_equal(result, expected)