diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 2152b640e67f5..8eab16714d2f6 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -140,6 +140,7 @@ Slicing on a :class:`DataFrame` will not be affected. Other Deprecations ^^^^^^^^^^^^^^^^^^ - Deprecated the keyword ``line_terminator`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv`, use ``lineterminator`` instead; this is for consistency with :func:`read_csv` and the standard library 'csv' module (:issue:`9568`) +- Deprecated behavior of :meth:`SparseArray.astype`, :meth:`Series.astype`, and :meth:`DataFrame.astype` with :class:`SparseDtype` when passing a non-sparse ``dtype``. In a future version, this will cast to that non-sparse dtype instead of wrapping it in a :class:`SparseDtype` (:issue:`34457`) - Deprecated behavior of :meth:`DatetimeIndex.intersection` and :meth:`DatetimeIndex.symmetric_difference` (``union`` behavior was already deprecated in version 1.3.0) with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`, :issue:`45357`) - Deprecated :meth:`DataFrame.iteritems`, :meth:`Series.iteritems`, :meth:`HDFStore.iteritems` in favor of :meth:`DataFrame.items`, :meth:`Series.items`, :meth:`HDFStore.items` (:issue:`45321`) - diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index ea75af20bb0b6..84360403fe9c8 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -848,8 +848,8 @@ def assert_extension_array_equal( left_na, right_na, obj="ExtensionArray NA mask", index_values=index_values ) - left_valid = np.asarray(left[~left_na].astype(object)) - right_valid = np.asarray(right[~right_na].astype(object)) + left_valid = left[~left_na].to_numpy(dtype=object) + right_valid = right[~right_na].to_numpy(dtype=object) if check_exact: assert_numpy_array_equal( left_valid, right_valid, obj="ExtensionArray", index_values=index_values diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index a5451f96cd0e1..18621fa9fb68a 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -1264,6 +1264,19 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True): return self else: return self.copy() + + future_dtype = pandas_dtype(dtype) + if not isinstance(future_dtype, SparseDtype): + # GH#34457 + warnings.warn( + "The behavior of .astype from SparseDtype to a non-sparse dtype " + "is deprecated. In a future version, this will return a non-sparse " + "array with the requested dtype. To retain the old behavior, use " + "`obj.astype(SparseDtype(dtype))`", + FutureWarning, + stacklevel=find_stack_level(), + ) + dtype = self.dtype.update_dtype(dtype) subtype = pandas_dtype(dtype._subtype_with_str) sp_values = astype_nansafe(self.sp_values, subtype, copy=copy) diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index d3d755b205878..a547f047793a9 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -44,6 +44,8 @@ def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike: return arr if is_sparse(arr) and not is_sparse(dtype): + # TODO(2.0): remove special case once SparseArray.astype deprecation + # is enforced. # problem case: SparseArray.astype(dtype) doesn't follow the specified # dtype exactly, but converts this to Sparse[dtype] -> first manually # convert to dense array diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 0ebe03d9a1198..a8517535e7833 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -529,7 +529,8 @@ def test_astype(self): def test_astype_bool(self): a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0)) - result = a.astype(bool) + with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"): + result = a.astype(bool) expected = SparseArray( [True, False, False, True], dtype=SparseDtype(bool, False) ) @@ -546,7 +547,8 @@ def test_astype_all(self, any_real_numpy_dtype): vals = np.array([1, 2, 3]) arr = SparseArray(vals, fill_value=1) typ = np.dtype(any_real_numpy_dtype) - res = arr.astype(typ) + with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"): + res = arr.astype(typ) assert res.dtype == SparseDtype(typ, 1) assert res.sp_values.dtype == typ @@ -589,19 +591,30 @@ def test_astype_all(self, any_real_numpy_dtype): ], ) def test_astype_more(self, arr, dtype, expected): - result = arr.astype(dtype) + + if isinstance(dtype, SparseDtype): + warn = None + else: + warn = FutureWarning + + with tm.assert_produces_warning(warn, match="astype from SparseDtype"): + result = arr.astype(dtype) tm.assert_sp_array_equal(result, expected) def test_astype_nan_raises(self): arr = SparseArray([1.0, np.nan]) with pytest.raises(ValueError, match="Cannot convert non-finite"): - arr.astype(int) + msg = "astype from SparseDtype" + with tm.assert_produces_warning(FutureWarning, match=msg): + arr.astype(int) def test_astype_copy_false(self): # GH#34456 bug caused by using .view instead of .astype in astype_nansafe arr = SparseArray([1, 2, 3]) - result = arr.astype(float, copy=False) + dtype = SparseDtype(float, 0) + + result = arr.astype(dtype, copy=False) expected = SparseArray([1.0, 2.0, 3.0], fill_value=0.0) tm.assert_sp_array_equal(result, expected) diff --git a/pandas/tests/extension/base/reshaping.py b/pandas/tests/extension/base/reshaping.py index 45bc5b7caaf6f..babb2868a4421 100644 --- a/pandas/tests/extension/base/reshaping.py +++ b/pandas/tests/extension/base/reshaping.py @@ -4,6 +4,7 @@ import pytest import pandas as pd +import pandas._testing as tm from pandas.api.extensions import ExtensionArray from pandas.core.internals.blocks import EABackedBlock from pandas.tests.extension.base.base import BaseExtensionTests @@ -318,15 +319,22 @@ def test_unstack(self, data, index, obj): alt = df.unstack(level=level).droplevel(0, axis=1) self.assert_frame_equal(result, alt) - expected = ser.astype(object).unstack( - level=level, fill_value=data.dtype.na_value - ) - if obj == "series" and not isinstance(ser.dtype, pd.SparseDtype): + if obj == "series": + is_sparse = isinstance(ser.dtype, pd.SparseDtype) + else: + is_sparse = isinstance(ser.dtypes.iat[0], pd.SparseDtype) + warn = None if not is_sparse else FutureWarning + with tm.assert_produces_warning(warn, match="astype from Sparse"): + obj_ser = ser.astype(object) + + expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value) + if obj == "series" and not is_sparse: # GH#34457 SparseArray.astype(object) gives Sparse[object] # instead of np.dtype(object) assert (expected.dtypes == object).all() - result = result.astype(object) + with tm.assert_produces_warning(warn, match="astype from Sparse"): + result = result.astype(object) self.assert_frame_equal(result, expected) diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 5e2f452009e92..4f453986ad2c3 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -150,6 +150,21 @@ def test_concat_mixed_dtypes(self, data): ) self.assert_frame_equal(result, expected) + @pytest.mark.parametrize( + "columns", + [ + ["A", "B"], + pd.MultiIndex.from_tuples( + [("A", "a"), ("A", "b")], names=["outer", "inner"] + ), + ], + ) + def test_stack(self, data, columns): + with tm.assert_produces_warning( + FutureWarning, check_stacklevel=False, match="astype from Sparse" + ): + super().test_stack(data, columns) + def test_concat_columns(self, data, na_value): self._check_unsupported(data) super().test_concat_columns(data, na_value) @@ -394,7 +409,8 @@ def test_astype_object_series(self, all_data): # Unlike the base class, we do not expect the resulting Block # to be ObjectBlock / resulting array to be np.dtype("object") ser = pd.Series(all_data, name="A") - result = ser.astype(object) + with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"): + result = ser.astype(object) assert is_object_dtype(result.dtype) assert is_object_dtype(result._mgr.array.dtype) @@ -403,7 +419,8 @@ def test_astype_object_frame(self, all_data): # to be ObjectBlock / resulting array to be np.dtype("object") df = pd.DataFrame({"A": all_data}) - result = df.astype(object) + with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"): + result = df.astype(object) assert is_object_dtype(result._mgr.arrays[0].dtype) # earlier numpy raises TypeError on e.g. np.dtype(np.int64) == "Int64" @@ -414,7 +431,8 @@ def test_astype_object_frame(self, all_data): assert not comp.any() def test_astype_str(self, data): - result = pd.Series(data[:5]).astype(str) + with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"): + result = pd.Series(data[:5]).astype(str) expected_dtype = SparseDtype(str, str(data.fill_value)) expected = pd.Series([str(x) for x in data[:5]], dtype=expected_dtype) self.assert_series_equal(result, expected) @@ -518,4 +536,5 @@ class TestParsing(BaseSparseTests, base.BaseParsingTests): def test_EA_types(self, engine, data): expected_msg = r".*must implement _from_sequence_of_strings.*" with pytest.raises(NotImplementedError, match=expected_msg): - super().test_EA_types(engine, data) + with tm.assert_produces_warning(FutureWarning, match="astype from"): + super().test_EA_types(engine, data) diff --git a/pandas/tests/frame/methods/test_append.py b/pandas/tests/frame/methods/test_append.py index 5cfad472e0134..d1c9c379759b5 100644 --- a/pandas/tests/frame/methods/test_append.py +++ b/pandas/tests/frame/methods/test_append.py @@ -231,9 +231,16 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp): ([1], pd.SparseDtype()), ], ) - def test_other_dtypes(self, data, dtype): + def test_other_dtypes(self, data, dtype, using_array_manager): df = DataFrame(data, dtype=dtype) - result = df._append(df.iloc[0]).iloc[-1] + + warn = None + if using_array_manager and isinstance(dtype, pd.SparseDtype): + warn = FutureWarning + + with tm.assert_produces_warning(warn, match="astype from SparseDtype"): + result = df._append(df.iloc[0]).iloc[-1] + expected = Series(data, name=0, dtype=dtype) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/reshape/concat/test_concat.py b/pandas/tests/reshape/concat/test_concat.py index a7b3c77e6ea0a..cafb119f0c4ba 100644 --- a/pandas/tests/reshape/concat/test_concat.py +++ b/pandas/tests/reshape/concat/test_concat.py @@ -541,11 +541,10 @@ def test_concat_sparse(): def test_concat_dense_sparse(): # GH 30668 - a = Series(pd.arrays.SparseArray([1, None]), dtype=float) + dtype = pd.SparseDtype(np.float64, None) + a = Series(pd.arrays.SparseArray([1, None]), dtype=dtype) b = Series([1], dtype=float) - expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype( - pd.SparseDtype(np.float64, None) - ) + expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(dtype) result = concat([a, b], axis=0) tm.assert_series_equal(result, expected)