Skip to content

DEPR: SparseArray.astype(dtype) respect non-sparse dtype #45339

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jan 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.5.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ Slicing on a :class:`DataFrame` will not be affected.
Other Deprecations
^^^^^^^^^^^^^^^^^^
- Deprecated the keyword ``line_terminator`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv`, use ``lineterminator`` instead; this is for consistency with :func:`read_csv` and the standard library 'csv' module (:issue:`9568`)
- Deprecated behavior of :meth:`SparseArray.astype`, :meth:`Series.astype`, and :meth:`DataFrame.astype` with :class:`SparseDtype` when passing a non-sparse ``dtype``. In a future version, this will cast to that non-sparse dtype instead of wrapping it in a :class:`SparseDtype` (:issue:`34457`)
- Deprecated behavior of :meth:`DatetimeIndex.intersection` and :meth:`DatetimeIndex.symmetric_difference` (``union`` behavior was already deprecated in version 1.3.0) with mixed timezones; in a future version both will be cast to UTC instead of object dtype (:issue:`39328`, :issue:`45357`)
- Deprecated :meth:`DataFrame.iteritems`, :meth:`Series.iteritems`, :meth:`HDFStore.iteritems` in favor of :meth:`DataFrame.items`, :meth:`Series.items`, :meth:`HDFStore.items` (:issue:`45321`)
-
Expand Down
4 changes: 2 additions & 2 deletions pandas/_testing/asserters.py
Original file line number Diff line number Diff line change
Expand Up @@ -848,8 +848,8 @@ def assert_extension_array_equal(
left_na, right_na, obj="ExtensionArray NA mask", index_values=index_values
)

left_valid = np.asarray(left[~left_na].astype(object))
right_valid = np.asarray(right[~right_na].astype(object))
left_valid = left[~left_na].to_numpy(dtype=object)
right_valid = right[~right_na].to_numpy(dtype=object)
if check_exact:
assert_numpy_array_equal(
left_valid, right_valid, obj="ExtensionArray", index_values=index_values
Expand Down
13 changes: 13 additions & 0 deletions pandas/core/arrays/sparse/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -1264,6 +1264,19 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
return self
else:
return self.copy()

future_dtype = pandas_dtype(dtype)
if not isinstance(future_dtype, SparseDtype):
# GH#34457
warnings.warn(
"The behavior of .astype from SparseDtype to a non-sparse dtype "
"is deprecated. In a future version, this will return a non-sparse "
"array with the requested dtype. To retain the old behavior, use "
"`obj.astype(SparseDtype(dtype))`",
FutureWarning,
stacklevel=find_stack_level(),
)

dtype = self.dtype.update_dtype(dtype)
subtype = pandas_dtype(dtype._subtype_with_str)
sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)
Expand Down
2 changes: 2 additions & 0 deletions pandas/core/dtypes/concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,8 @@ def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
return arr

if is_sparse(arr) and not is_sparse(dtype):
# TODO(2.0): remove special case once SparseArray.astype deprecation
# is enforced.
# Problem case: SparseArray.astype(dtype) does not return the requested
# dtype exactly but wraps it as Sparse[dtype], so first convert the
# array manually to a dense array.
Expand Down
23 changes: 18 additions & 5 deletions pandas/tests/arrays/sparse/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -529,7 +529,8 @@ def test_astype(self):

def test_astype_bool(self):
a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
result = a.astype(bool)
with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
result = a.astype(bool)
expected = SparseArray(
[True, False, False, True], dtype=SparseDtype(bool, False)
)
Expand All @@ -546,7 +547,8 @@ def test_astype_all(self, any_real_numpy_dtype):
vals = np.array([1, 2, 3])
arr = SparseArray(vals, fill_value=1)
typ = np.dtype(any_real_numpy_dtype)
res = arr.astype(typ)
with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
res = arr.astype(typ)
assert res.dtype == SparseDtype(typ, 1)
assert res.sp_values.dtype == typ

Expand Down Expand Up @@ -589,19 +591,30 @@ def test_astype_all(self, any_real_numpy_dtype):
],
)
def test_astype_more(self, arr, dtype, expected):
result = arr.astype(dtype)

if isinstance(dtype, SparseDtype):
warn = None
else:
warn = FutureWarning

with tm.assert_produces_warning(warn, match="astype from SparseDtype"):
result = arr.astype(dtype)
tm.assert_sp_array_equal(result, expected)

def test_astype_nan_raises(self):
arr = SparseArray([1.0, np.nan])
with pytest.raises(ValueError, match="Cannot convert non-finite"):
arr.astype(int)
msg = "astype from SparseDtype"
with tm.assert_produces_warning(FutureWarning, match=msg):
arr.astype(int)

def test_astype_copy_false(self):
# GH#34456 bug caused by using .view instead of .astype in astype_nansafe
arr = SparseArray([1, 2, 3])

result = arr.astype(float, copy=False)
dtype = SparseDtype(float, 0)

result = arr.astype(dtype, copy=False)
expected = SparseArray([1.0, 2.0, 3.0], fill_value=0.0)
tm.assert_sp_array_equal(result, expected)

Expand Down
18 changes: 13 additions & 5 deletions pandas/tests/extension/base/reshaping.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import pytest

import pandas as pd
import pandas._testing as tm
from pandas.api.extensions import ExtensionArray
from pandas.core.internals.blocks import EABackedBlock
from pandas.tests.extension.base.base import BaseExtensionTests
Expand Down Expand Up @@ -318,15 +319,22 @@ def test_unstack(self, data, index, obj):
alt = df.unstack(level=level).droplevel(0, axis=1)
self.assert_frame_equal(result, alt)

expected = ser.astype(object).unstack(
level=level, fill_value=data.dtype.na_value
)
if obj == "series" and not isinstance(ser.dtype, pd.SparseDtype):
if obj == "series":
is_sparse = isinstance(ser.dtype, pd.SparseDtype)
else:
is_sparse = isinstance(ser.dtypes.iat[0], pd.SparseDtype)
warn = None if not is_sparse else FutureWarning
with tm.assert_produces_warning(warn, match="astype from Sparse"):
obj_ser = ser.astype(object)

expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
if obj == "series" and not is_sparse:
# GH#34457 the sparse case is excluded here because
# SparseArray.astype(object) gives Sparse[object] instead of
# np.dtype(object)
assert (expected.dtypes == object).all()

result = result.astype(object)
with tm.assert_produces_warning(warn, match="astype from Sparse"):
result = result.astype(object)

self.assert_frame_equal(result, expected)

Expand Down
27 changes: 23 additions & 4 deletions pandas/tests/extension/test_sparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,21 @@ def test_concat_mixed_dtypes(self, data):
)
self.assert_frame_equal(result, expected)

@pytest.mark.parametrize(
"columns",
[
["A", "B"],
pd.MultiIndex.from_tuples(
[("A", "a"), ("A", "b")], names=["outer", "inner"]
),
],
)
def test_stack(self, data, columns):
with tm.assert_produces_warning(
FutureWarning, check_stacklevel=False, match="astype from Sparse"
):
super().test_stack(data, columns)

def test_concat_columns(self, data, na_value):
self._check_unsupported(data)
super().test_concat_columns(data, na_value)
Expand Down Expand Up @@ -394,7 +409,8 @@ def test_astype_object_series(self, all_data):
# Unlike the base class, we do not expect the resulting Block
# to be ObjectBlock / resulting array to be np.dtype("object")
ser = pd.Series(all_data, name="A")
result = ser.astype(object)
with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
result = ser.astype(object)
assert is_object_dtype(result.dtype)
assert is_object_dtype(result._mgr.array.dtype)

Expand All @@ -403,7 +419,8 @@ def test_astype_object_frame(self, all_data):
# to be ObjectBlock / resulting array to be np.dtype("object")
df = pd.DataFrame({"A": all_data})

result = df.astype(object)
with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
result = df.astype(object)
assert is_object_dtype(result._mgr.arrays[0].dtype)

# earlier numpy raises TypeError on e.g. np.dtype(np.int64) == "Int64"
Expand All @@ -414,7 +431,8 @@ def test_astype_object_frame(self, all_data):
assert not comp.any()

def test_astype_str(self, data):
result = pd.Series(data[:5]).astype(str)
with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
result = pd.Series(data[:5]).astype(str)
expected_dtype = SparseDtype(str, str(data.fill_value))
expected = pd.Series([str(x) for x in data[:5]], dtype=expected_dtype)
self.assert_series_equal(result, expected)
Expand Down Expand Up @@ -518,4 +536,5 @@ class TestParsing(BaseSparseTests, base.BaseParsingTests):
def test_EA_types(self, engine, data):
expected_msg = r".*must implement _from_sequence_of_strings.*"
with pytest.raises(NotImplementedError, match=expected_msg):
super().test_EA_types(engine, data)
with tm.assert_produces_warning(FutureWarning, match="astype from"):
super().test_EA_types(engine, data)
11 changes: 9 additions & 2 deletions pandas/tests/frame/methods/test_append.py
Original file line number Diff line number Diff line change
Expand Up @@ -231,9 +231,16 @@ def test_append_timestamps_aware_or_naive(self, tz_naive_fixture, timestamp):
([1], pd.SparseDtype()),
],
)
def test_other_dtypes(self, data, dtype):
def test_other_dtypes(self, data, dtype, using_array_manager):
df = DataFrame(data, dtype=dtype)
result = df._append(df.iloc[0]).iloc[-1]

warn = None
if using_array_manager and isinstance(dtype, pd.SparseDtype):
warn = FutureWarning

with tm.assert_produces_warning(warn, match="astype from SparseDtype"):
result = df._append(df.iloc[0]).iloc[-1]

expected = Series(data, name=0, dtype=dtype)
tm.assert_series_equal(result, expected)

Expand Down
7 changes: 3 additions & 4 deletions pandas/tests/reshape/concat/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -541,11 +541,10 @@ def test_concat_sparse():

def test_concat_dense_sparse():
# GH 30668
a = Series(pd.arrays.SparseArray([1, None]), dtype=float)
dtype = pd.SparseDtype(np.float64, None)
a = Series(pd.arrays.SparseArray([1, None]), dtype=dtype)
b = Series([1], dtype=float)
expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(
pd.SparseDtype(np.float64, None)
)
expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(dtype)
result = concat([a, b], axis=0)
tm.assert_series_equal(result, expected)

Expand Down