Skip to content

Commit 2187ddf

Browse files
committed
DEPR: SparseArray.astype(dtype) respect non-sparse dtype
1 parent ad19057 commit 2187ddf

File tree

8 files changed

+75
-20
lines changed

8 files changed

+75
-20
lines changed

doc/source/whatsnew/v1.5.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,7 @@ Other API changes
9595
Deprecations
9696
~~~~~~~~~~~~
9797
- Deprecated the keyword ``line_terminator`` in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv`, use ``lineterminator`` instead; this is for consistency with :func:`read_csv` and the standard library 'csv' module (:issue:`9568`)
98+
- Deprecated behavior of :meth:`SparseArray.astype`, :meth:`Series.astype`, and :meth:`DataFrame.astype` on data with :class:`SparseDtype` when passing a non-sparse ``dtype``. In a future version, this will cast to that non-sparse dtype instead of wrapping it in a :class:`SparseDtype`. To retain the old behavior, use ``obj.astype(SparseDtype(dtype))`` (:issue:`34457`)
9899
-
99100

100101
.. ---------------------------------------------------------------------------

pandas/_testing/asserters.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -848,8 +848,8 @@ def assert_extension_array_equal(
848848
left_na, right_na, obj="ExtensionArray NA mask", index_values=index_values
849849
)
850850

851-
left_valid = np.asarray(left[~left_na].astype(object))
852-
right_valid = np.asarray(right[~right_na].astype(object))
851+
left_valid = left[~left_na].to_numpy(dtype=object)
852+
right_valid = right[~right_na].to_numpy(dtype=object)
853853
if check_exact:
854854
assert_numpy_array_equal(
855855
left_valid, right_valid, obj="ExtensionArray", index_values=index_values

pandas/core/arrays/sparse/array.py

+13
Original file line numberDiff line numberDiff line change
@@ -1264,6 +1264,19 @@ def astype(self, dtype: AstypeArg | None = None, copy: bool = True):
12641264
return self
12651265
else:
12661266
return self.copy()
1267+
1268+
future_dtype = pandas_dtype(dtype)
1269+
if not isinstance(future_dtype, SparseDtype):
1270+
# GH#34457
1271+
warnings.warn(
1272+
"The behavior of .astype from SparseDtype to a non-sparse dtype "
1273+
"is deprecated. In a future version, this will return a non-sparse "
1274+
"array with the requested dtype. To retain the old behavior, use "
1275+
"`obj.astype(SparseDtype(dtype))`",
1276+
FutureWarning,
1277+
stacklevel=find_stack_level(),
1278+
)
1279+
12671280
dtype = self.dtype.update_dtype(dtype)
12681281
subtype = pandas_dtype(dtype._subtype_with_str)
12691282
sp_values = astype_nansafe(self.sp_values, subtype, copy=copy)

pandas/core/dtypes/concat.py

+2
Original file line numberDiff line numberDiff line change
@@ -53,6 +53,8 @@ def cast_to_common_type(arr: ArrayLike, dtype: DtypeObj) -> ArrayLike:
5353
return arr.astype(object, copy=False)
5454

5555
if is_sparse(arr) and not is_sparse(dtype):
56+
# TODO(2.0): remove special case once SparseArray.astype deprecation
57+
# is enforced.
5658
# problem case: SparseArray.astype(dtype) doesn't follow the specified
5759
# dtype exactly, but converts this to Sparse[dtype] -> first manually
5860
# convert to dense array

pandas/tests/arrays/sparse/test_array.py

+18-5
Original file line numberDiff line numberDiff line change
@@ -529,7 +529,8 @@ def test_astype(self):
529529

530530
def test_astype_bool(self):
531531
a = SparseArray([1, 0, 0, 1], dtype=SparseDtype(int, 0))
532-
result = a.astype(bool)
532+
with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
533+
result = a.astype(bool)
533534
expected = SparseArray(
534535
[True, False, False, True], dtype=SparseDtype(bool, False)
535536
)
@@ -546,7 +547,8 @@ def test_astype_all(self, any_real_numpy_dtype):
546547
vals = np.array([1, 2, 3])
547548
arr = SparseArray(vals, fill_value=1)
548549
typ = np.dtype(any_real_numpy_dtype)
549-
res = arr.astype(typ)
550+
with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
551+
res = arr.astype(typ)
550552
assert res.dtype == SparseDtype(typ, 1)
551553
assert res.sp_values.dtype == typ
552554

@@ -589,19 +591,30 @@ def test_astype_all(self, any_real_numpy_dtype):
589591
],
590592
)
591593
def test_astype_more(self, arr, dtype, expected):
592-
result = arr.astype(dtype)
594+
595+
if isinstance(dtype, SparseDtype):
596+
warn = None
597+
else:
598+
warn = FutureWarning
599+
600+
with tm.assert_produces_warning(warn, match="astype from SparseDtype"):
601+
result = arr.astype(dtype)
593602
tm.assert_sp_array_equal(result, expected)
594603

595604
def test_astype_nan_raises(self):
596605
arr = SparseArray([1.0, np.nan])
597606
with pytest.raises(ValueError, match="Cannot convert non-finite"):
598-
arr.astype(int)
607+
msg = "astype from SparseDtype"
608+
with tm.assert_produces_warning(FutureWarning, match=msg):
609+
arr.astype(int)
599610

600611
def test_astype_copy_false(self):
601612
# GH#34456 bug caused by using .view instead of .astype in astype_nansafe
602613
arr = SparseArray([1, 2, 3])
603614

604-
result = arr.astype(float, copy=False)
615+
dtype = SparseDtype(float, 0)
616+
617+
result = arr.astype(dtype, copy=False)
605618
expected = SparseArray([1.0, 2.0, 3.0], fill_value=0.0)
606619
tm.assert_sp_array_equal(result, expected)
607620

pandas/tests/extension/base/reshaping.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
import pytest
55

66
import pandas as pd
7+
import pandas._testing as tm
78
from pandas.api.extensions import ExtensionArray
89
from pandas.core.internals.blocks import EABackedBlock
910
from pandas.tests.extension.base.base import BaseExtensionTests
@@ -318,15 +319,22 @@ def test_unstack(self, data, index, obj):
318319
alt = df.unstack(level=level).droplevel(0, axis=1)
319320
self.assert_frame_equal(result, alt)
320321

321-
expected = ser.astype(object).unstack(
322-
level=level, fill_value=data.dtype.na_value
323-
)
324-
if obj == "series" and not isinstance(ser.dtype, pd.SparseDtype):
322+
if obj == "series":
323+
is_sparse = isinstance(ser.dtype, pd.SparseDtype)
324+
else:
325+
is_sparse = isinstance(ser.dtypes.iat[0], pd.SparseDtype)
326+
warn = None if not is_sparse else FutureWarning
327+
with tm.assert_produces_warning(warn, match="astype from Sparse"):
328+
obj_ser = ser.astype(object)
329+
330+
expected = obj_ser.unstack(level=level, fill_value=data.dtype.na_value)
331+
if obj == "series" and not is_sparse:
325332
# GH#34457 SparseArray.astype(object) gives Sparse[object]
326333
# instead of np.dtype(object)
327334
assert (expected.dtypes == object).all()
328335

329-
result = result.astype(object)
336+
with tm.assert_produces_warning(warn, match="astype from Sparse"):
337+
result = result.astype(object)
330338

331339
self.assert_frame_equal(result, expected)
332340

pandas/tests/extension/test_sparse.py

+23-4
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,21 @@ def test_concat_mixed_dtypes(self, data):
150150
)
151151
self.assert_frame_equal(result, expected)
152152

153+
@pytest.mark.parametrize(
154+
"columns",
155+
[
156+
["A", "B"],
157+
pd.MultiIndex.from_tuples(
158+
[("A", "a"), ("A", "b")], names=["outer", "inner"]
159+
),
160+
],
161+
)
162+
def test_stack(self, data, columns):
163+
with tm.assert_produces_warning(
164+
FutureWarning, check_stacklevel=False, match="astype from Sparse"
165+
):
166+
super().test_stack(data, columns)
167+
153168
def test_concat_columns(self, data, na_value):
154169
self._check_unsupported(data)
155170
super().test_concat_columns(data, na_value)
@@ -394,7 +409,8 @@ def test_astype_object_series(self, all_data):
394409
# Unlike the base class, we do not expect the resulting Block
395410
# to be ObjectBlock / resulting array to be np.dtype("object")
396411
ser = pd.Series(all_data, name="A")
397-
result = ser.astype(object)
412+
with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
413+
result = ser.astype(object)
398414
assert is_object_dtype(result.dtype)
399415
assert is_object_dtype(result._mgr.array.dtype)
400416

@@ -403,7 +419,8 @@ def test_astype_object_frame(self, all_data):
403419
# to be ObjectBlock / resulting array to be np.dtype("object")
404420
df = pd.DataFrame({"A": all_data})
405421

406-
result = df.astype(object)
422+
with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
423+
result = df.astype(object)
407424
assert is_object_dtype(result._mgr.arrays[0].dtype)
408425

409426
# earlier numpy raises TypeError on e.g. np.dtype(np.int64) == "Int64"
@@ -414,7 +431,8 @@ def test_astype_object_frame(self, all_data):
414431
assert not comp.any()
415432

416433
def test_astype_str(self, data):
417-
result = pd.Series(data[:5]).astype(str)
434+
with tm.assert_produces_warning(FutureWarning, match="astype from Sparse"):
435+
result = pd.Series(data[:5]).astype(str)
418436
expected_dtype = SparseDtype(str, str(data.fill_value))
419437
expected = pd.Series([str(x) for x in data[:5]], dtype=expected_dtype)
420438
self.assert_series_equal(result, expected)
@@ -518,4 +536,5 @@ class TestParsing(BaseSparseTests, base.BaseParsingTests):
518536
def test_EA_types(self, engine, data):
519537
expected_msg = r".*must implement _from_sequence_of_strings.*"
520538
with pytest.raises(NotImplementedError, match=expected_msg):
521-
super().test_EA_types(engine, data)
539+
with tm.assert_produces_warning(FutureWarning, match="astype from"):
540+
super().test_EA_types(engine, data)

pandas/tests/reshape/concat/test_concat.py

+3-4
Original file line numberDiff line numberDiff line change
@@ -541,11 +541,10 @@ def test_concat_sparse():
541541

542542
def test_concat_dense_sparse():
543543
# GH 30668
544-
a = Series(pd.arrays.SparseArray([1, None]), dtype=float)
544+
dtype = pd.SparseDtype(np.float64, None)
545+
a = Series(pd.arrays.SparseArray([1, None]), dtype=dtype)
545546
b = Series([1], dtype=float)
546-
expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(
547-
pd.SparseDtype(np.float64, None)
548-
)
547+
expected = Series(data=[1, None, 1], index=[0, 1, 0]).astype(dtype)
549548
result = concat([a, b], axis=0)
550549
tm.assert_series_equal(result, expected)
551550

0 commit comments

Comments
 (0)