Skip to content

Commit 45eb702

Browse files
authored
DEPR: require SparseDtype.fill_value be compatible with SparseDtype.subtype (#53043)
* DEPR: require SparseDtype.fill_value be compatible with SparseDtype.subtype
* filter more specific
1 parent fa69e14 commit 45eb702

File tree

5 files changed

+94
-51
lines changed

5 files changed

+94
-51
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,7 @@ Deprecations
261261
- Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`)
262262
- Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`)
263263
- Deprecated allowing ``downcast`` keyword other than ``None``, ``False``, "infer", or a dict with these as values in :meth:`Series.fillna`, :meth:`DataFrame.fillna` (:issue:`40988`)
264+
- Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`. In a future version, the ``fill_value`` will need to be compatible with the ``dtype.subtype``: either a scalar that can be held by that subtype, or ``NaN`` for integer or bool subtypes (:issue:`23124`)
264265
- Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
265266
-
266267

pandas/core/arrays/sparse/dtype.py

+40-12
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,23 @@
1818
ExtensionDtype,
1919
register_extension_dtype,
2020
)
21+
from pandas.core.dtypes.cast import can_hold_element
2122
from pandas.core.dtypes.common import (
2223
is_bool_dtype,
2324
is_object_dtype,
2425
is_scalar,
2526
is_string_dtype,
2627
pandas_dtype,
2728
)
29+
from pandas.core.dtypes.dtypes import CategoricalDtype
2830
from pandas.core.dtypes.missing import (
31+
is_valid_na_for_dtype,
2932
isna,
3033
na_value_for_dtype,
3134
)
3235

36+
from pandas.core.construction import ensure_wrapped_if_datetimelike
37+
3338
if TYPE_CHECKING:
3439
from pandas._typing import (
3540
Dtype,
@@ -164,18 +169,41 @@ def _check_fill_value(self):
164169
raise ValueError(
165170
f"fill_value must be a scalar. Got {self._fill_value} instead"
166171
)
167-
# TODO: Right now we can use Sparse boolean array
168-
# with any fill_value. Here was an attempt
169-
# to allow only 3 value: True, False or nan
170-
# but plenty test has failed.
171-
# see pull 44955
172-
# if self._is_boolean and not (
173-
# is_bool(self._fill_value) or isna(self._fill_value)
174-
# ):
175-
# raise ValueError(
176-
# "fill_value must be True, False or nan "
177-
# f"for boolean type. Got {self._fill_value} instead"
178-
# )
172+
173+
# GH#23124 require fill_value and subtype to match
174+
val = self._fill_value
175+
if isna(val):
176+
if not is_valid_na_for_dtype(val, self.subtype):
177+
warnings.warn(
178+
"Allowing arbitrary scalar fill_value in SparseDtype is "
179+
"deprecated. In a future version, the fill_value must be "
180+
"a valid value for the SparseDtype.subtype.",
181+
FutureWarning,
182+
stacklevel=find_stack_level(),
183+
)
184+
elif isinstance(self.subtype, CategoricalDtype):
185+
# TODO: is this even supported? It is reached in
186+
# test_dtype_sparse_with_fill_value_not_present_in_data
187+
if self.subtype.categories is None or val not in self.subtype.categories:
188+
warnings.warn(
189+
"Allowing arbitrary scalar fill_value in SparseDtype is "
190+
"deprecated. In a future version, the fill_value must be "
191+
"a valid value for the SparseDtype.subtype.",
192+
FutureWarning,
193+
stacklevel=find_stack_level(),
194+
)
195+
else:
196+
dummy = np.empty(0, dtype=self.subtype)
197+
dummy = ensure_wrapped_if_datetimelike(dummy)
198+
199+
if not can_hold_element(dummy, val):
200+
warnings.warn(
201+
"Allowing arbitrary scalar fill_value in SparseDtype is "
202+
"deprecated. In a future version, the fill_value must be "
203+
"a valid value for the SparseDtype.subtype.",
204+
FutureWarning,
205+
stacklevel=find_stack_level(),
206+
)
179207

180208
@property
181209
def _is_na_fill_value(self) -> bool:

pandas/tests/arrays/sparse/test_array.py

+8-20
Original file line numberDiff line numberDiff line change
@@ -52,33 +52,21 @@ def test_set_fill_value(self):
5252
arr.fill_value = 2
5353
assert arr.fill_value == 2
5454

55-
# TODO: this seems fine? You can construct an integer
56-
# sparsearray with NaN fill value, why not update one?
57-
# coerces to int
58-
# msg = "unable to set fill_value 3\\.1 to int64 dtype"
59-
# with pytest.raises(ValueError, match=msg):
60-
arr.fill_value = 3.1
55+
msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
56+
with tm.assert_produces_warning(FutureWarning, match=msg):
57+
arr.fill_value = 3.1
6158
assert arr.fill_value == 3.1
6259

63-
# msg = "unable to set fill_value nan to int64 dtype"
64-
# with pytest.raises(ValueError, match=msg):
6560
arr.fill_value = np.nan
6661
assert np.isnan(arr.fill_value)
6762

6863
arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
6964
arr.fill_value = True
70-
assert arr.fill_value
71-
72-
# FIXME: don't leave commented-out
73-
# coerces to bool
74-
# TODO: we can construct an sparse array of bool
75-
# type and use as fill_value any value
76-
# msg = "fill_value must be True, False or nan"
77-
# with pytest.raises(ValueError, match=msg):
78-
# arr.fill_value = 0
79-
80-
# msg = "unable to set fill_value nan to bool dtype"
81-
# with pytest.raises(ValueError, match=msg):
65+
assert arr.fill_value is True
66+
67+
with tm.assert_produces_warning(FutureWarning, match=msg):
68+
arr.fill_value = 0
69+
8270
arr.fill_value = np.nan
8371
assert np.isnan(arr.fill_value)
8472

pandas/tests/arrays/sparse/test_dtype.py

+12-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import re
2+
import warnings
23

34
import numpy as np
45
import pytest
@@ -67,15 +68,22 @@ def test_nans_equal():
6768
assert b == a
6869

6970

70-
@pytest.mark.parametrize(
71-
"a, b",
72-
[
71+
with warnings.catch_warnings():
72+
msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
73+
warnings.filterwarnings("ignore", msg, category=FutureWarning)
74+
75+
tups = [
7376
(SparseDtype("float64"), SparseDtype("float32")),
7477
(SparseDtype("float64"), SparseDtype("float64", 0)),
7578
(SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
7679
(SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
7780
(SparseDtype("float64"), np.dtype("float64")),
78-
],
81+
]
82+
83+
84+
@pytest.mark.parametrize(
85+
"a, b",
86+
tups,
7987
)
8088
def test_not_equal(a, b):
8189
assert a != b

pandas/tests/reshape/test_get_dummies.py

+33-15
Original file line numberDiff line numberDiff line change
@@ -57,7 +57,10 @@ def test_get_dummies_basic(self, sparse, dtype):
5757
dtype=self.effective_dtype(dtype),
5858
)
5959
if sparse:
60-
expected = expected.apply(SparseArray, fill_value=0.0)
60+
if dtype.kind == "b":
61+
expected = expected.apply(SparseArray, fill_value=False)
62+
else:
63+
expected = expected.apply(SparseArray, fill_value=0.0)
6164
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
6265
tm.assert_frame_equal(result, expected)
6366

@@ -142,7 +145,10 @@ def test_get_dummies_include_na(self, sparse, dtype):
142145
{"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
143146
)
144147
if sparse:
145-
exp = exp.apply(SparseArray, fill_value=0.0)
148+
if dtype.kind == "b":
149+
exp = exp.apply(SparseArray, fill_value=False)
150+
else:
151+
exp = exp.apply(SparseArray, fill_value=0.0)
146152
tm.assert_frame_equal(res, exp)
147153

148154
# Sparse dataframes do not allow nan labelled columns, see #GH8822
@@ -155,7 +161,10 @@ def test_get_dummies_include_na(self, sparse, dtype):
155161
# hack (NaN handling in assert_index_equal)
156162
exp_na.columns = res_na.columns
157163
if sparse:
158-
exp_na = exp_na.apply(SparseArray, fill_value=0.0)
164+
if dtype.kind == "b":
165+
exp_na = exp_na.apply(SparseArray, fill_value=False)
166+
else:
167+
exp_na = exp_na.apply(SparseArray, fill_value=0.0)
159168
tm.assert_frame_equal(res_na, exp_na)
160169

161170
res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype)
@@ -174,7 +183,7 @@ def test_get_dummies_unicode(self, sparse):
174183
{"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
175184
)
176185
if sparse:
177-
exp = exp.apply(SparseArray, fill_value=0)
186+
exp = exp.apply(SparseArray, fill_value=False)
178187
tm.assert_frame_equal(res, exp)
179188

180189
def test_dataframe_dummies_all_obj(self, df, sparse):
@@ -216,7 +225,10 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
216225
result = get_dummies(df, sparse=sparse, dtype=dtype)
217226
if sparse:
218227
arr = SparseArray
219-
typ = SparseDtype(dtype, 0)
228+
if dtype.kind == "b":
229+
typ = SparseDtype(dtype, False)
230+
else:
231+
typ = SparseDtype(dtype, 0)
220232
else:
221233
arr = np.array
222234
typ = dtype
@@ -296,7 +308,7 @@ def test_dataframe_dummies_subset(self, df, sparse):
296308
expected[["C"]] = df[["C"]]
297309
if sparse:
298310
cols = ["from_A_a", "from_A_b"]
299-
expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
311+
expected[cols] = expected[cols].astype(SparseDtype("bool", False))
300312
tm.assert_frame_equal(result, expected)
301313

302314
def test_dataframe_dummies_prefix_sep(self, df, sparse):
@@ -314,7 +326,7 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse):
314326
expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
315327
if sparse:
316328
cols = ["A..a", "A..b", "B..b", "B..c"]
317-
expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
329+
expected[cols] = expected[cols].astype(SparseDtype("bool", False))
318330

319331
tm.assert_frame_equal(result, expected)
320332

@@ -359,7 +371,7 @@ def test_dataframe_dummies_prefix_dict(self, sparse):
359371
columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
360372
expected[columns] = expected[columns].astype(bool)
361373
if sparse:
362-
expected[columns] = expected[columns].astype(SparseDtype("bool", 0))
374+
expected[columns] = expected[columns].astype(SparseDtype("bool", False))
363375

364376
tm.assert_frame_equal(result, expected)
365377

@@ -371,7 +383,10 @@ def test_dataframe_dummies_with_na(self, df, sparse, dtype):
371383

372384
if sparse:
373385
arr = SparseArray
374-
typ = SparseDtype(dtype, 0)
386+
if dtype.kind == "b":
387+
typ = SparseDtype(dtype, False)
388+
else:
389+
typ = SparseDtype(dtype, 0)
375390
else:
376391
arr = np.array
377392
typ = dtype
@@ -399,7 +414,10 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
399414
result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
400415
if sparse:
401416
arr = SparseArray
402-
typ = SparseDtype(dtype, 0)
417+
if dtype.kind == "b":
418+
typ = SparseDtype(dtype, False)
419+
else:
420+
typ = SparseDtype(dtype, 0)
403421
else:
404422
arr = np.array
405423
typ = dtype
@@ -456,7 +474,7 @@ def test_get_dummies_basic_drop_first(self, sparse):
456474

457475
result = get_dummies(s_list, drop_first=True, sparse=sparse)
458476
if sparse:
459-
expected = expected.apply(SparseArray, fill_value=0)
477+
expected = expected.apply(SparseArray, fill_value=False)
460478
tm.assert_frame_equal(result, expected)
461479

462480
result = get_dummies(s_series, drop_first=True, sparse=sparse)
@@ -490,7 +508,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
490508
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
491509
exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
492510
if sparse:
493-
exp = exp.apply(SparseArray, fill_value=0)
511+
exp = exp.apply(SparseArray, fill_value=False)
494512

495513
tm.assert_frame_equal(res, exp)
496514

@@ -499,7 +517,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
499517
["b", np.nan], axis=1
500518
)
501519
if sparse:
502-
exp_na = exp_na.apply(SparseArray, fill_value=0)
520+
exp_na = exp_na.apply(SparseArray, fill_value=False)
503521
tm.assert_frame_equal(res_na, exp_na)
504522

505523
res_just_na = get_dummies(
@@ -513,7 +531,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse):
513531
result = get_dummies(df, drop_first=True, sparse=sparse)
514532
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
515533
if sparse:
516-
expected = expected.apply(SparseArray, fill_value=0)
534+
expected = expected.apply(SparseArray, fill_value=False)
517535
tm.assert_frame_equal(result, expected)
518536

519537
def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
@@ -632,7 +650,7 @@ def test_get_dummies_duplicate_columns(self, df):
632650
def test_get_dummies_all_sparse(self):
633651
df = DataFrame({"A": [1, 2]})
634652
result = get_dummies(df, columns=["A"], sparse=True)
635-
dtype = SparseDtype("bool", 0)
653+
dtype = SparseDtype("bool", False)
636654
expected = DataFrame(
637655
{
638656
"A_1": SparseArray([1, 0], dtype=dtype),

0 commit comments

Comments
 (0)