DEPR: require SparseDtype.fill_value be compatible with SparseDtype.subtype (#53043)

jbrockmendel · web-flow · commit 45eb702c2a38 · 2023-05-11T13:24:01.000-07:00
* DEPR: require SparseDtype.fill_value be compatible with SparseDtype.subtype

* filter more specific
diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
@@ -261,6 +261,7 @@ Deprecations
 - Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`)
 - Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`)
 - Deprecated allowing ``downcast`` keyword other than ``None``, ``False``, "infer", or a dict with these as values in :meth:`Series.fillna`, :meth:`DataFrame.fillna` (:issue:`40988`)
+- Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`)
 - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
 -
 
diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py
@@ -18,18 +18,23 @@
     ExtensionDtype,
     register_extension_dtype,
 )
+from pandas.core.dtypes.cast import can_hold_element
 from pandas.core.dtypes.common import (
     is_bool_dtype,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
     pandas_dtype,
 )
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import (
+    is_valid_na_for_dtype,
     isna,
     na_value_for_dtype,
 )
 
+from pandas.core.construction import ensure_wrapped_if_datetimelike
+
 if TYPE_CHECKING:
     from pandas._typing import (
         Dtype,
@@ -164,18 +169,41 @@ def _check_fill_value(self):
             raise ValueError(
                 f"fill_value must be a scalar. Got {self._fill_value} instead"
             )
-        # TODO: Right now we can use Sparse boolean array
-        #       with any fill_value. Here was an attempt
-        #       to allow only 3 value: True, False or nan
-        #       but plenty test has failed.
-        # see pull 44955
-        # if self._is_boolean and not (
-        #    is_bool(self._fill_value) or isna(self._fill_value)
-        # ):
-        #    raise ValueError(
-        #        "fill_value must be True, False or nan "
-        #        f"for boolean type. Got {self._fill_value} instead"
-        #    )
+
+        # GH#23124 require fill_value and subtype to match
+        val = self._fill_value
+        if isna(val):
+            if not is_valid_na_for_dtype(val, self.subtype):
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+        elif isinstance(self.subtype, CategoricalDtype):
+            # TODO: is this even supported?  It is reached in
+            #  test_dtype_sparse_with_fill_value_not_present_in_data
+            if self.subtype.categories is None or val not in self.subtype.categories:
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+        else:
+            dummy = np.empty(0, dtype=self.subtype)
+            dummy = ensure_wrapped_if_datetimelike(dummy)
+
+            if not can_hold_element(dummy, val):
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
 
     @property
     def _is_na_fill_value(self) -> bool:
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
@@ -52,33 +52,21 @@ def test_set_fill_value(self):
         arr.fill_value = 2
         assert arr.fill_value == 2
 
-        # TODO: this seems fine? You can construct an integer
-        # sparsearray with NaN fill value, why not update one?
-        # coerces to int
-        # msg = "unable to set fill_value 3\\.1 to int64 dtype"
-        # with pytest.raises(ValueError, match=msg):
-        arr.fill_value = 3.1
+        msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            arr.fill_value = 3.1
         assert arr.fill_value == 3.1
 
-        # msg = "unable to set fill_value nan to int64 dtype"
-        # with pytest.raises(ValueError, match=msg):
         arr.fill_value = np.nan
         assert np.isnan(arr.fill_value)
 
         arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
         arr.fill_value = True
-        assert arr.fill_value
-
-        # FIXME: don't leave commented-out
-        # coerces to bool
-        # TODO: we can construct an sparse array of bool
-        #      type and use as fill_value any value
-        # msg = "fill_value must be True, False or nan"
-        # with pytest.raises(ValueError, match=msg):
-        #    arr.fill_value = 0
-
-        # msg = "unable to set fill_value nan to bool dtype"
-        # with pytest.raises(ValueError, match=msg):
+        assert arr.fill_value is True
+
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            arr.fill_value = 0
+
         arr.fill_value = np.nan
         assert np.isnan(arr.fill_value)
 
diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py
@@ -1,4 +1,5 @@
 import re
+import warnings
 
 import numpy as np
 import pytest
@@ -67,15 +68,22 @@ def test_nans_equal():
     assert b == a
 
 
-@pytest.mark.parametrize(
-    "a, b",
-    [
+with warnings.catch_warnings():
+    msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
+    warnings.filterwarnings("ignore", msg, category=FutureWarning)
+
+    tups = [
         (SparseDtype("float64"), SparseDtype("float32")),
         (SparseDtype("float64"), SparseDtype("float64", 0)),
         (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
         (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
         (SparseDtype("float64"), np.dtype("float64")),
-    ],
+    ]
+
+
+@pytest.mark.parametrize(
+    "a, b",
+    tups,
 )
 def test_not_equal(a, b):
     assert a != b
diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py
@@ -57,7 +57,10 @@ def test_get_dummies_basic(self, sparse, dtype):
             dtype=self.effective_dtype(dtype),
         )
         if sparse:
-            expected = expected.apply(SparseArray, fill_value=0.0)
+            if dtype.kind == "b":
+                expected = expected.apply(SparseArray, fill_value=False)
+            else:
+                expected = expected.apply(SparseArray, fill_value=0.0)
         result = get_dummies(s_list, sparse=sparse, dtype=dtype)
         tm.assert_frame_equal(result, expected)
 
@@ -142,7 +145,10 @@ def test_get_dummies_include_na(self, sparse, dtype):
             {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
         )
         if sparse:
-            exp = exp.apply(SparseArray, fill_value=0.0)
+            if dtype.kind == "b":
+                exp = exp.apply(SparseArray, fill_value=False)
+            else:
+                exp = exp.apply(SparseArray, fill_value=0.0)
         tm.assert_frame_equal(res, exp)
 
         # Sparse dataframes do not allow nan labelled columns, see #GH8822
@@ -155,7 +161,10 @@ def test_get_dummies_include_na(self, sparse, dtype):
         # hack (NaN handling in assert_index_equal)
         exp_na.columns = res_na.columns
         if sparse:
-            exp_na = exp_na.apply(SparseArray, fill_value=0.0)
+            if dtype.kind == "b":
+                exp_na = exp_na.apply(SparseArray, fill_value=False)
+            else:
+                exp_na = exp_na.apply(SparseArray, fill_value=0.0)
         tm.assert_frame_equal(res_na, exp_na)
 
         res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype)
@@ -174,7 +183,7 @@ def test_get_dummies_unicode(self, sparse):
             {"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
         )
         if sparse:
-            exp = exp.apply(SparseArray, fill_value=0)
+            exp = exp.apply(SparseArray, fill_value=False)
         tm.assert_frame_equal(res, exp)
 
     def test_dataframe_dummies_all_obj(self, df, sparse):
@@ -216,7 +225,10 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
         result = get_dummies(df, sparse=sparse, dtype=dtype)
         if sparse:
             arr = SparseArray
-            typ = SparseDtype(dtype, 0)
+            if dtype.kind == "b":
+                typ = SparseDtype(dtype, False)
+            else:
+                typ = SparseDtype(dtype, 0)
         else:
             arr = np.array
             typ = dtype
@@ -296,7 +308,7 @@ def test_dataframe_dummies_subset(self, df, sparse):
         expected[["C"]] = df[["C"]]
         if sparse:
             cols = ["from_A_a", "from_A_b"]
-            expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
+            expected[cols] = expected[cols].astype(SparseDtype("bool", False))
         tm.assert_frame_equal(result, expected)
 
     def test_dataframe_dummies_prefix_sep(self, df, sparse):
@@ -314,7 +326,7 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse):
         expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
         if sparse:
             cols = ["A..a", "A..b", "B..b", "B..c"]
-            expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
+            expected[cols] = expected[cols].astype(SparseDtype("bool", False))
 
         tm.assert_frame_equal(result, expected)
 
@@ -359,7 +371,7 @@ def test_dataframe_dummies_prefix_dict(self, sparse):
         columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
         expected[columns] = expected[columns].astype(bool)
         if sparse:
-            expected[columns] = expected[columns].astype(SparseDtype("bool", 0))
+            expected[columns] = expected[columns].astype(SparseDtype("bool", False))
 
         tm.assert_frame_equal(result, expected)
 
@@ -371,7 +383,10 @@ def test_dataframe_dummies_with_na(self, df, sparse, dtype):
 
         if sparse:
             arr = SparseArray
-            typ = SparseDtype(dtype, 0)
+            if dtype.kind == "b":
+                typ = SparseDtype(dtype, False)
+            else:
+                typ = SparseDtype(dtype, 0)
         else:
             arr = np.array
             typ = dtype
@@ -399,7 +414,10 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
         result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
         if sparse:
             arr = SparseArray
-            typ = SparseDtype(dtype, 0)
+            if dtype.kind == "b":
+                typ = SparseDtype(dtype, False)
+            else:
+                typ = SparseDtype(dtype, 0)
         else:
             arr = np.array
             typ = dtype
@@ -456,7 +474,7 @@ def test_get_dummies_basic_drop_first(self, sparse):
 
         result = get_dummies(s_list, drop_first=True, sparse=sparse)
         if sparse:
-            expected = expected.apply(SparseArray, fill_value=0)
+            expected = expected.apply(SparseArray, fill_value=False)
         tm.assert_frame_equal(result, expected)
 
         result = get_dummies(s_series, drop_first=True, sparse=sparse)
@@ -490,7 +508,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
         res = get_dummies(s_NA, drop_first=True, sparse=sparse)
         exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
         if sparse:
-            exp = exp.apply(SparseArray, fill_value=0)
+            exp = exp.apply(SparseArray, fill_value=False)
 
         tm.assert_frame_equal(res, exp)
 
@@ -499,7 +517,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
             ["b", np.nan], axis=1
         )
         if sparse:
-            exp_na = exp_na.apply(SparseArray, fill_value=0)
+            exp_na = exp_na.apply(SparseArray, fill_value=False)
         tm.assert_frame_equal(res_na, exp_na)
 
         res_just_na = get_dummies(
@@ -513,7 +531,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse):
         result = get_dummies(df, drop_first=True, sparse=sparse)
         expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
         if sparse:
-            expected = expected.apply(SparseArray, fill_value=0)
+            expected = expected.apply(SparseArray, fill_value=False)
         tm.assert_frame_equal(result, expected)
 
     def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
@@ -632,7 +650,7 @@ def test_get_dummies_duplicate_columns(self, df):
     def test_get_dummies_all_sparse(self):
         df = DataFrame({"A": [1, 2]})
         result = get_dummies(df, columns=["A"], sparse=True)
-        dtype = SparseDtype("bool", 0)
+        dtype = SparseDtype("bool", False)
         expected = DataFrame(
             {
                 "A_1": SparseArray([1, 0], dtype=dtype),

Original file line number	Diff line number	Diff line change
`@@ -261,6 +261,7 @@ Deprecations`
`261`	`261`	- Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`)
`262`	`262`	- Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`)
`263`	`263`	- Deprecated allowing ``downcast`` keyword other than ``None``, ``False``, "infer", or a dict with these as values in :meth:`Series.fillna`, :meth:`DataFrame.fillna` (:issue:`40988`)
	`264`	+- Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`)
`264`	`265`	- Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
`265`	`266`	`-`
`266`	`267`