From b6f171ebb4bad299f7b6060c0038b3fcdcc54a19 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Tue, 2 May 2023 13:09:13 -0700
Subject: [PATCH 1/2] DEPR: require SparseDtype.fill_value be compatible with
 SparseDtype.subtype

---
 doc/source/whatsnew/v2.1.0.rst            |  1 +
 pandas/core/arrays/sparse/dtype.py        | 52 +++++++++++++++++------
 pandas/tests/arrays/sparse/test_array.py  | 28 ++++--------
 pandas/tests/arrays/sparse/test_astype.py | 13 ++++--
 pandas/tests/arrays/sparse/test_dtype.py  | 15 +++++--
 pandas/tests/reshape/test_get_dummies.py  | 48 ++++++++++++++-------
 6 files changed, 102 insertions(+), 55 deletions(-)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index 84e011bf80da6..f2068ed0e641d 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -259,6 +259,7 @@ Deprecations
 - Deprecated unused "closed" and "normalize" keywords in the :class:`DatetimeIndex` constructor (:issue:`52628`)
 - Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`)
 - Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`)
+- Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`)
 - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
 -
 
diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py
index dadd161ceeb38..1017cf2112042 100644
--- a/pandas/core/arrays/sparse/dtype.py
+++ b/pandas/core/arrays/sparse/dtype.py
@@ -18,6 +18,7 @@
     ExtensionDtype,
     register_extension_dtype,
 )
+from pandas.core.dtypes.cast import can_hold_element
 from pandas.core.dtypes.common import (
     is_bool_dtype,
     is_object_dtype,
@@ -25,11 +26,15 @@
     is_string_dtype,
     pandas_dtype,
 )
+from pandas.core.dtypes.dtypes import CategoricalDtype
 from pandas.core.dtypes.missing import (
+    is_valid_na_for_dtype,
     isna,
     na_value_for_dtype,
 )
 
+from pandas.core.construction import ensure_wrapped_if_datetimelike
+
 if TYPE_CHECKING:
     from pandas._typing import (
         Dtype,
@@ -161,18 +166,41 @@ def _check_fill_value(self):
             raise ValueError(
                 f"fill_value must be a scalar. Got {self._fill_value} instead"
             )
-        # TODO: Right now we can use Sparse boolean array
-        #       with any fill_value. Here was an attempt
-        #       to allow only 3 value: True, False or nan
-        #       but plenty test has failed.
-        # see pull 44955
-        # if self._is_boolean and not (
-        #    is_bool(self._fill_value) or isna(self._fill_value)
-        # ):
-        #    raise ValueError(
-        #        "fill_value must be True, False or nan "
-        #        f"for boolean type. Got {self._fill_value} instead"
-        #    )
+
+        # GH#23124 require fill_value and subtype to match
+        val = self._fill_value
+        if isna(val):
+            if not is_valid_na_for_dtype(val, self.subtype):
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+        elif isinstance(self.subtype, CategoricalDtype):
+            # TODO: is this even supported?  It is reached in
+            #  test_dtype_sparse_with_fill_value_not_present_in_data
+            if self.subtype.categories is None or val not in self.subtype.categories:
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
+        else:
+            dummy = np.empty(0, dtype=self.subtype)
+            dummy = ensure_wrapped_if_datetimelike(dummy)
+
+            if not can_hold_element(dummy, val):
+                warnings.warn(
+                    "Allowing arbitrary scalar fill_value in SparseDtype is "
+                    "deprecated. In a future version, the fill_value must be "
+                    "a valid value for the SparseDtype.subtype.",
+                    FutureWarning,
+                    stacklevel=find_stack_level(),
+                )
 
     @property
     def _is_na_fill_value(self) -> bool:
diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py
index 4a0795137f80b..b8effc3eff1d1 100644
--- a/pandas/tests/arrays/sparse/test_array.py
+++ b/pandas/tests/arrays/sparse/test_array.py
@@ -52,33 +52,21 @@ def test_set_fill_value(self):
         arr.fill_value = 2
         assert arr.fill_value == 2
 
-        # TODO: this seems fine? You can construct an integer
-        # sparsearray with NaN fill value, why not update one?
-        # coerces to int
-        # msg = "unable to set fill_value 3\\.1 to int64 dtype"
-        # with pytest.raises(ValueError, match=msg):
-        arr.fill_value = 3.1
+        msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            arr.fill_value = 3.1
         assert arr.fill_value == 3.1
 
-        # msg = "unable to set fill_value nan to int64 dtype"
-        # with pytest.raises(ValueError, match=msg):
         arr.fill_value = np.nan
         assert np.isnan(arr.fill_value)
 
         arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
         arr.fill_value = True
-        assert arr.fill_value
-
-        # FIXME: don't leave commented-out
-        # coerces to bool
-        # TODO: we can construct an sparse array of bool
-        #      type and use as fill_value any value
-        # msg = "fill_value must be True, False or nan"
-        # with pytest.raises(ValueError, match=msg):
-        #    arr.fill_value = 0
-
-        # msg = "unable to set fill_value nan to bool dtype"
-        # with pytest.raises(ValueError, match=msg):
+        assert arr.fill_value is True
+
+        with tm.assert_produces_warning(FutureWarning, match=msg):
+            arr.fill_value = 0
+
         arr.fill_value = np.nan
         assert np.isnan(arr.fill_value)
 
diff --git a/pandas/tests/arrays/sparse/test_astype.py b/pandas/tests/arrays/sparse/test_astype.py
index 86d69610059b3..446b256e1a186 100644
--- a/pandas/tests/arrays/sparse/test_astype.py
+++ b/pandas/tests/arrays/sparse/test_astype.py
@@ -140,8 +140,13 @@ def test_astype_dt64_to_int64(self):
 def test_dtype_sparse_with_fill_value_not_present_in_data():
     # GH 49987
     df = DataFrame([["a", 0], ["b", 1], ["b", 2]], columns=["A", "B"])
-    result = df["A"].astype(SparseDtype("category", fill_value="c"))
-    expected = Series(
-        ["a", "b", "b"], name="A", dtype=SparseDtype("object", fill_value="c")
-    )
+
+    msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
+    with tm.assert_produces_warning(FutureWarning, match=msg):
+        dtype1 = SparseDtype("category", fill_value="c")
+
+    result = df["A"].astype(dtype1)
+
+    dtype2 = SparseDtype("object", fill_value="c")
+    expected = Series(["a", "b", "b"], name="A", dtype=dtype2)
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py
index 58fedbd3e4231..c893e9760800f 100644
--- a/pandas/tests/arrays/sparse/test_dtype.py
+++ b/pandas/tests/arrays/sparse/test_dtype.py
@@ -1,4 +1,5 @@
 import re
+import warnings
 
 import numpy as np
 import pytest
@@ -67,15 +68,21 @@ def test_nans_equal():
     assert b == a
 
 
-@pytest.mark.parametrize(
-    "a, b",
-    [
+with warnings.catch_warnings():
+    warnings.filterwarnings("ignore")
+
+    tups = [
         (SparseDtype("float64"), SparseDtype("float32")),
         (SparseDtype("float64"), SparseDtype("float64", 0)),
         (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
         (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
         (SparseDtype("float64"), np.dtype("float64")),
-    ],
+    ]
+
+
+@pytest.mark.parametrize(
+    "a, b",
+    tups,
 )
 def test_not_equal(a, b):
     assert a != b
diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py
index fab9b0a5d1846..6e943863072f1 100644
--- a/pandas/tests/reshape/test_get_dummies.py
+++ b/pandas/tests/reshape/test_get_dummies.py
@@ -57,7 +57,10 @@ def test_get_dummies_basic(self, sparse, dtype):
             dtype=self.effective_dtype(dtype),
         )
         if sparse:
-            expected = expected.apply(SparseArray, fill_value=0.0)
+            if dtype.kind == "b":
+                expected = expected.apply(SparseArray, fill_value=False)
+            else:
+                expected = expected.apply(SparseArray, fill_value=0.0)
         result = get_dummies(s_list, sparse=sparse, dtype=dtype)
         tm.assert_frame_equal(result, expected)
 
@@ -142,7 +145,10 @@ def test_get_dummies_include_na(self, sparse, dtype):
             {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
         )
         if sparse:
-            exp = exp.apply(SparseArray, fill_value=0.0)
+            if dtype.kind == "b":
+                exp = exp.apply(SparseArray, fill_value=False)
+            else:
+                exp = exp.apply(SparseArray, fill_value=0.0)
         tm.assert_frame_equal(res, exp)
 
         # Sparse dataframes do not allow nan labelled columns, see #GH8822
@@ -155,7 +161,10 @@ def test_get_dummies_include_na(self, sparse, dtype):
         # hack (NaN handling in assert_index_equal)
         exp_na.columns = res_na.columns
         if sparse:
-            exp_na = exp_na.apply(SparseArray, fill_value=0.0)
+            if dtype.kind == "b":
+                exp_na = exp_na.apply(SparseArray, fill_value=False)
+            else:
+                exp_na = exp_na.apply(SparseArray, fill_value=0.0)
         tm.assert_frame_equal(res_na, exp_na)
 
         res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype)
@@ -174,7 +183,7 @@ def test_get_dummies_unicode(self, sparse):
             {"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
         )
         if sparse:
-            exp = exp.apply(SparseArray, fill_value=0)
+            exp = exp.apply(SparseArray, fill_value=False)
         tm.assert_frame_equal(res, exp)
 
     def test_dataframe_dummies_all_obj(self, df, sparse):
@@ -216,7 +225,10 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
         result = get_dummies(df, sparse=sparse, dtype=dtype)
         if sparse:
             arr = SparseArray
-            typ = SparseDtype(dtype, 0)
+            if dtype.kind == "b":
+                typ = SparseDtype(dtype, False)
+            else:
+                typ = SparseDtype(dtype, 0)
         else:
             arr = np.array
             typ = dtype
@@ -296,7 +308,7 @@ def test_dataframe_dummies_subset(self, df, sparse):
         expected[["C"]] = df[["C"]]
         if sparse:
             cols = ["from_A_a", "from_A_b"]
-            expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
+            expected[cols] = expected[cols].astype(SparseDtype("bool", False))
         tm.assert_frame_equal(result, expected)
 
     def test_dataframe_dummies_prefix_sep(self, df, sparse):
@@ -314,7 +326,7 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse):
         expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
         if sparse:
             cols = ["A..a", "A..b", "B..b", "B..c"]
-            expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
+            expected[cols] = expected[cols].astype(SparseDtype("bool", False))
 
         tm.assert_frame_equal(result, expected)
 
@@ -359,7 +371,7 @@ def test_dataframe_dummies_prefix_dict(self, sparse):
         columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
         expected[columns] = expected[columns].astype(bool)
         if sparse:
-            expected[columns] = expected[columns].astype(SparseDtype("bool", 0))
+            expected[columns] = expected[columns].astype(SparseDtype("bool", False))
 
         tm.assert_frame_equal(result, expected)
 
@@ -371,7 +383,10 @@ def test_dataframe_dummies_with_na(self, df, sparse, dtype):
 
         if sparse:
             arr = SparseArray
-            typ = SparseDtype(dtype, 0)
+            if dtype.kind == "b":
+                typ = SparseDtype(dtype, False)
+            else:
+                typ = SparseDtype(dtype, 0)
         else:
             arr = np.array
             typ = dtype
@@ -399,7 +414,10 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
         result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
         if sparse:
             arr = SparseArray
-            typ = SparseDtype(dtype, 0)
+            if dtype.kind == "b":
+                typ = SparseDtype(dtype, False)
+            else:
+                typ = SparseDtype(dtype, 0)
         else:
             arr = np.array
             typ = dtype
@@ -456,7 +474,7 @@ def test_get_dummies_basic_drop_first(self, sparse):
 
         result = get_dummies(s_list, drop_first=True, sparse=sparse)
         if sparse:
-            expected = expected.apply(SparseArray, fill_value=0)
+            expected = expected.apply(SparseArray, fill_value=False)
         tm.assert_frame_equal(result, expected)
 
         result = get_dummies(s_series, drop_first=True, sparse=sparse)
@@ -490,7 +508,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
         res = get_dummies(s_NA, drop_first=True, sparse=sparse)
         exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
         if sparse:
-            exp = exp.apply(SparseArray, fill_value=0)
+            exp = exp.apply(SparseArray, fill_value=False)
 
         tm.assert_frame_equal(res, exp)
 
@@ -499,7 +517,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
             ["b", np.nan], axis=1
         )
         if sparse:
-            exp_na = exp_na.apply(SparseArray, fill_value=0)
+            exp_na = exp_na.apply(SparseArray, fill_value=False)
         tm.assert_frame_equal(res_na, exp_na)
 
         res_just_na = get_dummies(
@@ -513,7 +531,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse):
         result = get_dummies(df, drop_first=True, sparse=sparse)
         expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
         if sparse:
-            expected = expected.apply(SparseArray, fill_value=0)
+            expected = expected.apply(SparseArray, fill_value=False)
         tm.assert_frame_equal(result, expected)
 
     def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
@@ -632,7 +650,7 @@ def test_get_dummies_duplicate_columns(self, df):
     def test_get_dummies_all_sparse(self):
         df = DataFrame({"A": [1, 2]})
         result = get_dummies(df, columns=["A"], sparse=True)
-        dtype = SparseDtype("bool", 0)
+        dtype = SparseDtype("bool", False)
         expected = DataFrame(
             {
                 "A_1": SparseArray([1, 0], dtype=dtype),

From 1db460f78db48b35af0a99c7b47edb6ef77a35b1 Mon Sep 17 00:00:00 2001
From: Brock <jbrockmendel@gmail.com>
Date: Wed, 3 May 2023 10:38:15 -0700
Subject: [PATCH 2/2] filter more specific

---
 pandas/tests/arrays/sparse/test_dtype.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py
index c893e9760800f..679d321418424 100644
--- a/pandas/tests/arrays/sparse/test_dtype.py
+++ b/pandas/tests/arrays/sparse/test_dtype.py
@@ -69,7 +69,8 @@ def test_nans_equal():
 
 
 with warnings.catch_warnings():
-    warnings.filterwarnings("ignore")
+    msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
+    warnings.filterwarnings("ignore", msg, category=FutureWarning)
 
     tups = [
         (SparseDtype("float64"), SparseDtype("float32")),