From b6f171ebb4bad299f7b6060c0038b3fcdcc54a19 Mon Sep 17 00:00:00 2001 From: Brock Date: Tue, 2 May 2023 13:09:13 -0700 Subject: [PATCH 1/2] DEPR: require SparseDtype.fill_value be compatible with SparseDtype.subtype --- doc/source/whatsnew/v2.1.0.rst | 1 + pandas/core/arrays/sparse/dtype.py | 52 +++++++++++++++++------ pandas/tests/arrays/sparse/test_array.py | 28 ++++-------- pandas/tests/arrays/sparse/test_astype.py | 13 ++++-- pandas/tests/arrays/sparse/test_dtype.py | 15 +++++-- pandas/tests/reshape/test_get_dummies.py | 48 ++++++++++++++------- 6 files changed, 102 insertions(+), 55 deletions(-) diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst index 84e011bf80da6..f2068ed0e641d 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -259,6 +259,7 @@ Deprecations - Deprecated unused "closed" and "normalize" keywords in the :class:`DatetimeIndex` constructor (:issue:`52628`) - Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`) - Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`) +- Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`, in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`) - Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`) - diff --git a/pandas/core/arrays/sparse/dtype.py b/pandas/core/arrays/sparse/dtype.py index dadd161ceeb38..1017cf2112042 100644 --- a/pandas/core/arrays/sparse/dtype.py +++ b/pandas/core/arrays/sparse/dtype.py @@ -18,6 +18,7 @@ ExtensionDtype, register_extension_dtype, ) +from pandas.core.dtypes.cast import can_hold_element from pandas.core.dtypes.common import ( is_bool_dtype, is_object_dtype, @@ -25,11 +26,15 @@ is_string_dtype, pandas_dtype, ) +from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, isna, na_value_for_dtype, ) +from pandas.core.construction import ensure_wrapped_if_datetimelike + if TYPE_CHECKING: from pandas._typing import ( Dtype, @@ -161,18 +166,41 @@ def _check_fill_value(self): raise ValueError( f"fill_value must be a scalar. Got {self._fill_value} instead" ) - # TODO: Right now we can use Sparse boolean array - # with any fill_value. Here was an attempt - # to allow only 3 value: True, False or nan - # but plenty test has failed. - # see pull 44955 - # if self._is_boolean and not ( - # is_bool(self._fill_value) or isna(self._fill_value) - # ): - # raise ValueError( - # "fill_value must be True, False or nan " - # f"for boolean type. Got {self._fill_value} instead" - # ) + + # GH#23124 require fill_value and subtype to match + val = self._fill_value + if isna(val): + if not is_valid_na_for_dtype(val, self.subtype): + warnings.warn( + "Allowing arbitrary scalar fill_value in SparseDtype is " + "deprecated. In a future version, the fill_value must be " + "a valid value for the SparseDtype.subtype.", + FutureWarning, + stacklevel=find_stack_level(), + ) + elif isinstance(self.subtype, CategoricalDtype): + # TODO: is this even supported? It is reached in + # test_dtype_sparse_with_fill_value_not_present_in_data + if self.subtype.categories is None or val not in self.subtype.categories: + warnings.warn( + "Allowing arbitrary scalar fill_value in SparseDtype is " + "deprecated. In a future version, the fill_value must be " + "a valid value for the SparseDtype.subtype.", + FutureWarning, + stacklevel=find_stack_level(), + ) + else: + dummy = np.empty(0, dtype=self.subtype) + dummy = ensure_wrapped_if_datetimelike(dummy) + + if not can_hold_element(dummy, val): + warnings.warn( + "Allowing arbitrary scalar fill_value in SparseDtype is " + "deprecated. In a future version, the fill_value must be " + "a valid value for the SparseDtype.subtype.", + FutureWarning, + stacklevel=find_stack_level(), + ) @property def _is_na_fill_value(self) -> bool: diff --git a/pandas/tests/arrays/sparse/test_array.py b/pandas/tests/arrays/sparse/test_array.py index 4a0795137f80b..b8effc3eff1d1 100644 --- a/pandas/tests/arrays/sparse/test_array.py +++ b/pandas/tests/arrays/sparse/test_array.py @@ -52,33 +52,21 @@ def test_set_fill_value(self): arr.fill_value = 2 assert arr.fill_value == 2 - # TODO: this seems fine? You can construct an integer - # sparsearray with NaN fill value, why not update one? - # coerces to int - # msg = "unable to set fill_value 3\\.1 to int64 dtype" - # with pytest.raises(ValueError, match=msg): - arr.fill_value = 3.1 + msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + arr.fill_value = 3.1 assert arr.fill_value == 3.1 - # msg = "unable to set fill_value nan to int64 dtype" - # with pytest.raises(ValueError, match=msg): arr.fill_value = np.nan assert np.isnan(arr.fill_value) arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_) arr.fill_value = True - assert arr.fill_value - - # FIXME: don't leave commented-out - # coerces to bool - # TODO: we can construct an sparse array of bool - # type and use as fill_value any value - # msg = "fill_value must be True, False or nan" - # with pytest.raises(ValueError, match=msg): - # arr.fill_value = 0 - - # msg = "unable to set fill_value nan to bool dtype" - # with pytest.raises(ValueError, match=msg): + assert arr.fill_value is True + + with tm.assert_produces_warning(FutureWarning, match=msg): + arr.fill_value = 0 + arr.fill_value = np.nan assert np.isnan(arr.fill_value) diff --git a/pandas/tests/arrays/sparse/test_astype.py b/pandas/tests/arrays/sparse/test_astype.py index 86d69610059b3..446b256e1a186 100644 --- a/pandas/tests/arrays/sparse/test_astype.py +++ b/pandas/tests/arrays/sparse/test_astype.py @@ -140,8 +140,13 @@ def test_astype_dt64_to_int64(self): def test_dtype_sparse_with_fill_value_not_present_in_data(): # GH 49987 df = DataFrame([["a", 0], ["b", 1], ["b", 2]], columns=["A", "B"]) - result = df["A"].astype(SparseDtype("category", fill_value="c")) - expected = Series( - ["a", "b", "b"], name="A", dtype=SparseDtype("object", fill_value="c") - ) + + msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated" + with tm.assert_produces_warning(FutureWarning, match=msg): + dtype1 = SparseDtype("category", fill_value="c") + + result = df["A"].astype(dtype1) + + dtype2 = SparseDtype("object", fill_value="c") + expected = Series(["a", "b", "b"], name="A", dtype=dtype2) tm.assert_series_equal(result, expected) diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index 58fedbd3e4231..c893e9760800f 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -1,4 +1,5 @@ import re +import warnings import numpy as np import pytest @@ -67,15 +68,21 @@ def test_nans_equal(): assert b == a -@pytest.mark.parametrize( - "a, b", - [ +with warnings.catch_warnings(): + warnings.filterwarnings("ignore") + + tups = [ (SparseDtype("float64"), SparseDtype("float32")), (SparseDtype("float64"), SparseDtype("float64", 0)), (SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)), (SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)), (SparseDtype("float64"), np.dtype("float64")), - ], + ] + + +@pytest.mark.parametrize( + "a, b", + tups, ) def test_not_equal(a, b): assert a != b diff --git a/pandas/tests/reshape/test_get_dummies.py b/pandas/tests/reshape/test_get_dummies.py index fab9b0a5d1846..6e943863072f1 100644 --- a/pandas/tests/reshape/test_get_dummies.py +++ b/pandas/tests/reshape/test_get_dummies.py @@ -57,7 +57,10 @@ def test_get_dummies_basic(self, sparse, dtype): dtype=self.effective_dtype(dtype), ) if sparse: - expected = expected.apply(SparseArray, fill_value=0.0) + if dtype.kind == "b": + expected = expected.apply(SparseArray, fill_value=False) + else: + expected = expected.apply(SparseArray, fill_value=0.0) result = get_dummies(s_list, sparse=sparse, dtype=dtype) tm.assert_frame_equal(result, expected) @@ -142,7 +145,10 @@ def test_get_dummies_include_na(self, sparse, dtype): {"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype) ) if sparse: - exp = exp.apply(SparseArray, fill_value=0.0) + if dtype.kind == "b": + exp = exp.apply(SparseArray, fill_value=False) + else: + exp = exp.apply(SparseArray, fill_value=0.0) tm.assert_frame_equal(res, exp) # Sparse dataframes do not allow nan labelled columns, see #GH8822 @@ -155,7 +161,10 @@ def test_get_dummies_include_na(self, sparse, dtype): # hack (NaN handling in assert_index_equal) exp_na.columns = res_na.columns if sparse: - exp_na = exp_na.apply(SparseArray, fill_value=0.0) + if dtype.kind == "b": + exp_na = exp_na.apply(SparseArray, fill_value=False) + else: + exp_na = exp_na.apply(SparseArray, fill_value=0.0) tm.assert_frame_equal(res_na, exp_na) res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype) @@ -174,7 +183,7 @@ def test_get_dummies_unicode(self, sparse): {"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]} ) if sparse: - exp = exp.apply(SparseArray, fill_value=0) + exp = exp.apply(SparseArray, fill_value=False) tm.assert_frame_equal(res, exp) def test_dataframe_dummies_all_obj(self, df, sparse): @@ -216,7 +225,10 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype): result = get_dummies(df, sparse=sparse, dtype=dtype) if sparse: arr = SparseArray - typ = SparseDtype(dtype, 0) + if dtype.kind == "b": + typ = SparseDtype(dtype, False) + else: + typ = SparseDtype(dtype, 0) else: arr = np.array typ = dtype @@ -296,7 +308,7 @@ def test_dataframe_dummies_subset(self, df, sparse): expected[["C"]] = df[["C"]] if sparse: cols = ["from_A_a", "from_A_b"] - expected[cols] = expected[cols].astype(SparseDtype("bool", 0)) + expected[cols] = expected[cols].astype(SparseDtype("bool", False)) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_prefix_sep(self, df, sparse): @@ -314,7 +326,7 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse): expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]] if sparse: cols = ["A..a", "A..b", "B..b", "B..c"] - expected[cols] = expected[cols].astype(SparseDtype("bool", 0)) + expected[cols] = expected[cols].astype(SparseDtype("bool", False)) tm.assert_frame_equal(result, expected) @@ -359,7 +371,7 @@ def test_dataframe_dummies_prefix_dict(self, sparse): columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"] expected[columns] = expected[columns].astype(bool) if sparse: - expected[columns] = expected[columns].astype(SparseDtype("bool", 0)) + expected[columns] = expected[columns].astype(SparseDtype("bool", False)) tm.assert_frame_equal(result, expected) @@ -371,7 +383,10 @@ def test_dataframe_dummies_with_na(self, df, sparse, dtype): if sparse: arr = SparseArray - typ = SparseDtype(dtype, 0) + if dtype.kind == "b": + typ = SparseDtype(dtype, False) + else: + typ = SparseDtype(dtype, 0) else: arr = np.array typ = dtype @@ -399,7 +414,10 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype): result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1) if sparse: arr = SparseArray - typ = SparseDtype(dtype, 0) + if dtype.kind == "b": + typ = SparseDtype(dtype, False) + else: + typ = SparseDtype(dtype, 0) else: arr = np.array typ = dtype @@ -456,7 +474,7 @@ def test_get_dummies_basic_drop_first(self, sparse): result = get_dummies(s_list, drop_first=True, sparse=sparse) if sparse: - expected = expected.apply(SparseArray, fill_value=0) + expected = expected.apply(SparseArray, fill_value=False) tm.assert_frame_equal(result, expected) result = get_dummies(s_series, drop_first=True, sparse=sparse) @@ -490,7 +508,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse): res = get_dummies(s_NA, drop_first=True, sparse=sparse) exp = DataFrame({"b": [0, 1, 0]}, dtype=bool) if sparse: - exp = exp.apply(SparseArray, fill_value=0) + exp = exp.apply(SparseArray, fill_value=False) tm.assert_frame_equal(res, exp) @@ -499,7 +517,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse): ["b", np.nan], axis=1 ) if sparse: - exp_na = exp_na.apply(SparseArray, fill_value=0) + exp_na = exp_na.apply(SparseArray, fill_value=False) tm.assert_frame_equal(res_na, exp_na) res_just_na = get_dummies( @@ -513,7 +531,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse): result = get_dummies(df, drop_first=True, sparse=sparse) expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool) if sparse: - expected = expected.apply(SparseArray, fill_value=0) + expected = expected.apply(SparseArray, fill_value=False) tm.assert_frame_equal(result, expected) def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype): @@ -632,7 +650,7 @@ def test_get_dummies_duplicate_columns(self, df): def test_get_dummies_all_sparse(self): df = DataFrame({"A": [1, 2]}) result = get_dummies(df, columns=["A"], sparse=True) - dtype = SparseDtype("bool", 0) + dtype = SparseDtype("bool", False) expected = DataFrame( { "A_1": SparseArray([1, 0], dtype=dtype), From 1db460f78db48b35af0a99c7b47edb6ef77a35b1 Mon Sep 17 00:00:00 2001 From: Brock Date: Wed, 3 May 2023 10:38:15 -0700 Subject: [PATCH 2/2] filter more specific --- pandas/tests/arrays/sparse/test_dtype.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/arrays/sparse/test_dtype.py b/pandas/tests/arrays/sparse/test_dtype.py index c893e9760800f..679d321418424 100644 --- a/pandas/tests/arrays/sparse/test_dtype.py +++ b/pandas/tests/arrays/sparse/test_dtype.py @@ -69,7 +69,8 @@ def test_nans_equal(): with warnings.catch_warnings(): - warnings.filterwarnings("ignore") + msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated" + warnings.filterwarnings("ignore", msg, category=FutureWarning) tups = [ (SparseDtype("float64"), SparseDtype("float32")),