DEPR: require SparseDtype.fill_value be compatible with SparseDtype.subtype #53043

Merged
merged 4 commits on May 11, 2023
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
@@ -261,6 +261,7 @@ Deprecations
- Deprecated unused "closed" keyword in the :class:`TimedeltaIndex` constructor (:issue:`52628`)
- Deprecated logical operation between two non boolean :class:`Series` with different indexes always coercing the result to bool dtype. In a future version, this will maintain the return type of the inputs. (:issue:`52500`, :issue:`52538`)
- Deprecated allowing ``downcast`` keyword other than ``None``, ``False``, "infer", or a dict with these as values in :meth:`Series.fillna`, :meth:`DataFrame.fillna` (:issue:`40988`)
- Deprecated allowing arbitrary ``fill_value`` in :class:`SparseDtype`; in a future version the ``fill_value`` will need to be compatible with the ``dtype.subtype``, either a scalar that can be held by that subtype or ``NaN`` for integer or bool subtypes (:issue:`23124`)
- Deprecated constructing :class:`SparseArray` from scalar data, pass a sequence instead (:issue:`53039`)
-

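For context, a minimal sketch of the user-facing effect of this deprecation (assuming a pandas build that includes this change); the specific values are illustrative only:

```python
import warnings

import numpy as np
import pandas as pd

# A fill_value the subtype can hold, or NaN for an integer subtype, stays silent.
pd.SparseDtype(np.int64, fill_value=0)
pd.SparseDtype(np.int64, fill_value=np.nan)

# A fill_value the subtype cannot hold (a float for an int64 subtype) now
# emits the deprecation warning added in this PR.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pd.SparseDtype(np.int64, fill_value=3.5)
assert any(issubclass(w.category, FutureWarning) for w in caught)
```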
52 changes: 40 additions & 12 deletions pandas/core/arrays/sparse/dtype.py
@@ -18,18 +18,23 @@
ExtensionDtype,
register_extension_dtype,
)
from pandas.core.dtypes.cast import can_hold_element
from pandas.core.dtypes.common import (
is_bool_dtype,
is_object_dtype,
is_scalar,
is_string_dtype,
pandas_dtype,
)
from pandas.core.dtypes.dtypes import CategoricalDtype
from pandas.core.dtypes.missing import (
is_valid_na_for_dtype,
isna,
na_value_for_dtype,
)

from pandas.core.construction import ensure_wrapped_if_datetimelike

if TYPE_CHECKING:
from pandas._typing import (
Dtype,
@@ -164,18 +169,41 @@ def _check_fill_value(self):
raise ValueError(
f"fill_value must be a scalar. Got {self._fill_value} instead"
)
# TODO: Right now we can use Sparse boolean array
# with any fill_value. Here was an attempt
# to allow only 3 value: True, False or nan
# but plenty test has failed.
# see pull 44955
# if self._is_boolean and not (
# is_bool(self._fill_value) or isna(self._fill_value)
# ):
# raise ValueError(
# "fill_value must be True, False or nan "
# f"for boolean type. Got {self._fill_value} instead"
# )

# GH#23124 require fill_value and subtype to match
val = self._fill_value
if isna(val):
if not is_valid_na_for_dtype(val, self.subtype):
warnings.warn(
"Allowing arbitrary scalar fill_value in SparseDtype is "
"deprecated. In a future version, the fill_value must be "
"a valid value for the SparseDtype.subtype.",
FutureWarning,
stacklevel=find_stack_level(),
)
elif isinstance(self.subtype, CategoricalDtype):
# TODO: is this even supported? It is reached in

Review comments on this line:

Member: I could see it being reasonable to have a fill value for a Categorical that's not a current category but is the same type as the current categories

Member Author: in principle i guess, but i think this case existing at all is wrong. SparseArray._sparse_values has to be np.ndarray, and the dtype is initialized with self._dtype = SparseDtype(sparse_values.dtype, fill_value)

Member Author: will make a separate PR to disallow this case

Member Author: #53160 disallows this case

Member Author: should no longer be an issue now that #53160 is merged

Member Author: woops, should have updated to remove this check. will do in a follow-up

# test_dtype_sparse_with_fill_value_not_present_in_data
if self.subtype.categories is None or val not in self.subtype.categories:
warnings.warn(
"Allowing arbitrary scalar fill_value in SparseDtype is "
"deprecated. In a future version, the fill_value must be "
"a valid value for the SparseDtype.subtype.",
FutureWarning,
stacklevel=find_stack_level(),
)
else:
dummy = np.empty(0, dtype=self.subtype)
dummy = ensure_wrapped_if_datetimelike(dummy)

if not can_hold_element(dummy, val):
warnings.warn(
"Allowing arbitrary scalar fill_value in SparseDtype is "
"deprecated. In a future version, the fill_value must be "
"a valid value for the SparseDtype.subtype.",
FutureWarning,
stacklevel=find_stack_level(),
)

@property
def _is_na_fill_value(self) -> bool:
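To illustrate the three branches above (NA-like fill values checked with is_valid_na_for_dtype, categorical subtypes, and everything else via can_hold_element), a small sketch under the assumption that the running pandas carries this change; the exact values are only examples:

```python
import warnings

import numpy as np
import pandas as pd

# NaN is a valid NA for an int64 subtype, so no warning is emitted.
pd.SparseDtype("int64", fill_value=np.nan)

# NaT is not a valid NA for int64, so this goes through the deprecation path.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pd.SparseDtype("int64", fill_value=pd.NaT)
assert any(issubclass(w.category, FutureWarning) for w in caught)

# Non-NA fill values are checked with can_hold_element: an int64 subtype
# cannot hold a string, so this warns as well.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pd.SparseDtype("int64", fill_value="a")
assert any(issubclass(w.category, FutureWarning) for w in caught)
```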
28 changes: 8 additions & 20 deletions pandas/tests/arrays/sparse/test_array.py
@@ -52,33 +52,21 @@ def test_set_fill_value(self):
arr.fill_value = 2
assert arr.fill_value == 2

# TODO: this seems fine? You can construct an integer
# sparsearray with NaN fill value, why not update one?
# coerces to int
# msg = "unable to set fill_value 3\\.1 to int64 dtype"
# with pytest.raises(ValueError, match=msg):
arr.fill_value = 3.1
msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
with tm.assert_produces_warning(FutureWarning, match=msg):
arr.fill_value = 3.1
assert arr.fill_value == 3.1

# msg = "unable to set fill_value nan to int64 dtype"
# with pytest.raises(ValueError, match=msg):
arr.fill_value = np.nan
assert np.isnan(arr.fill_value)

arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
arr.fill_value = True
assert arr.fill_value

# FIXME: don't leave commented-out
# coerces to bool
# TODO: we can construct an sparse array of bool
# type and use as fill_value any value
# msg = "fill_value must be True, False or nan"
# with pytest.raises(ValueError, match=msg):
# arr.fill_value = 0

# msg = "unable to set fill_value nan to bool dtype"
# with pytest.raises(ValueError, match=msg):
assert arr.fill_value is True

with tm.assert_produces_warning(FutureWarning, match=msg):
arr.fill_value = 0

arr.fill_value = np.nan
assert np.isnan(arr.fill_value)

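The behavior the updated test asserts, sketched as a standalone snippet (assuming a pandas build with this change); note that the assignment still takes effect, only a FutureWarning is added:

```python
import warnings

import numpy as np
from pandas.arrays import SparseArray

arr = SparseArray([1, 0, 2], fill_value=0, dtype=np.int64)
arr.fill_value = 2          # 2 fits the int64 subtype: no warning
assert arr.fill_value == 2

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    arr.fill_value = 3.1    # a float cannot be held by int64: deprecated
assert any(issubclass(w.category, FutureWarning) for w in caught)
assert arr.fill_value == 3.1

arr = SparseArray([True, False, True], fill_value=False, dtype=np.bool_)
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    arr.fill_value = 0      # 0 is not a bool: deprecated
assert any(issubclass(w.category, FutureWarning) for w in caught)

arr.fill_value = np.nan     # NaN remains allowed for bool subtypes
assert np.isnan(arr.fill_value)
```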
16 changes: 12 additions & 4 deletions pandas/tests/arrays/sparse/test_dtype.py
@@ -1,4 +1,5 @@
import re
import warnings

import numpy as np
import pytest
@@ -67,15 +68,22 @@ def test_nans_equal():
assert b == a


@pytest.mark.parametrize(
"a, b",
[
with warnings.catch_warnings():
msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"
warnings.filterwarnings("ignore", msg, category=FutureWarning)

tups = [
(SparseDtype("float64"), SparseDtype("float32")),
(SparseDtype("float64"), SparseDtype("float64", 0)),
(SparseDtype("float64"), SparseDtype("datetime64[ns]", np.nan)),
(SparseDtype(int, pd.NaT), SparseDtype(float, pd.NaT)),
(SparseDtype("float64"), np.dtype("float64")),
],
]


@pytest.mark.parametrize(
"a, b",
tups,
)
def test_not_equal(a, b):
assert a != b
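A note on the design choice above: the SparseDtype objects in the parametrize list are built when pytest collects the module, before any per-test warning filter applies, which is why the filter sits at module level rather than on the test. A hypothetical sketch of the same pattern (the names CASES and test_subtype are made up for illustration):

```python
import warnings

import pytest
from pandas import SparseDtype

msg = "Allowing arbitrary scalar fill_value in SparseDtype is deprecated"

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", msg, category=FutureWarning)
    # Constructed at collection time, so the filter must already be active here.
    CASES = [SparseDtype("int64", fill_value=3.5)]


@pytest.mark.parametrize("dtype", CASES)
def test_subtype(dtype):
    assert dtype.subtype == "int64"
```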
48 changes: 33 additions & 15 deletions pandas/tests/reshape/test_get_dummies.py
@@ -57,7 +57,10 @@ def test_get_dummies_basic(self, sparse, dtype):
dtype=self.effective_dtype(dtype),
)
if sparse:
expected = expected.apply(SparseArray, fill_value=0.0)
if dtype.kind == "b":
expected = expected.apply(SparseArray, fill_value=False)
else:
expected = expected.apply(SparseArray, fill_value=0.0)
result = get_dummies(s_list, sparse=sparse, dtype=dtype)
tm.assert_frame_equal(result, expected)

@@ -142,7 +145,10 @@ def test_get_dummies_include_na(self, sparse, dtype):
{"a": [1, 0, 0], "b": [0, 1, 0]}, dtype=self.effective_dtype(dtype)
)
if sparse:
exp = exp.apply(SparseArray, fill_value=0.0)
if dtype.kind == "b":
exp = exp.apply(SparseArray, fill_value=False)
else:
exp = exp.apply(SparseArray, fill_value=0.0)
tm.assert_frame_equal(res, exp)

# Sparse dataframes do not allow nan labelled columns, see #GH8822
@@ -155,7 +161,10 @@ def test_get_dummies_include_na(self, sparse, dtype):
# hack (NaN handling in assert_index_equal)
exp_na.columns = res_na.columns
if sparse:
exp_na = exp_na.apply(SparseArray, fill_value=0.0)
if dtype.kind == "b":
exp_na = exp_na.apply(SparseArray, fill_value=False)
else:
exp_na = exp_na.apply(SparseArray, fill_value=0.0)
tm.assert_frame_equal(res_na, exp_na)

res_just_na = get_dummies([np.nan], dummy_na=True, sparse=sparse, dtype=dtype)
@@ -174,7 +183,7 @@ def test_get_dummies_unicode(self, sparse):
{"letter_e": [True, False, False], f"letter_{eacute}": [False, True, True]}
)
if sparse:
exp = exp.apply(SparseArray, fill_value=0)
exp = exp.apply(SparseArray, fill_value=False)
tm.assert_frame_equal(res, exp)

def test_dataframe_dummies_all_obj(self, df, sparse):
@@ -216,7 +225,10 @@ def test_dataframe_dummies_mix_default(self, df, sparse, dtype):
result = get_dummies(df, sparse=sparse, dtype=dtype)
if sparse:
arr = SparseArray
typ = SparseDtype(dtype, 0)
if dtype.kind == "b":
typ = SparseDtype(dtype, False)
else:
typ = SparseDtype(dtype, 0)
else:
arr = np.array
typ = dtype
@@ -296,7 +308,7 @@ def test_dataframe_dummies_subset(self, df, sparse):
expected[["C"]] = df[["C"]]
if sparse:
cols = ["from_A_a", "from_A_b"]
expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
expected[cols] = expected[cols].astype(SparseDtype("bool", False))
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_prefix_sep(self, df, sparse):
@@ -314,7 +326,7 @@ def test_dataframe_dummies_prefix_sep(self, df, sparse):
expected = expected[["C", "A..a", "A..b", "B..b", "B..c"]]
if sparse:
cols = ["A..a", "A..b", "B..b", "B..c"]
expected[cols] = expected[cols].astype(SparseDtype("bool", 0))
expected[cols] = expected[cols].astype(SparseDtype("bool", False))

tm.assert_frame_equal(result, expected)

@@ -359,7 +371,7 @@ def test_dataframe_dummies_prefix_dict(self, sparse):
columns = ["from_A_a", "from_A_b", "from_B_b", "from_B_c"]
expected[columns] = expected[columns].astype(bool)
if sparse:
expected[columns] = expected[columns].astype(SparseDtype("bool", 0))
expected[columns] = expected[columns].astype(SparseDtype("bool", False))

tm.assert_frame_equal(result, expected)

@@ -371,7 +383,10 @@ def test_dataframe_dummies_with_na(self, df, sparse, dtype):

if sparse:
arr = SparseArray
typ = SparseDtype(dtype, 0)
if dtype.kind == "b":
typ = SparseDtype(dtype, False)
else:
typ = SparseDtype(dtype, 0)
else:
arr = np.array
typ = dtype
@@ -399,7 +414,10 @@ def test_dataframe_dummies_with_categorical(self, df, sparse, dtype):
result = get_dummies(df, sparse=sparse, dtype=dtype).sort_index(axis=1)
if sparse:
arr = SparseArray
typ = SparseDtype(dtype, 0)
if dtype.kind == "b":
typ = SparseDtype(dtype, False)
else:
typ = SparseDtype(dtype, 0)
else:
arr = np.array
typ = dtype
@@ -456,7 +474,7 @@ def test_get_dummies_basic_drop_first(self, sparse):

result = get_dummies(s_list, drop_first=True, sparse=sparse)
if sparse:
expected = expected.apply(SparseArray, fill_value=0)
expected = expected.apply(SparseArray, fill_value=False)
tm.assert_frame_equal(result, expected)

result = get_dummies(s_series, drop_first=True, sparse=sparse)
@@ -490,7 +508,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
res = get_dummies(s_NA, drop_first=True, sparse=sparse)
exp = DataFrame({"b": [0, 1, 0]}, dtype=bool)
if sparse:
exp = exp.apply(SparseArray, fill_value=0)
exp = exp.apply(SparseArray, fill_value=False)

tm.assert_frame_equal(res, exp)

@@ -499,7 +517,7 @@ def test_get_dummies_basic_drop_first_NA(self, sparse):
["b", np.nan], axis=1
)
if sparse:
exp_na = exp_na.apply(SparseArray, fill_value=0)
exp_na = exp_na.apply(SparseArray, fill_value=False)
tm.assert_frame_equal(res_na, exp_na)

res_just_na = get_dummies(
@@ -513,7 +531,7 @@ def test_dataframe_dummies_drop_first(self, df, sparse):
result = get_dummies(df, drop_first=True, sparse=sparse)
expected = DataFrame({"A_b": [0, 1, 0], "B_c": [0, 0, 1]}, dtype=bool)
if sparse:
expected = expected.apply(SparseArray, fill_value=0)
expected = expected.apply(SparseArray, fill_value=False)
tm.assert_frame_equal(result, expected)

def test_dataframe_dummies_drop_first_with_categorical(self, df, sparse, dtype):
@@ -632,7 +650,7 @@ def test_get_dummies_duplicate_columns(self, df):
def test_get_dummies_all_sparse(self):
df = DataFrame({"A": [1, 2]})
result = get_dummies(df, columns=["A"], sparse=True)
dtype = SparseDtype("bool", 0)
dtype = SparseDtype("bool", False)
expected = DataFrame(
{
"A_1": SparseArray([1, 0], dtype=dtype),
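For reference, a small usage sketch of the behavior these test updates reflect (assuming pandas 2.x, where get_dummies produces boolean dummy columns): with sparse=True the dummy columns use a bool subtype, so the matching fill_value is False rather than 0.

```python
import pandas as pd

df = pd.DataFrame({"A": [1, 2]})
result = pd.get_dummies(df, columns=["A"], sparse=True)

# Every dummy column is backed by a boolean SparseArray whose fill_value is
# the bool False, matching SparseDtype("bool", False) in the expected frames.
assert (result.dtypes == pd.SparseDtype("bool", False)).all()
```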