Skip to content

BUG pd.NA not treated correctly in where and mask operations #53124

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 21 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
89c0f3d
make NA propagate where and mask operations
Charlie-XIAO May 6, 2023
321147f
changelog added
Charlie-XIAO May 6, 2023
36bbe16
fix when using boolean arrays
Charlie-XIAO May 7, 2023
e2216cb
added tests, reword NA propagates -> if cond=NA then element propagates
Charlie-XIAO May 8, 2023
5a45a29
Merge branch 'main' into na-masked-unexp
Charlie-XIAO May 8, 2023
9875669
avoid multiple fillna when unnecessary
Charlie-XIAO May 8, 2023
8381aba
Merge branch 'main' into na-masked-unexp
Charlie-XIAO May 19, 2023
5a41560
Merge branch 'main' into na-masked-unexp
Charlie-XIAO Jun 4, 2023
8af09df
Merge branch 'main' into na-masked-unexp
Charlie-XIAO Jun 11, 2023
3859bff
Merge branch 'main' into na-masked-unexp
Charlie-XIAO Jun 12, 2023
c1d43c8
Merge remote-tracking branch 'upstream/main' into na-masked-unexp
Charlie-XIAO Jul 14, 2023
c542727
Merge branch 'na-masked-unexp' of https://github.com/Charlie-XIAO/pan…
Charlie-XIAO Jul 14, 2023
8140c5b
Merge branch 'main' into na-masked-unexp
Charlie-XIAO Jul 16, 2023
a2151be
Merge branch 'main' into na-masked-unexp
Charlie-XIAO Aug 1, 2023
6f90c1c
Merge remote-tracking branch 'upstream/main' into na-masked-unexp
Charlie-XIAO Aug 29, 2023
394d4bb
Merge branch 'na-masked-unexp' of https://github.com/Charlie-XIAO/pan…
Charlie-XIAO Aug 29, 2023
1cc6208
Merge remote-tracking branch 'upstream/main' into na-masked-unexp
Charlie-XIAO Aug 29, 2023
09f62bc
raise in where and mask if cond is nullable bool with NAs
Charlie-XIAO Aug 29, 2023
b55f411
Merge remote-tracking branch 'upstream/main' into na-masked-unexp
Charlie-XIAO Aug 29, 2023
cbbd866
remove conflicting (?) test and improve message
Charlie-XIAO Aug 30, 2023
3a34a85
Merge remote-tracking branch 'upstream/main' into na-masked-unexp
Charlie-XIAO Aug 30, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.1.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -735,6 +735,7 @@ Indexing
- Bug in :meth:`DataFrame.__setitem__` losing dtype when setting a :class:`DataFrame` into duplicated columns (:issue:`53143`)
- Bug in :meth:`DataFrame.__setitem__` with a boolean mask and :meth:`DataFrame.putmask` with mixed non-numeric dtypes and a value other than ``NaN`` incorrectly raising ``TypeError`` (:issue:`53291`)
- Bug in :meth:`DataFrame.iloc` when using ``nan`` as the only element (:issue:`52234`)
- Bug in :meth:`DataFrame.where`, :meth:`DataFrame.mask`, :meth:`Series.where`, and :meth:`Series.mask` silently filling ``pd.NA`` entries of a nullable boolean ``cond``; a ``ValueError`` is now raised when ``cond`` contains ``pd.NA`` (:issue:`52955`)
- Bug in :meth:`Series.loc` casting :class:`Series` to ``np.ndarray`` when assigning :class:`Series` at predefined index of ``object`` dtype :class:`Series` (:issue:`48933`)

Missing
Expand Down
29 changes: 18 additions & 11 deletions pandas/core/generic.py
Original file line number Diff line number Diff line change
Expand Up @@ -10292,42 +10292,49 @@ def _where(
axis = self._get_axis_number(axis)

# align the cond to same shape as myself
cond_hasna: bool_t
cond = common.apply_if_callable(cond, self)
if isinstance(cond, NDFrame):
# CoW: Make sure reference is not kept alive
if cond.ndim == 1 and self.ndim == 2:
cond = cond._constructor_expanddim(
{i: cond for i in range(len(self.columns))},
copy=False,
cond_hasna = cond.isna().any(axis=None)
if not cond_hasna:
if cond.ndim == 1 and self.ndim == 2:
cond = cond._constructor_expanddim(
{i: cond for i in range(len(self.columns))},
copy=False,
)
cond.columns = self.columns
cond = cond.align(self, join="right", copy=False)[0].fillna(
bool(inplace)
)
cond.columns = self.columns
cond = cond.align(self, join="right", copy=False)[0]
else:
if not hasattr(cond, "shape"):
cond = np.asanyarray(cond)
if cond.shape != self.shape:
raise ValueError("Array conditional must be same shape as self")
cond = self._constructor(cond, **self._construct_axes_dict(), copy=False)

# make sure we are boolean
fill_value = bool(inplace)
cond = cond.fillna(fill_value)
cond_hasna = cond.isna().any(axis=None)

msg = "Boolean array expected for the condition, not {dtype}"
na_msg = "Cannot mask with an array containing NA / NaN values"

if not cond.empty:
if not isinstance(cond, ABCDataFrame):
# This is a single-dimensional object.
if not is_bool_dtype(cond):
raise ValueError(msg.format(dtype=cond.dtype))
if cond_hasna:
raise ValueError(na_msg)
else:
for _dt in cond.dtypes:
if not is_bool_dtype(_dt):
raise ValueError(msg.format(dtype=_dt))
if cond_hasna:
raise ValueError(na_msg)
if cond._mgr.any_extension_types:
# GH51574: avoid object ndarray conversion later on
cond = cond._constructor(
cond.to_numpy(dtype=bool, na_value=fill_value),
cond.to_numpy(dtype=bool),
**cond._construct_axes_dict(),
)
else:
Expand Down
12 changes: 12 additions & 0 deletions pandas/tests/frame/indexing/test_mask.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
"""

import numpy as np
import pytest

from pandas import (
NA,
DataFrame,
Float64Dtype,
Int64Dtype,
Series,
StringDtype,
Timedelta,
Expand Down Expand Up @@ -150,3 +152,13 @@ def test_mask_inplace_no_other():
df.mask(cond, inplace=True)
expected = DataFrame({"a": [np.nan, 2], "b": ["x", np.nan]})
tm.assert_frame_equal(df, expected)


def test_mask_with_na():
    """GH#52955: ``DataFrame.mask`` raises ``ValueError`` for a condition holding NA."""
    frame = DataFrame([[1, NA], [NA, 2]], dtype=Int64Dtype())
    expected_error = "Cannot mask with an array containing NA / NaN values"

    # The condition is derived once from the whole frame and once from its
    # first column; both comparisons are built from data containing NA.
    cond_sources = (frame, frame[0])
    for source in cond_sources:
        with pytest.raises(ValueError, match=expected_error):
            frame.mask(source % 2 == 1, 0)
13 changes: 0 additions & 13 deletions pandas/tests/frame/indexing/test_setitem.py
Original file line number Diff line number Diff line change
Expand Up @@ -1144,19 +1144,6 @@ def test_loc_setitem_all_false_boolean_two_blocks(self):
df.loc[indexer, ["b"]] = DataFrame({"b": [5, 6]}, index=[0, 1])
tm.assert_frame_equal(df, expected)

def test_setitem_ea_boolean_mask(self):
# GH#47125
df = DataFrame([[-1, 2], [3, -4]])
expected = DataFrame([[0, 2], [3, 0]])
boolean_indexer = DataFrame(
{
0: Series([True, False], dtype="boolean"),
1: Series([pd.NA, True], dtype="boolean"),
}
)
df[boolean_indexer] = 0
tm.assert_frame_equal(df, expected)


class TestDataFrameSetitemCopyViewSemantics:
def test_setitem_always_copy(self, float_frame):
Expand Down
21 changes: 11 additions & 10 deletions pandas/tests/frame/indexing/test_where.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
DataFrame,
DatetimeIndex,
Index,
Int64Dtype,
Series,
StringDtype,
Timestamp,
Expand Down Expand Up @@ -1046,16 +1047,6 @@ def test_where_dt64_2d():
_check_where_equivalences(df, mask, other, expected)


def test_where_producing_ea_cond_for_np_dtype():
    """GH#44014: ``DataFrame.where`` with a callable ``cond`` built row by row.

    The frame mixes an ``Int64`` column containing ``pd.NA`` with a column
    created from a plain list of ints.
    """

    def greater_than_one(row):
        return row > 1

    def build_cond(frame):
        # Row-wise comparison, exactly as the callable passed to ``where``.
        return frame.apply(greater_than_one, axis=1)

    nullable_col = Series([1, pd.NA, 2], dtype="Int64")
    df = DataFrame({"a": nullable_col, "b": [1, 2, 3]})

    result = df.where(build_cond)

    expected_a = Series([pd.NA, pd.NA, 2], dtype="Int64")
    expected = DataFrame({"a": expected_a, "b": [np.nan, 2, 3]})
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
"replacement", [0.001, True, "snake", None, datetime(2022, 5, 4)]
)
Expand All @@ -1075,3 +1066,13 @@ def test_where_inplace_no_other():
df.where(cond, inplace=True)
expected = DataFrame({"a": [1, np.nan], "b": [np.nan, "y"]})
tm.assert_frame_equal(df, expected)


def test_where_with_na():
    """GH#52955: ``DataFrame.where`` raises ``ValueError`` for a condition holding NA."""
    frame = DataFrame([[1, pd.NA], [pd.NA, 2]], dtype=Int64Dtype())
    expected_error = "Cannot mask with an array containing NA / NaN values"

    # The condition is derived once from the whole frame and once from its
    # first column; both comparisons are built from data containing NA.
    cond_sources = (frame, frame[0])
    for source in cond_sources:
        with pytest.raises(ValueError, match=expected_error):
            frame.where(source % 2 == 1, 0)
16 changes: 15 additions & 1 deletion pandas/tests/series/indexing/test_mask.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import numpy as np
import pytest

from pandas import Series
from pandas import (
NA,
Int64Dtype,
Series,
)
import pandas._testing as tm


Expand Down Expand Up @@ -67,3 +71,13 @@ def test_mask_inplace():
rs = s.copy()
rs.mask(cond, -s, inplace=True)
tm.assert_series_equal(rs, s.mask(cond, -s))


def test_mask_with_na():
    """GH#52955: ``Series.mask`` raises ``ValueError`` for a condition holding NA."""
    values = Series([1, 2, NA], dtype=Int64Dtype())
    expected_error = "Cannot mask with an array containing NA / NaN values"

    # Build the condition from the Series itself and from its backing array.
    cond_sources = (values, values.array)
    for source in cond_sources:
        with pytest.raises(ValueError, match=expected_error):
            values.mask(source % 2 == 1, 0)
12 changes: 12 additions & 0 deletions pandas/tests/series/indexing/test_where.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@

import pandas as pd
from pandas import (
NA,
Int64Dtype,
Series,
Timestamp,
date_range,
Expand Down Expand Up @@ -471,3 +473,13 @@ def test_where_datetimelike_categorical(tz_naive_fixture):
res = pd.DataFrame(lvals).where(mask[:, None], pd.DataFrame(rvals))

tm.assert_frame_equal(res, pd.DataFrame(dr))


def test_where_with_na():
    """GH#52955: ``Series.where`` raises ``ValueError`` for a condition holding NA."""
    values = Series([1, 2, NA], dtype=Int64Dtype())
    expected_error = "Cannot mask with an array containing NA / NaN values"

    # Build the condition from the Series itself and from its backing array.
    cond_sources = (values, values.array)
    for source in cond_sources:
        with pytest.raises(ValueError, match=expected_error):
            values.where(source % 2 == 1, 0)