Skip to content

Commit 0d2505d

Browse files
String dtype: fix isin() values handling for python storage (#59759)
* String dtype: fix isin() values handling for python storage
* address feedback
1 parent 2c49f55 commit 0d2505d

File tree

3 files changed

+64
-6
lines changed

3 files changed

+64
-6
lines changed

pandas/conftest.py

+8-1
Original file line number | Diff line number | Diff line change
@@ -1338,7 +1338,13 @@ def string_storage(request):
13381338
pytest.param(("pyarrow", pd.NA), marks=td.skip_if_no("pyarrow")),
13391339
pytest.param(("pyarrow", np.nan), marks=td.skip_if_no("pyarrow")),
13401340
("python", np.nan),
1341-
]
1341+
],
1342+
ids=[
1343+
"string=string[python]",
1344+
"string=string[pyarrow]",
1345+
"string=str[pyarrow]",
1346+
"string=str[python]",
1347+
],
13421348
)
13431349
def string_dtype_arguments(request):
13441350
"""
@@ -1369,6 +1375,7 @@ def dtype_backend(request):
13691375

13701376
# Alias so we can test with cartesian product of string_storage
13711377
string_storage2 = string_storage
1378+
string_dtype_arguments2 = string_dtype_arguments
13721379

13731380

13741381
@pytest.fixture(params=tm.BYTES_DTYPES)

pandas/core/arrays/string_.py

+20
Original file line number | Diff line number | Diff line change
@@ -46,6 +46,7 @@
4646
nanops,
4747
ops,
4848
)
49+
from pandas.core.algorithms import isin
4950
from pandas.core.array_algos import masked_reductions
5051
from pandas.core.arrays.base import ExtensionArray
5152
from pandas.core.arrays.floating import (
@@ -65,6 +66,7 @@
6566
import pyarrow
6667

6768
from pandas._typing import (
69+
ArrayLike,
6870
AxisInt,
6971
Dtype,
7072
DtypeObj,
@@ -735,6 +737,24 @@ def _putmask(self, mask: npt.NDArray[np.bool_], value) -> None:
735737
# base class implementation that uses __setitem__
736738
ExtensionArray._putmask(self, mask, value)
737739

740+
def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
741+
if isinstance(values, BaseStringArray) or (
742+
isinstance(values, ExtensionArray) and is_string_dtype(values.dtype)
743+
):
744+
values = values.astype(self.dtype, copy=False)
745+
else:
746+
if not lib.is_string_array(np.asarray(values), skipna=True):
747+
values = np.array(
748+
[val for val in values if isinstance(val, str) or isna(val)],
749+
dtype=object,
750+
)
751+
if not len(values):
752+
return np.zeros(self.shape, dtype=bool)
753+
754+
values = self._from_sequence(values, dtype=self.dtype)
755+
756+
return isin(np.asarray(self), np.asarray(values))
757+
738758
def astype(self, dtype, copy: bool = True):
739759
dtype = pandas_dtype(dtype)
740760

pandas/tests/arrays/string_/test_string.py

+36-5
Original file line number | Diff line number | Diff line change
@@ -30,6 +30,12 @@ def dtype(string_dtype_arguments):
3030
return pd.StringDtype(storage=storage, na_value=na_value)
3131

3232

33+
@pytest.fixture
34+
def dtype2(string_dtype_arguments2):
35+
storage, na_value = string_dtype_arguments2
36+
return pd.StringDtype(storage=storage, na_value=na_value)
37+
38+
3339
@pytest.fixture
3440
def cls(dtype):
3541
"""Fixture giving array type from parametrized 'dtype'"""
@@ -662,11 +668,7 @@ def test_isin(dtype, fixed_now_ts):
662668
tm.assert_series_equal(result, expected)
663669

664670
result = s.isin(["a", pd.NA])
665-
if dtype.storage == "python" and dtype.na_value is np.nan:
666-
# TODO(infer_string) we should make this consistent
667-
expected = pd.Series([True, False, False])
668-
else:
669-
expected = pd.Series([True, False, True])
671+
expected = pd.Series([True, False, True])
670672
tm.assert_series_equal(result, expected)
671673

672674
result = s.isin([])
@@ -677,6 +679,35 @@ def test_isin(dtype, fixed_now_ts):
677679
expected = pd.Series([True, False, False])
678680
tm.assert_series_equal(result, expected)
679681

682+
result = s.isin([fixed_now_ts])
683+
expected = pd.Series([False, False, False])
684+
tm.assert_series_equal(result, expected)
685+
686+
687+
def test_isin_string_array(dtype, dtype2):
688+
s = pd.Series(["a", "b", None], dtype=dtype)
689+
690+
result = s.isin(pd.array(["a", "c"], dtype=dtype2))
691+
expected = pd.Series([True, False, False])
692+
tm.assert_series_equal(result, expected)
693+
694+
result = s.isin(pd.array(["a", None], dtype=dtype2))
695+
expected = pd.Series([True, False, True])
696+
tm.assert_series_equal(result, expected)
697+
698+
699+
def test_isin_arrow_string_array(dtype):
700+
pa = pytest.importorskip("pyarrow")
701+
s = pd.Series(["a", "b", None], dtype=dtype)
702+
703+
result = s.isin(pd.array(["a", "c"], dtype=pd.ArrowDtype(pa.string())))
704+
expected = pd.Series([True, False, False])
705+
tm.assert_series_equal(result, expected)
706+
707+
result = s.isin(pd.array(["a", None], dtype=pd.ArrowDtype(pa.string())))
708+
expected = pd.Series([True, False, True])
709+
tm.assert_series_equal(result, expected)
710+
680711

681712
def test_setitem_scalar_with_mask_validation(dtype):
682713
# https://github.com/pandas-dev/pandas/issues/47628

0 commit comments

Comments
 (0)