Skip to content

Commit 7db3ca4

Browse files
Backport PR #60454 on branch 2.3.x (String dtype: coerce missing values in indexers for string dtype Index) (#60649)
* Backport PR #60454: String dtype: coerce missing values in indexers for string dtype Index
* fixup import

---------

Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent d90edeb commit 7db3ca4

File tree

4 files changed: +22 -34 lines changed

pandas/_libs/index.pyx

+1-9
Original file line number | Diff line number | Diff line change
@@ -536,23 +536,15 @@ cdef class StringObjectEngine(ObjectEngine):
536536

537537
cdef:
538538
object na_value
539-
bint uses_na
540539

541540
def __init__(self, ndarray values, na_value):
542541
super().__init__(values)
543542
self.na_value = na_value
544-
self.uses_na = na_value is C_NA
545-
546-
cdef bint _checknull(self, object val):
547-
if self.uses_na:
548-
return val is C_NA
549-
else:
550-
return util.is_nan(val)
551543

552544
cdef _check_type(self, object val):
553545
if isinstance(val, str):
554546
return val
555-
elif self._checknull(val):
547+
elif checknull(val):
556548
return self.na_value
557549
else:
558550
raise KeyError(val)

pandas/tests/frame/indexing/test_indexing.py

-1
Original file line number | Diff line number | Diff line change
@@ -517,7 +517,6 @@ def test_setitem_ambig(self, using_infer_string):
517517
else:
518518
assert dm[2].dtype == np.object_
519519

520-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
521520
def test_setitem_None(self, float_frame):
522521
# GH #766
523522
float_frame[None] = float_frame["A"]

pandas/tests/indexes/string/test_indexing.py

+16-17
Original file line number | Diff line number | Diff line change
@@ -13,6 +13,15 @@ def _isnan(val):
1313
return False
1414

1515

16+
def _equivalent_na(dtype, null):
17+
if dtype.na_value is pd.NA and null is pd.NA:
18+
return True
19+
elif _isnan(dtype.na_value) and _isnan(null):
20+
return True
21+
else:
22+
return False
23+
24+
1625
class TestGetLoc:
1726
def test_get_loc(self, any_string_dtype):
1827
index = Index(["a", "b", "c"], dtype=any_string_dtype)
@@ -41,14 +50,7 @@ def test_get_loc_non_missing(self, any_string_dtype, nulls_fixture):
4150

4251
def test_get_loc_missing(self, any_string_dtype, nulls_fixture):
4352
index = Index(["a", "b", nulls_fixture], dtype=any_string_dtype)
44-
if any_string_dtype == "string" and (
45-
(any_string_dtype.na_value is pd.NA and nulls_fixture is not pd.NA)
46-
or (_isnan(any_string_dtype.na_value) and not _isnan(nulls_fixture))
47-
):
48-
with pytest.raises(KeyError):
49-
index.get_loc(nulls_fixture)
50-
else:
51-
assert index.get_loc(nulls_fixture) == 2
53+
assert index.get_loc(nulls_fixture) == 2
5254

5355

5456
class TestGetIndexer:
@@ -93,9 +95,8 @@ def test_get_indexer_missing(self, any_string_dtype, null, using_infer_string):
9395
result = index.get_indexer(["a", null, "c"])
9496
if using_infer_string:
9597
expected = np.array([0, 2, -1], dtype=np.intp)
96-
elif any_string_dtype == "string" and (
97-
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
98-
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
98+
elif any_string_dtype == "string" and not _equivalent_na(
99+
any_string_dtype, null
99100
):
100101
expected = np.array([0, -1, -1], dtype=np.intp)
101102
else:
@@ -115,9 +116,8 @@ def test_get_indexer_non_unique_nas(
115116
if using_infer_string:
116117
expected_indexer = np.array([0, 2], dtype=np.intp)
117118
expected_missing = np.array([], dtype=np.intp)
118-
elif any_string_dtype == "string" and (
119-
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
120-
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
119+
elif any_string_dtype == "string" and not _equivalent_na(
120+
any_string_dtype, null
121121
):
122122
expected_indexer = np.array([0, -1], dtype=np.intp)
123123
expected_missing = np.array([1], dtype=np.intp)
@@ -133,9 +133,8 @@ def test_get_indexer_non_unique_nas(
133133

134134
if using_infer_string:
135135
expected_indexer = np.array([0, 1, 3], dtype=np.intp)
136-
elif any_string_dtype == "string" and (
137-
(any_string_dtype.na_value is pd.NA and null is not pd.NA)
138-
or (_isnan(any_string_dtype.na_value) and not _isnan(null))
136+
elif any_string_dtype == "string" and not _equivalent_na(
137+
any_string_dtype, null
139138
):
140139
pass
141140
else:

pandas/tests/reshape/test_pivot.py

+5-7
Original file line number | Diff line number | Diff line change
@@ -2619,6 +2619,8 @@ def test_pivot_columns_not_given(self):
26192619
with pytest.raises(TypeError, match="missing 1 required keyword-only argument"):
26202620
df.pivot() # pylint: disable=missing-kwoa
26212621

2622+
# this still fails because columns=None gets passed down to unstack as level=None
2623+
# while at that point None was converted to NaN
26222624
@pytest.mark.xfail(
26232625
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
26242626
)
@@ -2637,10 +2639,7 @@ def test_pivot_columns_is_none(self):
26372639
expected = DataFrame({1: 3}, index=Index([2], name="b"))
26382640
tm.assert_frame_equal(result, expected)
26392641

2640-
@pytest.mark.xfail(
2641-
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
2642-
)
2643-
def test_pivot_index_is_none(self):
2642+
def test_pivot_index_is_none(self, using_infer_string):
26442643
# GH#48293
26452644
df = DataFrame({None: [1], "b": 2, "c": 3})
26462645

@@ -2651,11 +2650,10 @@ def test_pivot_index_is_none(self):
26512650

26522651
result = df.pivot(columns="b", index=None, values="c")
26532652
expected = DataFrame(3, index=[1], columns=Index([2], name="b"))
2653+
if using_infer_string:
2654+
expected.index.name = np.nan
26542655
tm.assert_frame_equal(result, expected)
26552656

2656-
@pytest.mark.xfail(
2657-
using_string_dtype(), reason="TODO(infer_string) None is cast to NaN"
2658-
)
26592657
def test_pivot_values_is_none(self):
26602658
# GH#48293
26612659
df = DataFrame({None: [1], "b": 2, "c": 3})

0 commit comments

Comments
 (0)