Skip to content

Commit 22b16d7

Browse files
TST (string dtype): Adjust indexing string tests (#59541)
Co-authored-by: Joris Van den Bossche <[email protected]>
1 parent 2e006e7 commit 22b16d7

File tree

6 files changed

+59
-48
lines changed

6 files changed

+59
-48
lines changed

pandas/core/arrays/string_.py

+4
Original file line number | Diff line number | Diff line change
@@ -713,6 +713,10 @@ def __setitem__(self, key, value) -> None:
713713
else:
714714
if not is_array_like(value):
715715
value = np.asarray(value, dtype=object)
716+
else:
717+
# cast categories and friends to arrays to see if values are
718+
# compatible, compatibility with arrow backed strings
719+
value = np.asarray(value)
716720
if len(value) and not lib.is_string_array(value, skipna=True):
717721
raise TypeError("Must provide strings.")
718722

pandas/core/arrays/string_arrow.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -231,7 +231,7 @@ def _maybe_convert_setitem_value(self, value):
231231
value[isna(value)] = None
232232
for v in value:
233233
if not (v is None or isinstance(v, str)):
234-
raise TypeError("Scalar must be NA or str")
234+
raise TypeError("Must provide strings")
235235
return super()._maybe_convert_setitem_value(value)
236236

237237
def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:

pandas/tests/arrays/string_/test_string.py

+1-4
Original file line number | Diff line number | Diff line change
@@ -101,10 +101,7 @@ def test_setitem_validates(cls, dtype):
101101
with pytest.raises(TypeError, match=msg):
102102
arr[0] = 10
103103

104-
if dtype.storage == "python":
105-
msg = "Must provide strings."
106-
else:
107-
msg = "Scalar must be NA or str"
104+
msg = "Must provide strings"
108105
with pytest.raises(TypeError, match=msg):
109106
arr[:] = np.array([1, 2])
110107

pandas/tests/indexing/test_iloc.py

+16-15
Original file line number | Diff line number | Diff line change
@@ -6,8 +6,6 @@
66
import numpy as np
77
import pytest
88

9-
from pandas._config import using_string_dtype
10-
119
from pandas.errors import IndexingError
1210
import pandas.util._test_decorators as td
1311

@@ -1218,22 +1216,25 @@ def test_iloc_getitem_int_single_ea_block_view(self):
12181216
arr[2] = arr[-1]
12191217
assert ser[0] == arr[-1]
12201218

1221-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
1222-
def test_iloc_setitem_multicolumn_to_datetime(self):
1219+
def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string):
12231220
# GH#20511
12241221
df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]})
12251222

1226-
df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
1227-
expected = DataFrame(
1228-
{
1229-
"A": [
1230-
Timestamp("2021-01-01 00:00:00"),
1231-
Timestamp("2022-01-01 00:00:00"),
1232-
],
1233-
"B": ["2021", "2022"],
1234-
}
1235-
)
1236-
tm.assert_frame_equal(df, expected, check_dtype=False)
1223+
if using_infer_string:
1224+
with pytest.raises(TypeError, match="Invalid value"):
1225+
df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
1226+
else:
1227+
df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
1228+
expected = DataFrame(
1229+
{
1230+
"A": [
1231+
Timestamp("2021-01-01 00:00:00"),
1232+
Timestamp("2022-01-01 00:00:00"),
1233+
],
1234+
"B": ["2021", "2022"],
1235+
}
1236+
)
1237+
tm.assert_frame_equal(df, expected, check_dtype=False)
12371238

12381239

12391240
class TestILocErrors:

pandas/tests/indexing/test_indexing.py

+7-11
Original file line number | Diff line number | Diff line change
@@ -8,8 +8,6 @@
88
import numpy as np
99
import pytest
1010

11-
from pandas._config import using_string_dtype
12-
1311
from pandas.errors import IndexingError
1412

1513
from pandas.core.dtypes.common import (
@@ -563,12 +561,12 @@ def test_string_slice_empty(self):
563561
with pytest.raises(KeyError, match="^0$"):
564562
df.loc["2011", 0]
565563

566-
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
567564
def test_astype_assignment(self, using_infer_string):
568565
# GH4312 (iloc)
569566
df_orig = DataFrame(
570567
[["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
571568
)
569+
df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object)
572570

573571
df = df_orig.copy()
574572

@@ -578,9 +576,9 @@ def test_astype_assignment(self, using_infer_string):
578576
expected = DataFrame(
579577
[[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
580578
)
581-
if not using_infer_string:
582-
expected["A"] = expected["A"].astype(object)
583-
expected["B"] = expected["B"].astype(object)
579+
expected[list("CDG")] = expected[list("CDG")].astype(object)
580+
expected["A"] = expected["A"].astype(object)
581+
expected["B"] = expected["B"].astype(object)
584582
tm.assert_frame_equal(df, expected)
585583

586584
# GH5702 (loc)
@@ -589,18 +587,16 @@ def test_astype_assignment(self, using_infer_string):
589587
expected = DataFrame(
590588
[[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
591589
)
592-
if not using_infer_string:
593-
expected["A"] = expected["A"].astype(object)
590+
expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
594591
tm.assert_frame_equal(df, expected)
595592

596593
df = df_orig.copy()
594+
597595
df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
598596
expected = DataFrame(
599597
[["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
600598
)
601-
if not using_infer_string:
602-
expected["B"] = expected["B"].astype(object)
603-
expected["C"] = expected["C"].astype(object)
599+
expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
604600
tm.assert_frame_equal(df, expected)
605601

606602
def test_astype_assignment_full_replacements(self):

pandas/tests/indexing/test_loc.py

+30-17
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
""" test label based indexing with loc """
22
from collections import namedtuple
3+
import contextlib
34
from datetime import (
45
date,
56
datetime,
@@ -648,8 +649,9 @@ def test_loc_setitem_consistency_empty(self):
648649
expected["x"] = expected["x"].astype(np.int64)
649650
tm.assert_frame_equal(df, expected)
650651

652+
# incompatible dtype warning
651653
@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
652-
def test_loc_setitem_consistency_slice_column_len(self):
654+
def test_loc_setitem_consistency_slice_column_len(self, using_infer_string):
653655
# .loc[:,column] setting with slice == len of the column
654656
# GH10408
655657
levels = [
@@ -673,13 +675,24 @@ def test_loc_setitem_consistency_slice_column_len(self):
673675
]
674676
df = DataFrame(values, index=mi, columns=cols)
675677

676-
df.loc[:, ("Respondent", "StartDate")] = to_datetime(
677-
df.loc[:, ("Respondent", "StartDate")]
678-
)
679-
df.loc[:, ("Respondent", "EndDate")] = to_datetime(
680-
df.loc[:, ("Respondent", "EndDate")]
681-
)
682-
df = df.infer_objects(copy=False)
678+
ctx = contextlib.nullcontext()
679+
if using_infer_string:
680+
ctx = pytest.raises(TypeError, match="Invalid value")
681+
682+
with ctx:
683+
df.loc[:, ("Respondent", "StartDate")] = to_datetime(
684+
df.loc[:, ("Respondent", "StartDate")]
685+
)
686+
with ctx:
687+
df.loc[:, ("Respondent", "EndDate")] = to_datetime(
688+
df.loc[:, ("Respondent", "EndDate")]
689+
)
690+
691+
if using_infer_string:
692+
# infer-objects won't infer stuff anymore
693+
return
694+
695+
df = df.infer_objects()
683696

684697
# Adding a new key
685698
df.loc[:, ("Respondent", "Duration")] = (
@@ -1269,20 +1282,23 @@ def test_loc_reverse_assignment(self):
12691282

12701283
tm.assert_series_equal(result, expected)
12711284

1272-
@pytest.mark.xfail(using_string_dtype(), reason="can't set int into string")
1273-
def test_loc_setitem_str_to_small_float_conversion_type(self):
1285+
def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string):
12741286
# GH#20388
12751287

12761288
col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)]
12771289
result = DataFrame(col_data, columns=["A"])
1278-
expected = DataFrame(col_data, columns=["A"], dtype=object)
1290+
expected = DataFrame(col_data, columns=["A"])
12791291
tm.assert_frame_equal(result, expected)
12801292

12811293
# assigning with loc/iloc attempts to set the values inplace, which
12821294
# in this case is successful
1283-
result.loc[result.index, "A"] = [float(x) for x in col_data]
1284-
expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
1285-
tm.assert_frame_equal(result, expected)
1295+
if using_infer_string:
1296+
with pytest.raises(TypeError, match="Must provide strings"):
1297+
result.loc[result.index, "A"] = [float(x) for x in col_data]
1298+
else:
1299+
result.loc[result.index, "A"] = [float(x) for x in col_data]
1300+
expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
1301+
tm.assert_frame_equal(result, expected)
12861302

12871303
# assigning the entire column using __setitem__ swaps in the new array
12881304
# GH#???
@@ -1458,9 +1474,6 @@ def test_loc_setitem_categorical_values_partial_column_slice(self):
14581474
df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
14591475
tm.assert_frame_equal(df, exp)
14601476

1461-
@pytest.mark.xfail(
1462-
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
1463-
)
14641477
def test_loc_setitem_single_row_categorical(self, using_infer_string):
14651478
# GH#25495
14661479
df = DataFrame({"Alpha": ["a"], "Numeric": [0]})

0 commit comments

Comments
 (0)