From 2f5221fe72055bcc818b8433da535df0b2edb23c Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sat, 17 Aug 2024 21:41:45 +0200
Subject: [PATCH 1/4] TST (string-dtype): Adjust indexing string tests

---
 pandas/core/arrays/string_.py          |  4 +++
 pandas/tests/indexing/test_iloc.py     | 31 ++++++++--------
 pandas/tests/indexing/test_indexing.py | 49 +++++++++++++++-----------
 pandas/tests/indexing/test_loc.py      | 48 ++++++++++++++-----------
 4 files changed, 76 insertions(+), 56 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 823084c3e9982..03aa676c4e24c 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -688,6 +688,10 @@ def __setitem__(self, key, value) -> None:
         else:
             if not is_array_like(value):
                 value = np.asarray(value, dtype=object)
+            else:
+                # Cast categoricals and other array-likes to an ndarray so the
+                # string check below applies, as for arrow-backed strings
+                value = np.asarray(value)
             if len(value) and not lib.is_string_array(value, skipna=True):
                 raise TypeError("Must provide strings.")
 
diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
index b05b5d3dea2dc..dc95e1bb1b8a0 100644
--- a/pandas/tests/indexing/test_iloc.py
+++ b/pandas/tests/indexing/test_iloc.py
@@ -6,8 +6,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import IndexingError
 
 from pandas import (
@@ -1198,22 +1196,25 @@ def test_iloc_getitem_int_single_ea_block_view(self):
         arr[2] = arr[-1]
         assert ser[0] == arr[-1]
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_iloc_setitem_multicolumn_to_datetime(self):
+    def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string):
         # GH#20511
         df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]})
 
-        df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
-        expected = DataFrame(
-            {
-                "A": [
-                    Timestamp("2021-01-01 00:00:00"),
-                    Timestamp("2022-01-01 00:00:00"),
-                ],
-                "B": ["2021", "2022"],
-            }
-        )
-        tm.assert_frame_equal(df, expected, check_dtype=False)
+        if using_infer_string:
+            with pytest.raises(TypeError, match="Invalid value"):
+                df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
+        else:
+            df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
+            expected = DataFrame(
+                {
+                    "A": [
+                        Timestamp("2021-01-01 00:00:00"),
+                        Timestamp("2022-01-01 00:00:00"),
+                    ],
+                    "B": ["2021", "2022"],
+                }
+            )
+            tm.assert_frame_equal(df, expected, check_dtype=False)
 
 
 class TestILocErrors:
diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index ef8c0e432ca49..9a76dc6814171 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import IndexingError
 
 from pandas.core.dtypes.common import (
@@ -528,7 +526,6 @@ def test_string_slice_empty(self):
         with pytest.raises(KeyError, match="^0$"):
             df.loc["2011", 0]
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_astype_assignment(self, using_infer_string):
         # GH4312 (iloc)
         df_orig = DataFrame(
@@ -539,34 +536,44 @@ def test_astype_assignment(self, using_infer_string):
 
         # with the enforcement of GH#45333 in 2.0, this setting is attempted inplace,
         #  so object dtype is retained
-        df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
-        expected = DataFrame(
-            [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
-        )
-        if not using_infer_string:
+        if using_infer_string:
+            with pytest.raises(TypeError, match="Invalid value"):
+                df.iloc[:, 0] = df.iloc[:, 0].astype(np.int64)
+        else:
+            df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
+            expected = DataFrame(
+                [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
+            )
             expected["A"] = expected["A"].astype(object)
             expected["B"] = expected["B"].astype(object)
-        tm.assert_frame_equal(df, expected)
+            tm.assert_frame_equal(df, expected)
 
         # GH5702 (loc)
         df = df_orig.copy()
-        df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
-        expected = DataFrame(
-            [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
-        )
-        if not using_infer_string:
+        if using_infer_string:
+            with pytest.raises(TypeError, match="Invalid value"):
+                df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
+        else:
+            df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
+            expected = DataFrame(
+                [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
+            )
             expected["A"] = expected["A"].astype(object)
-        tm.assert_frame_equal(df, expected)
+            tm.assert_frame_equal(df, expected)
 
         df = df_orig.copy()
-        df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
-        expected = DataFrame(
-            [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
-        )
-        if not using_infer_string:
+
+        if using_infer_string:
+            with pytest.raises(TypeError, match="Invalid value"):
+                df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
+        else:
+            df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
+            expected = DataFrame(
+                [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
+            )
             expected["B"] = expected["B"].astype(object)
             expected["C"] = expected["C"].astype(object)
-        tm.assert_frame_equal(df, expected)
+            tm.assert_frame_equal(df, expected)
 
     def test_astype_assignment_full_replacements(self):
         # full replacements / no nans
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index e007b8c4e97ac..51b10b6eedddc 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -1,6 +1,7 @@
 """test label based indexing with loc"""
 
 from collections import namedtuple
+import contextlib
 from datetime import (
     date,
     datetime,
@@ -13,10 +14,7 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs import index as libindex
-from pandas.compat import HAS_PYARROW
 from pandas.errors import IndexingError
 
 import pandas as pd
@@ -615,8 +613,7 @@ def test_loc_setitem_consistency_empty(self):
         expected["x"] = expected["x"].astype(np.int64)
         tm.assert_frame_equal(df, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_loc_setitem_consistency_slice_column_len(self):
+    def test_loc_setitem_consistency_slice_column_len(self, using_infer_string):
         # .loc[:,column] setting with slice == len of the column
         # GH10408
         levels = [
@@ -640,12 +637,23 @@ def test_loc_setitem_consistency_slice_column_len(self):
         ]
         df = DataFrame(values, index=mi, columns=cols)
 
-        df.loc[:, ("Respondent", "StartDate")] = to_datetime(
-            df.loc[:, ("Respondent", "StartDate")]
-        )
-        df.loc[:, ("Respondent", "EndDate")] = to_datetime(
-            df.loc[:, ("Respondent", "EndDate")]
-        )
+        ctx = contextlib.nullcontext()
+        if using_infer_string:
+            ctx = pytest.raises(TypeError, match="Invalid value")
+
+        with ctx:
+            df.loc[:, ("Respondent", "StartDate")] = to_datetime(
+                df.loc[:, ("Respondent", "StartDate")]
+            )
+        with ctx:
+            df.loc[:, ("Respondent", "EndDate")] = to_datetime(
+                df.loc[:, ("Respondent", "EndDate")]
+            )
+
+        if using_infer_string:
+            # both setitems raised above, so no datetime values were set
+            return
+
         df = df.infer_objects()
 
         # Adding a new key
@@ -1211,20 +1219,23 @@ def test_loc_reverse_assignment(self):
 
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string")
-    def test_loc_setitem_str_to_small_float_conversion_type(self):
+    def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string):
         # GH#20388
 
         col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)]
         result = DataFrame(col_data, columns=["A"])
-        expected = DataFrame(col_data, columns=["A"], dtype=object)
+        expected = DataFrame(col_data, columns=["A"])
         tm.assert_frame_equal(result, expected)
 
         # assigning with loc/iloc attempts to set the values inplace, which
         #  in this case is successful
-        result.loc[result.index, "A"] = [float(x) for x in col_data]
-        expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
-        tm.assert_frame_equal(result, expected)
+        if using_infer_string:
+            with pytest.raises(TypeError, match="Scalar must"):
+                result.loc[result.index, "A"] = [float(x) for x in col_data]
+        else:
+            result.loc[result.index, "A"] = [float(x) for x in col_data]
+            expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
+            tm.assert_frame_equal(result, expected)
 
         # assigning the entire column using __setitem__ swaps in the new array
         # GH#???
@@ -1389,9 +1400,6 @@ def test_loc_setitem_categorical_values_partial_column_slice(self):
             df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
             df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
 
-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-    )
     def test_loc_setitem_single_row_categorical(self, using_infer_string):
         # GH#25495
         df = DataFrame({"Alpha": ["a"], "Numeric": [0]})

From cf3c44eff080effece4ef79efd465900fddde427 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Mon, 19 Aug 2024 09:15:17 +0200
Subject: [PATCH 2/4] TST: use object dtype in test_astype_assignment

---
 pandas/tests/indexing/test_indexing.py | 53 ++++++++++----------------
 1 file changed, 21 insertions(+), 32 deletions(-)

diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
index 9a76dc6814171..c793c75695a86 100644
--- a/pandas/tests/indexing/test_indexing.py
+++ b/pandas/tests/indexing/test_indexing.py
@@ -531,49 +531,38 @@ def test_astype_assignment(self, using_infer_string):
         df_orig = DataFrame(
             [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
         )
+        df_orig[list("ABCDG")] = df_orig[list("ABCDG")].astype(object)
 
         df = df_orig.copy()
 
         # with the enforcement of GH#45333 in 2.0, this setting is attempted inplace,
         #  so object dtype is retained
-        if using_infer_string:
-            with pytest.raises(TypeError, match="Invalid value"):
-                df.iloc[:, 0] = df.iloc[:, 0].astype(np.int64)
-        else:
-            df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
-            expected = DataFrame(
-                [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
-            )
-            expected["A"] = expected["A"].astype(object)
-            expected["B"] = expected["B"].astype(object)
-            tm.assert_frame_equal(df, expected)
+        df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
+        expected = DataFrame(
+            [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
+        )
+        expected[list("CDG")] = expected[list("CDG")].astype(object)
+        expected["A"] = expected["A"].astype(object)
+        expected["B"] = expected["B"].astype(object)
+        tm.assert_frame_equal(df, expected)
 
         # GH5702 (loc)
         df = df_orig.copy()
-        if using_infer_string:
-            with pytest.raises(TypeError, match="Invalid value"):
-                df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
-        else:
-            df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
-            expected = DataFrame(
-                [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
-            )
-            expected["A"] = expected["A"].astype(object)
-            tm.assert_frame_equal(df, expected)
+        df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
+        expected = DataFrame(
+            [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
+        )
+        expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
+        tm.assert_frame_equal(df, expected)
 
         df = df_orig.copy()
 
-        if using_infer_string:
-            with pytest.raises(TypeError, match="Invalid value"):
-                df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
-        else:
-            df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
-            expected = DataFrame(
-                [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
-            )
-            expected["B"] = expected["B"].astype(object)
-            expected["C"] = expected["C"].astype(object)
-            tm.assert_frame_equal(df, expected)
+        df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
+        expected = DataFrame(
+            [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
+        )
+        expected[list("ABCDG")] = expected[list("ABCDG")].astype(object)
+        tm.assert_frame_equal(df, expected)
 
     def test_astype_assignment_full_replacements(self):
         # full replacements / no nans

From 4bb986ab14c4862f641fe0b9ce89136bd6fde72a Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 9 Sep 2024 11:36:26 +0200
Subject: [PATCH 3/4] make error message consistent

---
 pandas/core/arrays/string_arrow.py | 2 +-
 pandas/tests/indexing/test_loc.py  | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 97381b82ceab9..1e5adf106752f 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -240,7 +240,7 @@ def _maybe_convert_setitem_value(self, value):
             value[isna(value)] = None
             for v in value:
                 if not (v is None or isinstance(v, str)):
-                    raise TypeError("Scalar must be NA or str")
+                    raise TypeError("Must provide strings")
         return super()._maybe_convert_setitem_value(value)
 
     def isin(self, values: ArrayLike) -> npt.NDArray[np.bool_]:
diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
index 51b10b6eedddc..36b08ee1df790 100644
--- a/pandas/tests/indexing/test_loc.py
+++ b/pandas/tests/indexing/test_loc.py
@@ -1230,7 +1230,7 @@ def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string
         # assigning with loc/iloc attempts to set the values inplace, which
         #  in this case is successful
         if using_infer_string:
-            with pytest.raises(TypeError, match="Scalar must"):
+            with pytest.raises(TypeError, match="Must provide strings"):
                 result.loc[result.index, "A"] = [float(x) for x in col_data]
         else:
             result.loc[result.index, "A"] = [float(x) for x in col_data]

From 677e06389f04ce835d4dc164f521e031b5e0f225 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 9 Sep 2024 12:08:33 +0200
Subject: [PATCH 4/4] TST: expect unified message in test_setitem_validates

---
 pandas/tests/arrays/string_/test_string.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index dd87dbf8e9a43..87bd1d5921caa 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -102,10 +102,7 @@ def test_setitem_validates(cls, dtype):
     with pytest.raises(TypeError, match=msg):
         arr[0] = 10
 
-    if dtype.storage == "python":
-        msg = "Must provide strings."
-    else:
-        msg = "Scalar must be NA or str"
+    msg = "Must provide strings"
     with pytest.raises(TypeError, match=msg):
         arr[:] = np.array([1, 2])