pandas-dev · jorisvandenbossche · Sep 9, 2024 · Aug 17, 2024 · Aug 19, 2024 · Aug 22, 2024
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -688,6 +688,10 @@ def __setitem__(self, key, value) -> None:
         else:
             if not is_array_like(value):
                 value = np.asarray(value, dtype=object)
+            else:
+                # cast categoricals and other array-likes to an ndarray to check
+                # that the values are strings (as for arrow-backed strings)
+                value = np.asarray(value)
             if len(value) and not lib.is_string_array(value, skipna=True):
                 raise TypeError("Must provide strings.")
 

diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
@@ -6,8 +6,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import IndexingError
 
 from pandas import (
@@ -1198,22 +1196,25 @@ def test_iloc_getitem_int_single_ea_block_view(self):
         arr[2] = arr[-1]
         assert ser[0] == arr[-1]
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_iloc_setitem_multicolumn_to_datetime(self):
+    def test_iloc_setitem_multicolumn_to_datetime(self, using_infer_string):
         # GH#20511
         df = DataFrame({"A": ["2022-01-01", "2022-01-02"], "B": ["2021", "2022"]})
 
-        df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
-        expected = DataFrame(
-            {
-                "A": [
-                    Timestamp("2021-01-01 00:00:00"),
-                    Timestamp("2022-01-01 00:00:00"),
-                ],
-                "B": ["2021", "2022"],
-            }
-        )
-        tm.assert_frame_equal(df, expected, check_dtype=False)
+        if using_infer_string:
+            with pytest.raises(TypeError, match="Invalid value"):
+                df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
+        else:
+            df.iloc[:, [0]] = DataFrame({"A": to_datetime(["2021", "2022"])})
+            expected = DataFrame(
+                {
+                    "A": [
+                        Timestamp("2021-01-01 00:00:00"),
+                        Timestamp("2022-01-01 00:00:00"),
+                    ],
+                    "B": ["2021", "2022"],
+                }
+            )
+            tm.assert_frame_equal(df, expected, check_dtype=False)
 
 
 class TestILocErrors:

diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
@@ -8,8 +8,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas.errors import IndexingError
 
 from pandas.core.dtypes.common import (
@@ -528,7 +526,6 @@ def test_string_slice_empty(self):
         with pytest.raises(KeyError, match="^0$"):
             df.loc["2011", 0]
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
     def test_astype_assignment(self, using_infer_string):
         # GH4312 (iloc)
         df_orig = DataFrame(
@@ -539,34 +536,44 @@ def test_astype_assignment(self, using_infer_string):
 
         # with the enforcement of GH#45333 in 2.0, this setting is attempted inplace,
         #  so object dtype is retained
-        df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
-        expected = DataFrame(
-            [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
-        )
-        if not using_infer_string:
+        if using_infer_string:
+            with pytest.raises(TypeError, match="Invalid value"):
+                df.iloc[:, 0] = df.iloc[:, 0].astype(np.int64)
+        else:
+            df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
+            expected = DataFrame(
+                [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
+            )
             expected["A"] = expected["A"].astype(object)
             expected["B"] = expected["B"].astype(object)
-        tm.assert_frame_equal(df, expected)
+            tm.assert_frame_equal(df, expected)
 
         # GH5702 (loc)
         df = df_orig.copy()
-        df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
-        expected = DataFrame(
-            [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
-        )
-        if not using_infer_string:
+        if using_infer_string:
+            with pytest.raises(TypeError, match="Invalid value"):
+                df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
+        else:
+            df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
+            expected = DataFrame(
+                [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
+            )
             expected["A"] = expected["A"].astype(object)
-        tm.assert_frame_equal(df, expected)
+            tm.assert_frame_equal(df, expected)
 
         df = df_orig.copy()
-        df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
-        expected = DataFrame(
-            [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
-        )
-        if not using_infer_string:
+
+        if using_infer_string:
+            with pytest.raises(TypeError, match="Invalid value"):
+                df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
+        else:
+            df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
+            expected = DataFrame(
+                [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
+            )
             expected["B"] = expected["B"].astype(object)
             expected["C"] = expected["C"].astype(object)
-        tm.assert_frame_equal(df, expected)
+            tm.assert_frame_equal(df, expected)
 
     def test_astype_assignment_full_replacements(self):
         # full replacements / no nans

diff --git a/pandas/tests/indexing/test_loc.py b/pandas/tests/indexing/test_loc.py
@@ -1,6 +1,7 @@
 """test label based indexing with loc"""
 
 from collections import namedtuple
+import contextlib
 from datetime import (
     date,
     datetime,
@@ -13,10 +14,7 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs import index as libindex
-from pandas.compat import HAS_PYARROW
 from pandas.errors import IndexingError
 
 import pandas as pd
@@ -615,8 +613,7 @@ def test_loc_setitem_consistency_empty(self):
         expected["x"] = expected["x"].astype(np.int64)
         tm.assert_frame_equal(df, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)")
-    def test_loc_setitem_consistency_slice_column_len(self):
+    def test_loc_setitem_consistency_slice_column_len(self, using_infer_string):
         # .loc[:,column] setting with slice == len of the column
         # GH10408
         levels = [
@@ -640,12 +637,23 @@ def test_loc_setitem_consistency_slice_column_len(self):
         ]
         df = DataFrame(values, index=mi, columns=cols)
 
-        df.loc[:, ("Respondent", "StartDate")] = to_datetime(
-            df.loc[:, ("Respondent", "StartDate")]
-        )
-        df.loc[:, ("Respondent", "EndDate")] = to_datetime(
-            df.loc[:, ("Respondent", "EndDate")]
-        )
+        ctx = contextlib.nullcontext()
+        if using_infer_string:
+            ctx = pytest.raises(TypeError, match="Invalid value")
+
+        with ctx:
+            df.loc[:, ("Respondent", "StartDate")] = to_datetime(
+                df.loc[:, ("Respondent", "StartDate")]
+            )
+        with ctx:
+            df.loc[:, ("Respondent", "EndDate")] = to_datetime(
+                df.loc[:, ("Respondent", "EndDate")]
+            )
+
+        if using_infer_string:
+            # infer_objects() won't infer anything anymore
+            return
+
         df = df.infer_objects()
 
         # Adding a new key
@@ -1211,20 +1219,23 @@ def test_loc_reverse_assignment(self):
 
         tm.assert_series_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="can't set int into string")
-    def test_loc_setitem_str_to_small_float_conversion_type(self):
+    def test_loc_setitem_str_to_small_float_conversion_type(self, using_infer_string):
         # GH#20388
 
         col_data = [str(np.random.default_rng(2).random() * 1e-12) for _ in range(5)]
         result = DataFrame(col_data, columns=["A"])
-        expected = DataFrame(col_data, columns=["A"], dtype=object)
+        expected = DataFrame(col_data, columns=["A"])
         tm.assert_frame_equal(result, expected)
 
         # assigning with loc/iloc attempts to set the values inplace, which
         #  in this case is successful
-        result.loc[result.index, "A"] = [float(x) for x in col_data]
-        expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
-        tm.assert_frame_equal(result, expected)
+        if using_infer_string:
+            with pytest.raises(TypeError, match="Scalar must"):
+                result.loc[result.index, "A"] = [float(x) for x in col_data]
+        else:
+            result.loc[result.index, "A"] = [float(x) for x in col_data]
+            expected = DataFrame(col_data, columns=["A"], dtype=float).astype(object)
+            tm.assert_frame_equal(result, expected)
 
         # assigning the entire column using __setitem__ swaps in the new array
         # GH#???
@@ -1389,9 +1400,6 @@ def test_loc_setitem_categorical_values_partial_column_slice(self):
             df.loc[1:2, "a"] = Categorical(["b", "b"], categories=["a", "b"])
             df.loc[2:3, "b"] = Categorical(["b", "b"], categories=["a", "b"])
 
-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-    )
     def test_loc_setitem_single_row_categorical(self, using_infer_string):
         # GH#25495
         df = DataFrame({"Alpha": ["a"], "Numeric": [0]})