pandas-dev · mroeschke · Nov 27, 2023 · Nov 5, 2023 · Nov 5, 2023 · Nov 5, 2023
diff --git a/pandas/tests/indexing/multiindex/test_loc.py b/pandas/tests/indexing/multiindex/test_loc.py
@@ -566,7 +566,7 @@ def test_loc_setitem_single_column_slice():
     tm.assert_frame_equal(df, expected)
 
 
-def test_loc_nan_multiindex():
+def test_loc_nan_multiindex(using_infer_string):
     # GH 5286
     tups = [
         ("Good Things", "C", np.nan),
@@ -586,8 +586,12 @@ def test_loc_nan_multiindex():
     result = df.loc["Good Things"].loc["C"]
     expected = DataFrame(
         np.ones((1, 4)),
-        index=Index([np.nan], dtype="object", name="u3"),
-        columns=Index(["d1", "d2", "d3", "d4"], dtype="object"),
+        index=Index(
+            [np.nan],
+            dtype="object" if not using_infer_string else "string[pyarrow_numpy]",
+            name="u3",
+        ),
+        columns=Index(["d1", "d2", "d3", "d4"]),
     )
     tm.assert_frame_equal(result, expected)
 

diff --git a/pandas/tests/indexing/test_at.py b/pandas/tests/indexing/test_at.py
@@ -13,6 +13,7 @@
     CategoricalIndex,
     DataFrame,
     DatetimeIndex,
+    Index,
     MultiIndex,
     Series,
     Timestamp,
@@ -70,7 +71,11 @@ def test_at_setitem_item_cache_cleared(self):
         df.at[0, "x"] = 4
         df.at[0, "cost"] = 789
 
-        expected = DataFrame({"x": [4], "cost": 789}, index=[0])
+        expected = DataFrame(
+            {"x": [4], "cost": 789},
+            index=[0],
+            columns=Index(["x", "cost"], dtype=object),
+        )
         tm.assert_frame_equal(df, expected)
 
         # And in particular, check that the _item_cache has updated correctly.

diff --git a/pandas/tests/indexing/test_categorical.py b/pandas/tests/indexing/test_categorical.py
@@ -273,7 +273,7 @@ def test_slicing_doc_examples(self):
         tm.assert_frame_equal(result, expected)
 
         result = df.iloc[2:4, :].dtypes
-        expected = Series(["category", "int64"], ["cats", "values"])
+        expected = Series(["category", "int64"], ["cats", "values"], dtype=object)
         tm.assert_series_equal(result, expected)
 
         result = df.loc["h":"j", "cats"]

diff --git a/pandas/tests/indexing/test_chaining_and_caching.py b/pandas/tests/indexing/test_chaining_and_caching.py
@@ -339,7 +339,9 @@ def test_detect_chained_assignment_object_dtype(
         self, using_array_manager, using_copy_on_write, warn_copy_on_write
     ):
         expected = DataFrame({"A": [111, "bbb", "ccc"], "B": [1, 2, 3]})
-        df = DataFrame({"A": ["aaa", "bbb", "ccc"], "B": [1, 2, 3]})
+        df = DataFrame(
+            {"A": Series(["aaa", "bbb", "ccc"], dtype=object), "B": [1, 2, 3]}
+        )
         df_original = df.copy()
 
         if not using_copy_on_write and not warn_copy_on_write:

diff --git a/pandas/tests/indexing/test_coercion.py b/pandas/tests/indexing/test_coercion.py
@@ -9,6 +9,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.compat import (
     IS64,
     is_platform_windows,
@@ -111,7 +113,7 @@ def _assert_setitem_index_conversion(
         "val,exp_dtype", [("x", object), (5, IndexError), (1.1, object)]
     )
     def test_setitem_index_object(self, val, exp_dtype):
-        obj = pd.Series([1, 2, 3, 4], index=list("abcd"))
+        obj = pd.Series([1, 2, 3, 4], index=pd.Index(list("abcd"), dtype=object))
         assert obj.index.dtype == object
 
         if exp_dtype is IndexError:
@@ -122,7 +124,7 @@ def test_setitem_index_object(self, val, exp_dtype):
                 with tm.assert_produces_warning(FutureWarning, match=warn_msg):
                     temp[5] = 5
         else:
-            exp_index = pd.Index(list("abcd") + [val])
+            exp_index = pd.Index(list("abcd") + [val], dtype=object)
             self._assert_setitem_index_conversion(obj, val, exp_index, exp_dtype)
 
     @pytest.mark.parametrize(
@@ -195,10 +197,10 @@ def _assert_insert_conversion(self, original, value, expected, expected_dtype):
         ],
     )
     def test_insert_index_object(self, insert, coerced_val, coerced_dtype):
-        obj = pd.Index(list("abcd"))
+        obj = pd.Index(list("abcd"), dtype=object)
         assert obj.dtype == object
 
-        exp = pd.Index(["a", coerced_val, "b", "c", "d"])
+        exp = pd.Index(["a", coerced_val, "b", "c", "d"], dtype=object)
         self._assert_insert_conversion(obj, insert, exp, coerced_dtype)
 
     @pytest.mark.parametrize(
@@ -397,7 +399,7 @@ def _run_test(self, obj, fill_val, klass, exp_dtype):
     )
     def test_where_object(self, index_or_series, fill_val, exp_dtype):
         klass = index_or_series
-        obj = klass(list("abcd"))
+        obj = klass(list("abcd"), dtype=object)
         assert obj.dtype == object
         self._run_test(obj, fill_val, klass, exp_dtype)
 
@@ -559,10 +561,10 @@ def _assert_fillna_conversion(self, original, value, expected, expected_dtype):
     )
     def test_fillna_object(self, index_or_series, fill_val, fill_dtype):
         klass = index_or_series
-        obj = klass(["a", np.nan, "c", "d"])
+        obj = klass(["a", np.nan, "c", "d"], dtype=object)
         assert obj.dtype == object
 
-        exp = klass(["a", fill_val, "c", "d"])
+        exp = klass(["a", fill_val, "c", "d"], dtype=object)
         self._assert_fillna_conversion(obj, fill_val, exp, fill_dtype)
 
     @pytest.mark.parametrize(
@@ -824,6 +826,8 @@ def replacer(self, how, from_key, to_key):
             raise ValueError
         return replacer
 
+    # Expected needs adjustment for the infer string option, seems to work as expected
+    @pytest.mark.skipif(using_pyarrow_string_dtype(), reason="TODO: test is to complex")
     def test_replace_series(self, how, to_key, from_key, replacer):
         index = pd.Index([3, 4], name="xxx")
         obj = pd.Series(self.rep[from_key], index=index, name="yyy")
@@ -870,13 +874,18 @@ def test_replace_series(self, how, to_key, from_key, replacer):
     @pytest.mark.parametrize(
         "from_key", ["datetime64[ns, UTC]", "datetime64[ns, US/Eastern]"], indirect=True
     )
-    def test_replace_series_datetime_tz(self, how, to_key, from_key, replacer):
+    def test_replace_series_datetime_tz(
+        self, how, to_key, from_key, replacer, using_infer_string
+    ):
         index = pd.Index([3, 4], name="xyz")
         obj = pd.Series(self.rep[from_key], index=index, name="yyy")
         assert obj.dtype == from_key
 
         exp = pd.Series(self.rep[to_key], index=index, name="yyy")
-        assert exp.dtype == to_key
+        if using_infer_string and to_key == "object":
+            assert exp.dtype == "string"
+        else:
+            assert exp.dtype == to_key
 
         msg = "Downcasting behavior in `replace`"
         warn = FutureWarning if exp.dtype != object else None

diff --git a/pandas/tests/indexing/test_iloc.py b/pandas/tests/indexing/test_iloc.py
@@ -100,9 +100,8 @@ def test_iloc_setitem_fullcol_categorical(self, indexer, key, using_array_manage
         #  we retain the object dtype.
         frame = DataFrame({0: np.array([0, 1, 2], dtype=object), 1: range(3)})
         df = frame.copy()
-        orig_vals = df.values
         indexer(df)[key, 0] = cat
-        expected = DataFrame({0: cat.astype(object), 1: range(3)})
+        expected = DataFrame({0: Series(cat.astype(object), dtype=object), 1: range(3)})
         tm.assert_frame_equal(df, expected)
 
     @pytest.mark.parametrize("box", [array, Series])
@@ -232,7 +231,10 @@ def test_iloc_exceeds_bounds(self):
         dfl = DataFrame(
             np.random.default_rng(2).standard_normal((5, 2)), columns=list("AB")
         )
-        tm.assert_frame_equal(dfl.iloc[:, 2:3], DataFrame(index=dfl.index, columns=[]))
+        tm.assert_frame_equal(
+            dfl.iloc[:, 2:3],
+            DataFrame(index=dfl.index, columns=Index([], dtype=dfl.columns.dtype)),
+        )
         tm.assert_frame_equal(dfl.iloc[:, 1:3], dfl.iloc[:, [1]])
         tm.assert_frame_equal(dfl.iloc[4:6], dfl.iloc[[4]])
 
@@ -451,12 +453,16 @@ def test_iloc_setitem(self):
     def test_iloc_setitem_axis_argument(self):
         # GH45032
         df = DataFrame([[6, "c", 10], [7, "d", 11], [8, "e", 12]])
+        df[1] = df[1].astype(object)
         expected = DataFrame([[6, "c", 10], [7, "d", 11], [5, 5, 5]])
+        expected[1] = expected[1].astype(object)
         df.iloc(axis=0)[2] = 5
         tm.assert_frame_equal(df, expected)
 
         df = DataFrame([[6, "c", 10], [7, "d", 11], [8, "e", 12]])
+        df[1] = df[1].astype(object)
         expected = DataFrame([[6, "c", 5], [7, "d", 5], [8, "e", 5]])
+        expected[1] = expected[1].astype(object)
         df.iloc(axis=1)[2] = 5
         tm.assert_frame_equal(df, expected)
 
@@ -615,7 +621,7 @@ def test_iloc_getitem_labelled_frame(self):
         assert result == exp
 
         # out-of-bounds exception
-        msg = "index 5 is out of bounds for axis 0 with size 4"
+        msg = "index 5 is out of bounds for axis 0 with size 4|index out of bounds"
         with pytest.raises(IndexError, match=msg):
             df.iloc[10, 5]
 
@@ -1313,7 +1319,9 @@ def test_iloc_setitem_dtypes_duplicate_columns(
         self, dtypes, init_value, expected_value
     ):
         # GH#22035
-        df = DataFrame([[init_value, "str", "str2"]], columns=["a", "b", "b"])
+        df = DataFrame(
+            [[init_value, "str", "str2"]], columns=["a", "b", "b"], dtype=object
+        )
 
         # with the enforcement of GH#45333 in 2.0, this sets values inplace,
         #  so we retain object dtype
@@ -1360,7 +1368,10 @@ def test_frame_iloc_getitem_callable(self):
 
     def test_frame_iloc_setitem_callable(self):
         # GH#11485
-        df = DataFrame({"X": [1, 2, 3, 4], "Y": list("aabb")}, index=list("ABCD"))
+        df = DataFrame(
+            {"X": [1, 2, 3, 4], "Y": Series(list("aabb"), dtype=object)},
+            index=list("ABCD"),
+        )
 
         # return location
         res = df.copy()

diff --git a/pandas/tests/indexing/test_indexing.py b/pandas/tests/indexing/test_indexing.py
@@ -8,6 +8,8 @@
 import numpy as np
 import pytest
 
+from pandas._config import using_pyarrow_string_dtype
+
 from pandas.errors import IndexingError
 
 from pandas.core.dtypes.common import (
@@ -189,7 +191,7 @@ def test_setitem_dtype_upcast(self):
         ):
             df.loc[0, "c"] = "foo"
         expected = DataFrame(
-            [{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}]
+            {"a": [1, 3], "b": [np.nan, 2], "c": Series(["foo", np.nan], dtype=object)}
         )
         tm.assert_frame_equal(df, expected)
 
@@ -284,18 +286,27 @@ def test_dups_fancy_indexing_not_in_order(self):
         with pytest.raises(KeyError, match="not in index"):
             df.loc[rows]
 
-    def test_dups_fancy_indexing_only_missing_label(self):
+    def test_dups_fancy_indexing_only_missing_label(self, using_infer_string):
         # List containing only missing label
         dfnu = DataFrame(
             np.random.default_rng(2).standard_normal((5, 3)), index=list("AABCD")
         )
-        with pytest.raises(
-            KeyError,
-            match=re.escape(
-                "\"None of [Index(['E'], dtype='object')] are in the [index]\""
-            ),
-        ):
-            dfnu.loc[["E"]]
+        if using_infer_string:
+            with pytest.raises(
+                KeyError,
+                match=re.escape(
+                    "\"None of [Index(['E'], dtype='string')] are in the [index]\""
+                ),
+            ):
+                dfnu.loc[["E"]]
+        else:
+            with pytest.raises(
+                KeyError,
+                match=re.escape(
+                    "\"None of [Index(['E'], dtype='object')] are in the [index]\""
+                ),
+            ):
+                dfnu.loc[["E"]]
 
     @pytest.mark.parametrize("vals", [[0, 1, 2], list("abc")])
     def test_dups_fancy_indexing_missing_label(self, vals):
@@ -451,6 +462,9 @@ def test_set_index_nan(self):
         )
         tm.assert_frame_equal(result, df)
 
+    @pytest.mark.xfail(
+        using_pyarrow_string_dtype(), reason="can't multiply arrow strings"
+    )
     def test_multi_assign(self):
         # GH 3626, an assignment of a sub-df to a df
         # set float64 to avoid upcast when setting nan
@@ -553,7 +567,7 @@ def test_string_slice_empty(self):
         with pytest.raises(KeyError, match="^0$"):
             df.loc["2011", 0]
 
-    def test_astype_assignment(self):
+    def test_astype_assignment(self, using_infer_string):
         # GH4312 (iloc)
         df_orig = DataFrame(
             [["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
@@ -567,8 +581,9 @@ def test_astype_assignment(self):
         expected = DataFrame(
             [[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
         )
-        expected["A"] = expected["A"].astype(object)
-        expected["B"] = expected["B"].astype(object)
+        if not using_infer_string:
+            expected["A"] = expected["A"].astype(object)
+            expected["B"] = expected["B"].astype(object)
         tm.assert_frame_equal(df, expected)
 
         # GH5702 (loc)
@@ -577,16 +592,18 @@ def test_astype_assignment(self):
         expected = DataFrame(
             [[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
         )
-        expected["A"] = expected["A"].astype(object)
+        if not using_infer_string:
+            expected["A"] = expected["A"].astype(object)
         tm.assert_frame_equal(df, expected)
 
         df = df_orig.copy()
         df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
         expected = DataFrame(
             [["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
         )
-        expected["B"] = expected["B"].astype(object)
-        expected["C"] = expected["C"].astype(object)
+        if not using_infer_string:
+            expected["B"] = expected["B"].astype(object)
+            expected["C"] = expected["C"].astype(object)
         tm.assert_frame_equal(df, expected)
 
     def test_astype_assignment_full_replacements(self):
@@ -673,6 +690,7 @@ def test_loc_setitem_fullindex_views(self):
         df.loc[df.index] = df.loc[df.index]
         tm.assert_frame_equal(df, df2)
 
+    @pytest.mark.xfail(using_pyarrow_string_dtype(), reason="can't set int into string")
     def test_rhs_alignment(self):
         # GH8258, tests that both rows & columns are aligned to what is
         # assigned to. covers both uniform data-type & multi-type cases