From b6f7130eff317b521ebd19cd22ce521fa8dd9048 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sat, 17 Aug 2024 23:01:33 +0200
Subject: [PATCH 1/7] TST (string-dtype): Adjust indexes string tests

---
 pandas/core/config_init.py                    |  2 +-
 pandas/core/indexes/base.py                   |  6 +++-
 .../tests/indexes/base_class/test_setops.py   |  6 ++--
 pandas/tests/indexes/test_base.py             | 12 ++-----
 pandas/tests/indexes/test_old_base.py         | 31 +++++++------------
 5 files changed, 21 insertions(+), 36 deletions(-)

diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index e4eefb570fd95..40dd1e3f0d936 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -873,7 +873,7 @@ def register_converter_cb(key: str) -> None:
 with cf.config_prefix("future"):
     cf.register_option(
         "infer_string",
-        True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False,
+        True,
         "Whether to infer sequence of str objects as pyarrow string "
         "dtype, which will be the default in pandas 3.0 "
         "(at which point this option will be deprecated).",
diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py
index d39c337fbb4b2..b2f1208501337 100644
--- a/pandas/core/indexes/base.py
+++ b/pandas/core/indexes/base.py
@@ -504,7 +504,8 @@ def __new__(
 
         elif is_ea_or_datetimelike_dtype(dtype):
             # non-EA dtype indexes have special casting logic, so we punt here
-            pass
+            if isinstance(data, (set, frozenset)):
+                data = list(data)
 
         elif is_ea_or_datetimelike_dtype(data_dtype):
             pass
@@ -6877,6 +6878,9 @@ def insert(self, loc: int, item) -> Index:
             #  We cannot keep the same dtype, so cast to the (often object)
             #  minimal shared dtype before doing the insert.
             dtype = self._find_common_type_compat(item)
+            if dtype == self.dtype:
+                # EAs might run into recursion errors if loc is invalid
+                raise
             return self.astype(dtype).insert(loc, item)
 
         if arr.dtype != object or not isinstance(
diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py
index f9636ec19f2ec..0e9fb77d6e8dd 100644
--- a/pandas/tests/indexes/base_class/test_setops.py
+++ b/pandas/tests/indexes/base_class/test_setops.py
@@ -3,8 +3,6 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 import pandas as pd
 from pandas import (
     Index,
@@ -233,7 +231,6 @@ def test_tuple_union_bug(self, method, expected, sort):
         expected = Index(expected)
         tm.assert_index_equal(result, expected)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     @pytest.mark.parametrize("first_list", [["b", "a"], []])
     @pytest.mark.parametrize("second_list", [["a", "b"], []])
     @pytest.mark.parametrize(
@@ -243,6 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort):
     def test_union_name_preservation(
         self, first_list, second_list, first_name, second_name, expected_name, sort
     ):
+        expected_dtype = object if not first_list or not second_list else "str"
         first = Index(first_list, name=first_name)
         second = Index(second_list, name=second_name)
         union = first.union(second, sort=sort)
@@ -253,7 +251,7 @@ def test_union_name_preservation(
             expected = Index(sorted(vals), name=expected_name)
             tm.assert_index_equal(union, expected)
         else:
-            expected = Index(vals, name=expected_name)
+            expected = Index(vals, name=expected_name, dtype=expected_dtype)
             tm.assert_index_equal(union.sort_values(), expected.sort_values())
 
     @pytest.mark.parametrize(
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 7ec66100b7291..304fc143a50ea 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -76,16 +76,13 @@ def test_constructor_casting(self, index):
         tm.assert_contains_all(arr, new_index)
         tm.assert_index_equal(index, new_index)
 
-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
-    )
     def test_constructor_copy(self, using_infer_string):
         index = Index(list("abc"), name="name")
         arr = np.array(index)
         new_index = Index(arr, copy=True, name="name")
         assert isinstance(new_index, Index)
         assert new_index.name == "name"
-        if using_infer_string:
+        if using_infer_string and HAS_PYARROW:
             tm.assert_extension_array_equal(
                 new_index.values, pd.array(arr, dtype="str")
             )
@@ -343,11 +340,6 @@ def test_constructor_empty_special(self, empty, klass):
     def test_view_with_args(self, index):
         index.view("i8")
 
-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW,
-        reason="TODO(infer_string)",
-        strict=False,
-    )
     @pytest.mark.parametrize(
         "index",
         [
@@ -364,7 +356,7 @@ def test_view_with_args_object_array_raises(self, index):
             msg = "When changing to a larger dtype"
             with pytest.raises(ValueError, match=msg):
                 index.view("i8")
-        elif index.dtype == "string":
+        elif index.dtype == "str" and not index.dtype.storage == "python":
             with pytest.raises(NotImplementedError, match="i8"):
                 index.view("i8")
         else:
diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py
index 9993a21d93f12..c9088079e3dfa 100644
--- a/pandas/tests/indexes/test_old_base.py
+++ b/pandas/tests/indexes/test_old_base.py
@@ -6,10 +6,7 @@
 import numpy as np
 import pytest
 
-from pandas._config import using_string_dtype
-
 from pandas._libs.tslibs import Timestamp
-from pandas.compat import HAS_PYARROW
 
 from pandas.core.dtypes.common import (
     is_integer_dtype,
@@ -28,6 +25,7 @@
     PeriodIndex,
     RangeIndex,
     Series,
+    StringDtype,
     TimedeltaIndex,
     isna,
     period_range,
@@ -229,7 +227,6 @@ def test_logical_compat(self, simple_index):
             with pytest.raises(TypeError, match=msg):
                 idx.any()
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_repr_roundtrip(self, simple_index):
         if isinstance(simple_index, IntervalIndex):
             pytest.skip(f"Not a valid repr for {type(simple_index).__name__}")
@@ -246,11 +243,6 @@ def test_repr_max_seq_item_setting(self, simple_index):
             repr(idx)
             assert "..." not in str(idx)
 
-    @pytest.mark.xfail(
-        using_string_dtype() and not HAS_PYARROW,
-        reason="TODO(infer_string)",
-        strict=False,
-    )
     @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
     def test_ensure_copied_data(self, index):
         # Check the "copy" argument of each Index.__new__ is honoured
@@ -296,7 +288,9 @@ def test_ensure_copied_data(self, index):
                 tm.assert_numpy_array_equal(
                     index._values._mask, result._values._mask, check_same="same"
                 )
-            elif index.dtype == "string[python]":
+            elif (
+                isinstance(index.dtype, StringDtype) and index.dtype.storage == "python"
+            ):
                 assert np.shares_memory(index._values._ndarray, result._values._ndarray)
                 tm.assert_numpy_array_equal(
                     index._values._ndarray, result._values._ndarray, check_same="same"
@@ -444,11 +438,7 @@ def test_insert_base(self, index):
         result = trimmed.insert(0, index[0])
         assert index[0:4].equals(result)
 
-    @pytest.mark.skipif(
-        using_string_dtype(),
-        reason="completely different behavior, tested elsewher",
-    )
-    def test_insert_out_of_bounds(self, index):
+    def test_insert_out_of_bounds(self, index, using_infer_string):
         # TypeError/IndexError matches what np.insert raises in these cases
 
         if len(index) > 0:
@@ -460,6 +450,10 @@ def test_insert_out_of_bounds(self, index):
             msg = "index (0|0.5) is out of bounds for axis 0 with size 0"
         else:
             msg = "slice indices must be integers or None or have an __index__ method"
+
+        if using_infer_string and (index.dtype == "str" or index.dtype == "category"):  # noqa: PLR1714
+            msg = "loc must be an integer between"
+
         with pytest.raises(err, match=msg):
             index.insert(0.5, "foo")
 
@@ -836,7 +830,6 @@ def test_append_preserves_dtype(self, simple_index):
         alt = index.take(list(range(N)) * 2)
         tm.assert_index_equal(result, alt, check_exact=True)
 
-    @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
     def test_inv(self, simple_index, using_infer_string):
         idx = simple_index
 
@@ -853,10 +846,8 @@ def test_inv(self, simple_index, using_infer_string):
                 err = TypeError
                 msg = "ufunc 'invert' not supported for the input types"
             elif using_infer_string and idx.dtype == "string":
-                import pyarrow as pa
-
-                err = pa.lib.ArrowNotImplementedError
-                msg = "has no kernel"
+                err = TypeError
+                msg = "not supported for string dtypes"
             else:
                 err = TypeError
                 msg = "bad operand"

From 37252ef8e928f4df8f785c710661a67d202665f0 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Sat, 17 Aug 2024 23:01:57 +0200
Subject: [PATCH 2/7] TST (string-dtype): Adjust indexes string tests

---
 pandas/core/config_init.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py
index 40dd1e3f0d936..e4eefb570fd95 100644
--- a/pandas/core/config_init.py
+++ b/pandas/core/config_init.py
@@ -873,7 +873,7 @@ def register_converter_cb(key: str) -> None:
 with cf.config_prefix("future"):
     cf.register_option(
         "infer_string",
-        True,
+        True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False,
         "Whether to infer sequence of str objects as pyarrow string "
         "dtype, which will be the default in pandas 3.0 "
         "(at which point this option will be deprecated).",

From 093fede35baeb371223ae15fcbe7ba947c529ceb Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <61934744+phofl@users.noreply.github.com>
Date: Mon, 19 Aug 2024 08:59:23 +0200
Subject: [PATCH 3/7] Update test_base.py

---
 pandas/tests/indexes/test_base.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 304fc143a50ea..9ee8108b9a9fe 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -357,6 +357,7 @@ def test_view_with_args_object_array_raises(self, index):
             with pytest.raises(ValueError, match=msg):
                 index.view("i8")
         elif index.dtype == "str" and not index.dtype.storage == "python":
+            # TODO(infer_string): Make the errors consistent
             with pytest.raises(NotImplementedError, match="i8"):
                 index.view("i8")
         else:

From fcfefdbb3db1d8ed471ca3af0a35146d687e7009 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler <patrick_hoefler@gmx.net>
Date: Mon, 19 Aug 2024 09:11:11 +0200
Subject: [PATCH 4/7] Update

---
 pandas/core/construction.py       | 6 +++++-
 pandas/tests/indexes/test_base.py | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 665eb75953078..4f8a55e44a101 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -611,7 +611,11 @@ def sanitize_array(
                 dtype = StringDtype(na_value=np.nan)
                 subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype)
 
-            if subarr is data and copy:
+            if (
+                subarr is data
+                or subarr.dtype == "str"
+                and subarr.dtype.storage == "python"
+            ) and copy:
                 subarr = subarr.copy()
 
         else:
diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py
index 9ee8108b9a9fe..486b24845d2ff 100644
--- a/pandas/tests/indexes/test_base.py
+++ b/pandas/tests/indexes/test_base.py
@@ -82,7 +82,7 @@ def test_constructor_copy(self, using_infer_string):
         new_index = Index(arr, copy=True, name="name")
         assert isinstance(new_index, Index)
         assert new_index.name == "name"
-        if using_infer_string and HAS_PYARROW:
+        if using_infer_string:
             tm.assert_extension_array_equal(
                 new_index.values, pd.array(arr, dtype="str")
             )

From d7164726d1bd1b2f10fb04f241f93dcf39dcbb6e Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Thu, 22 Aug 2024 11:45:29 +0200
Subject: [PATCH 5/7] Update pandas/core/construction.py

---
 pandas/core/construction.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 4f8a55e44a101..628190ed557c7 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -613,8 +613,7 @@ def sanitize_array(
 
             if (
                 subarr is data
-                or subarr.dtype == "str"
-                and subarr.dtype.storage == "python"
+                or (subarr.dtype == "str" and subarr.dtype.storage == "python")
             ) and copy:
                 subarr = subarr.copy()
 

From 460e60a832cb1a7d484a1ca0cb827ba6798d8031 Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 9 Sep 2024 10:03:20 +0200
Subject: [PATCH 6/7] add type ignore

---
 pandas/core/construction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/construction.py b/pandas/core/construction.py
index 628190ed557c7..bb3aa3867ab08 100644
--- a/pandas/core/construction.py
+++ b/pandas/core/construction.py
@@ -613,7 +613,7 @@ def sanitize_array(
 
             if (
                 subarr is data
-                or (subarr.dtype == "str" and subarr.dtype.storage == "python")
+                or (subarr.dtype == "str" and subarr.dtype.storage == "python")  # type: ignore[union-attr]
             ) and copy:
                 subarr = subarr.copy()
 

From 551627580e7e2812dc81a68a3bc8586a48984feb Mon Sep 17 00:00:00 2001
From: Joris Van den Bossche <jorisvandenbossche@gmail.com>
Date: Mon, 9 Sep 2024 11:41:42 +0200
Subject: [PATCH 7/7] fix check for string dtype in tests

---
 pandas/tests/indexes/test_old_base.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py
index 7f1e466e0de3d..75284a8f8fd47 100644
--- a/pandas/tests/indexes/test_old_base.py
+++ b/pandas/tests/indexes/test_old_base.py
@@ -451,7 +451,9 @@ def test_insert_out_of_bounds(self, index, using_infer_string):
         else:
             msg = "slice indices must be integers or None or have an __index__ method"
 
-        if using_infer_string and (index.dtype == "str" or index.dtype == "category"):  # noqa: PLR1714
+        if using_infer_string and (
+            index.dtype == "string" or index.dtype == "category"  # noqa: PLR1714
+        ):
             msg = "loc must be an integer between"
 
         with pytest.raises(err, match=msg):