From b6f7130eff317b521ebd19cd22ce521fa8dd9048 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Aug 2024 23:01:33 +0200 Subject: [PATCH 1/7] TST (string-dtype): Adjust indexes string tests --- pandas/core/config_init.py | 2 +- pandas/core/indexes/base.py | 6 +++- .../tests/indexes/base_class/test_setops.py | 6 ++-- pandas/tests/indexes/test_base.py | 12 ++----- pandas/tests/indexes/test_old_base.py | 31 +++++++------------ 5 files changed, 21 insertions(+), 36 deletions(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index e4eefb570fd95..40dd1e3f0d936 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -873,7 +873,7 @@ def register_converter_cb(key: str) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False, + True, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index d39c337fbb4b2..b2f1208501337 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -504,7 +504,8 @@ def __new__( elif is_ea_or_datetimelike_dtype(dtype): # non-EA dtype indexes have special casting logic, so we punt here - pass + if isinstance(data, (set, frozenset)): + data = list(data) elif is_ea_or_datetimelike_dtype(data_dtype): pass @@ -6877,6 +6878,9 @@ def insert(self, loc: int, item) -> Index: # We cannot keep the same dtype, so cast to the (often object) # minimal shared dtype before doing the insert. dtype = self._find_common_type_compat(item) + if dtype == self.dtype: + # EA's might run into recursion errors if loc is invalid + raise return self.astype(dtype).insert(loc, item) if arr.dtype != object or not isinstance( diff --git a/pandas/tests/indexes/base_class/test_setops.py b/pandas/tests/indexes/base_class/test_setops.py index f9636ec19f2ec..0e9fb77d6e8dd 100644 --- a/pandas/tests/indexes/base_class/test_setops.py +++ b/pandas/tests/indexes/base_class/test_setops.py @@ -3,8 +3,6 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - import pandas as pd from pandas import ( Index, @@ -233,7 +231,6 @@ def test_tuple_union_bug(self, method, expected, sort): expected = Index(expected) tm.assert_index_equal(result, expected) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) @pytest.mark.parametrize("first_list", [["b", "a"], []]) @pytest.mark.parametrize("second_list", [["a", "b"], []]) @pytest.mark.parametrize( @@ -243,6 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort): def test_union_name_preservation( self, first_list, second_list, first_name, second_name, expected_name, sort ): + expected_dtype = object if not first_list or not second_list else "str" first = Index(first_list, name=first_name) second = Index(second_list, name=second_name) union = first.union(second, sort=sort) @@ -253,7 +251,7 @@ def test_union_name_preservation( expected = Index(sorted(vals), name=expected_name) tm.assert_index_equal(union, expected) else: - expected = Index(vals, name=expected_name) + expected = Index(vals, name=expected_name, dtype=expected_dtype) tm.assert_index_equal(union.sort_values(), expected.sort_values()) @pytest.mark.parametrize( diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 7ec66100b7291..304fc143a50ea 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -76,16 +76,13 @@ def test_constructor_casting(self, index): tm.assert_contains_all(arr, new_index) tm.assert_index_equal(index, new_index) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)" - ) def test_constructor_copy(self, using_infer_string): index = Index(list("abc"), name="name") arr = np.array(index) new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) assert new_index.name == "name" - if using_infer_string: + if using_infer_string and HAS_PYARROW: tm.assert_extension_array_equal( new_index.values, pd.array(arr, dtype="str") ) @@ -343,11 +340,6 @@ def test_constructor_empty_special(self, empty, klass): def test_view_with_args(self, index): index.view("i8") - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) @pytest.mark.parametrize( "index", [ @@ -364,7 +356,7 @@ def test_view_with_args_object_array_raises(self, index): msg = "When changing to a larger dtype" with pytest.raises(ValueError, match=msg): index.view("i8") - elif index.dtype == "string": + elif index.dtype == "str" and not index.dtype.storage == "python": with pytest.raises(NotImplementedError, match="i8"): index.view("i8") else: diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 9993a21d93f12..c9088079e3dfa 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -6,10 +6,7 @@ import numpy as np import pytest -from pandas._config import using_string_dtype - from pandas._libs.tslibs import Timestamp -from pandas.compat import HAS_PYARROW from pandas.core.dtypes.common import ( is_integer_dtype, @@ -28,6 +25,7 @@ PeriodIndex, RangeIndex, Series, + StringDtype, TimedeltaIndex, isna, period_range, @@ -229,7 +227,6 @@ def test_logical_compat(self, simple_index): with pytest.raises(TypeError, match=msg): idx.any() - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_repr_roundtrip(self, simple_index): if isinstance(simple_index, IntervalIndex): pytest.skip(f"Not a valid repr for {type(simple_index).__name__}") @@ -246,11 +243,6 @@ def test_repr_max_seq_item_setting(self, simple_index): repr(idx) assert "..." not in str(idx) - @pytest.mark.xfail( - using_string_dtype() and not HAS_PYARROW, - reason="TODO(infer_string)", - strict=False, - ) @pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning") def test_ensure_copied_data(self, index): # Check the "copy" argument of each Index.__new__ is honoured @@ -296,7 +288,9 @@ def test_ensure_copied_data(self, index): tm.assert_numpy_array_equal( index._values._mask, result._values._mask, check_same="same" ) - elif index.dtype == "string[python]": + elif ( + isinstance(index.dtype, StringDtype) and index.dtype.storage == "python" + ): assert np.shares_memory(index._values._ndarray, result._values._ndarray) tm.assert_numpy_array_equal( index._values._ndarray, result._values._ndarray, check_same="same" @@ -444,11 +438,7 @@ def test_insert_base(self, index): result = trimmed.insert(0, index[0]) assert index[0:4].equals(result) - @pytest.mark.skipif( - using_string_dtype(), - reason="completely different behavior, tested elsewher", - ) - def test_insert_out_of_bounds(self, index): + def test_insert_out_of_bounds(self, index, using_infer_string): # TypeError/IndexError matches what np.insert raises in these cases if len(index) > 0: @@ -460,6 +450,10 @@ def test_insert_out_of_bounds(self, index): msg = "index (0|0.5) is out of bounds for axis 0 with size 0" else: msg = "slice indices must be integers or None or have an __index__ method" + + if using_infer_string and (index.dtype == "str" or index.dtype == "category"): # noqa: PLR1714 + msg = "loc must be an integer between" + with pytest.raises(err, match=msg): index.insert(0.5, "foo") @@ -836,7 +830,6 @@ def test_append_preserves_dtype(self, simple_index): alt = index.take(list(range(N)) * 2) tm.assert_index_equal(result, alt, check_exact=True) - @pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False) def test_inv(self, simple_index, using_infer_string): idx = simple_index @@ -853,10 +846,8 @@ def test_inv(self, simple_index, using_infer_string): err = TypeError msg = "ufunc 'invert' not supported for the input types" elif using_infer_string and idx.dtype == "string": - import pyarrow as pa - - err = pa.lib.ArrowNotImplementedError - msg = "has no kernel" + err = TypeError + msg = "not supported for string dtypes" else: err = TypeError msg = "bad operand" From 37252ef8e928f4df8f785c710661a67d202665f0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sat, 17 Aug 2024 23:01:57 +0200 Subject: [PATCH 2/7] TST (string-dtype): Adjust indexes string tests --- pandas/core/config_init.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 40dd1e3f0d936..e4eefb570fd95 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -873,7 +873,7 @@ def register_converter_cb(key: str) -> None: with cf.config_prefix("future"): cf.register_option( "infer_string", - True, + True if os.environ.get("PANDAS_FUTURE_INFER_STRING", "0") == "1" else False, "Whether to infer sequence of str objects as pyarrow string " "dtype, which will be the default in pandas 3.0 " "(at which point this option will be deprecated).", From 093fede35baeb371223ae15fcbe7ba947c529ceb Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Mon, 19 Aug 2024 08:59:23 +0200 Subject: [PATCH 3/7] Update test_base.py --- pandas/tests/indexes/test_base.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 304fc143a50ea..9ee8108b9a9fe 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -357,6 +357,7 @@ def test_view_with_args_object_array_raises(self, index): with pytest.raises(ValueError, match=msg): index.view("i8") elif index.dtype == "str" and not index.dtype.storage == "python": + # TODO(infer_string): Make the errors consistent with pytest.raises(NotImplementedError, match="i8"): index.view("i8") else: From fcfefdbb3db1d8ed471ca3af0a35146d687e7009 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Mon, 19 Aug 2024 09:11:11 +0200 Subject: [PATCH 4/7] Update --- pandas/core/construction.py | 6 +++++- pandas/tests/indexes/test_base.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 665eb75953078..4f8a55e44a101 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -611,7 +611,11 @@ def sanitize_array( dtype = StringDtype(na_value=np.nan) subarr = dtype.construct_array_type()._from_sequence(data, dtype=dtype) - if subarr is data and copy: + if ( + subarr is data + or subarr.dtype == "str" + and subarr.dtype.storage == "python" + ) and copy: subarr = subarr.copy() else: diff --git a/pandas/tests/indexes/test_base.py b/pandas/tests/indexes/test_base.py index 9ee8108b9a9fe..486b24845d2ff 100644 --- a/pandas/tests/indexes/test_base.py +++ b/pandas/tests/indexes/test_base.py @@ -82,7 +82,7 @@ def test_constructor_copy(self, using_infer_string): new_index = Index(arr, copy=True, name="name") assert isinstance(new_index, Index) assert new_index.name == "name" - if using_infer_string and HAS_PYARROW: + if using_infer_string: tm.assert_extension_array_equal( new_index.values, pd.array(arr, dtype="str") ) From d7164726d1bd1b2f10fb04f241f93dcf39dcbb6e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Aug 2024 11:45:29 +0200 Subject: [PATCH 5/7] Update pandas/core/construction.py --- pandas/core/construction.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 4f8a55e44a101..628190ed557c7 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -613,8 +613,7 @@ def sanitize_array( if ( subarr is data - or subarr.dtype == "str" - and subarr.dtype.storage == "python" + or (subarr.dtype == "str" and subarr.dtype.storage == "python") ) and copy: subarr = subarr.copy() From 460e60a832cb1a7d484a1ca0cb827ba6798d8031 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 10:03:20 +0200 Subject: [PATCH 6/7] add type ignore --- pandas/core/construction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/construction.py b/pandas/core/construction.py index 628190ed557c7..bb3aa3867ab08 100644 --- a/pandas/core/construction.py +++ b/pandas/core/construction.py @@ -613,7 +613,7 @@ def sanitize_array( if ( subarr is data - or (subarr.dtype == "str" and subarr.dtype.storage == "python") + or (subarr.dtype == "str" and subarr.dtype.storage == "python") # type: ignore[union-attr] ) and copy: subarr = subarr.copy() From 551627580e7e2812dc81a68a3bc8586a48984feb Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 9 Sep 2024 11:41:42 +0200 Subject: [PATCH 7/7] fix check for string dtype in tests --- pandas/tests/indexes/test_old_base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/indexes/test_old_base.py b/pandas/tests/indexes/test_old_base.py index 7f1e466e0de3d..75284a8f8fd47 100644 --- a/pandas/tests/indexes/test_old_base.py +++ b/pandas/tests/indexes/test_old_base.py @@ -451,7 +451,9 @@ def test_insert_out_of_bounds(self, index, using_infer_string): else: msg = "slice indices must be integers or None or have an __index__ method" - if using_infer_string and (index.dtype == "str" or index.dtype == "category"): # noqa: PLR1714 + if using_infer_string and ( + index.dtype == "string" or index.dtype == "category" # noqa: PLR1714 + ): msg = "loc must be an integer between" with pytest.raises(err, match=msg):