Skip to content

TST/BUG (string dtype): Fix and adjust indexes string tests #59544

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Sep 9, 2024
6 changes: 5 additions & 1 deletion pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -504,7 +504,8 @@ def __new__(

elif is_ea_or_datetimelike_dtype(dtype):
# non-EA dtype indexes have special casting logic, so we punt here
pass
if isinstance(data, (set, frozenset)):
data = list(data)
Comment on lines 506 to +508
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is related to what you mentioned on slack, I assume?

idx = pd.Index({1, 2}, dtype="Int64")
idx = pd.Index({1, 2}, dtype="int64")

Whether we consider a set as sequence as valid input for constructors or not (IMO it would also be fine to not do that more generally)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, correct. It#s explicitly allowed for Index, so we shouldn't break this now

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see, for Series(..) we disallow it explicitly, but for Index(..) it currently works for non-EA dtypes. Sounds good to allow that for now.


elif is_ea_or_datetimelike_dtype(data_dtype):
pass
Expand Down Expand Up @@ -6877,6 +6878,9 @@ def insert(self, loc: int, item) -> Index:
# We cannot keep the same dtype, so cast to the (often object)
# minimal shared dtype before doing the insert.
dtype = self._find_common_type_compat(item)
if dtype == self.dtype:
# EA's might run into recursion errors if loc is invalid
raise
return self.astype(dtype).insert(loc, item)

if arr.dtype != object or not isinstance(
Expand Down
6 changes: 2 additions & 4 deletions pandas/tests/indexes/base_class/test_setops.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,6 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

import pandas as pd
from pandas import (
Index,
Expand Down Expand Up @@ -233,7 +231,6 @@ def test_tuple_union_bug(self, method, expected, sort):
expected = Index(expected)
tm.assert_index_equal(result, expected)

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
@pytest.mark.parametrize("first_list", [["b", "a"], []])
@pytest.mark.parametrize("second_list", [["a", "b"], []])
@pytest.mark.parametrize(
Expand All @@ -243,6 +240,7 @@ def test_tuple_union_bug(self, method, expected, sort):
def test_union_name_preservation(
self, first_list, second_list, first_name, second_name, expected_name, sort
):
expected_dtype = object if not first_list or not second_list else "str"
first = Index(first_list, name=first_name)
second = Index(second_list, name=second_name)
union = first.union(second, sort=sort)
Expand All @@ -253,7 +251,7 @@ def test_union_name_preservation(
expected = Index(sorted(vals), name=expected_name)
tm.assert_index_equal(union, expected)
else:
expected = Index(vals, name=expected_name)
expected = Index(vals, name=expected_name, dtype=expected_dtype)
tm.assert_index_equal(union.sort_values(), expected.sort_values())

@pytest.mark.parametrize(
Expand Down
12 changes: 2 additions & 10 deletions pandas/tests/indexes/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -76,16 +76,13 @@ def test_constructor_casting(self, index):
tm.assert_contains_all(arr, new_index)
tm.assert_index_equal(index, new_index)

@pytest.mark.xfail(
using_string_dtype() and not HAS_PYARROW, reason="TODO(infer_string)"
)
def test_constructor_copy(self, using_infer_string):
index = Index(list("abc"), name="name")
arr = np.array(index)
new_index = Index(arr, copy=True, name="name")
assert isinstance(new_index, Index)
assert new_index.name == "name"
if using_infer_string:
if using_infer_string and HAS_PYARROW:
tm.assert_extension_array_equal(
new_index.values, pd.array(arr, dtype="str")
)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This one I don't understand. Why is index.values a numpy array in case of "python" storage?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I thought I rolled that one back, the change was just something I tried.

Updated now

Expand Down Expand Up @@ -343,11 +340,6 @@ def test_constructor_empty_special(self, empty, klass):
def test_view_with_args(self, index):
index.view("i8")

@pytest.mark.xfail(
using_string_dtype() and not HAS_PYARROW,
reason="TODO(infer_string)",
strict=False,
)
@pytest.mark.parametrize(
"index",
[
Expand All @@ -364,7 +356,7 @@ def test_view_with_args_object_array_raises(self, index):
msg = "When changing to a larger dtype"
with pytest.raises(ValueError, match=msg):
index.view("i8")
elif index.dtype == "string":
elif index.dtype == "str" and not index.dtype.storage == "python":
with pytest.raises(NotImplementedError, match="i8"):
index.view("i8")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is just testing a different error, right? (just to understand why this has a separate branch)

(ideally we should also make it a TypeError, I assume, maybe can add a TODO note)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added a TODO

else:
Expand Down
31 changes: 11 additions & 20 deletions pandas/tests/indexes/test_old_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@
import numpy as np
import pytest

from pandas._config import using_string_dtype

from pandas._libs.tslibs import Timestamp
from pandas.compat import HAS_PYARROW

from pandas.core.dtypes.common import (
is_integer_dtype,
Expand All @@ -28,6 +25,7 @@
PeriodIndex,
RangeIndex,
Series,
StringDtype,
TimedeltaIndex,
isna,
period_range,
Expand Down Expand Up @@ -229,7 +227,6 @@ def test_logical_compat(self, simple_index):
with pytest.raises(TypeError, match=msg):
idx.any()

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_repr_roundtrip(self, simple_index):
if isinstance(simple_index, IntervalIndex):
pytest.skip(f"Not a valid repr for {type(simple_index).__name__}")
Expand All @@ -246,11 +243,6 @@ def test_repr_max_seq_item_setting(self, simple_index):
repr(idx)
assert "..." not in str(idx)

@pytest.mark.xfail(
using_string_dtype() and not HAS_PYARROW,
reason="TODO(infer_string)",
strict=False,
)
@pytest.mark.filterwarnings(r"ignore:PeriodDtype\[B\] is deprecated:FutureWarning")
def test_ensure_copied_data(self, index):
# Check the "copy" argument of each Index.__new__ is honoured
Expand Down Expand Up @@ -296,7 +288,9 @@ def test_ensure_copied_data(self, index):
tm.assert_numpy_array_equal(
index._values._mask, result._values._mask, check_same="same"
)
elif index.dtype == "string[python]":
elif (
isinstance(index.dtype, StringDtype) and index.dtype.storage == "python"
):
assert np.shares_memory(index._values._ndarray, result._values._ndarray)
tm.assert_numpy_array_equal(
index._values._ndarray, result._values._ndarray, check_same="same"
Expand Down Expand Up @@ -444,11 +438,7 @@ def test_insert_base(self, index):
result = trimmed.insert(0, index[0])
assert index[0:4].equals(result)

@pytest.mark.skipif(
using_string_dtype(),
reason="completely different behavior, tested elsewher",
)
def test_insert_out_of_bounds(self, index):
def test_insert_out_of_bounds(self, index, using_infer_string):
# TypeError/IndexError matches what np.insert raises in these cases

if len(index) > 0:
Expand All @@ -460,6 +450,10 @@ def test_insert_out_of_bounds(self, index):
msg = "index (0|0.5) is out of bounds for axis 0 with size 0"
else:
msg = "slice indices must be integers or None or have an __index__ method"

if using_infer_string and (index.dtype == "str" or index.dtype == "category"): # noqa: PLR1714
msg = "loc must be an integer between"

with pytest.raises(err, match=msg):
index.insert(0.5, "foo")

Expand Down Expand Up @@ -836,7 +830,6 @@ def test_append_preserves_dtype(self, simple_index):
alt = index.take(list(range(N)) * 2)
tm.assert_index_equal(result, alt, check_exact=True)

@pytest.mark.xfail(using_string_dtype(), reason="TODO(infer_string)", strict=False)
def test_inv(self, simple_index, using_infer_string):
idx = simple_index

Expand All @@ -853,10 +846,8 @@ def test_inv(self, simple_index, using_infer_string):
err = TypeError
msg = "ufunc 'invert' not supported for the input types"
elif using_infer_string and idx.dtype == "string":
import pyarrow as pa

err = pa.lib.ArrowNotImplementedError
msg = "has no kernel"
err = TypeError
msg = "not supported for string dtypes"
else:
err = TypeError
msg = "bad operand"
Expand Down
Loading