diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py
index bed64efc690ec..0e413f81834b2 100644
--- a/pandas/core/arrays/sparse/array.py
+++ b/pandas/core/arrays/sparse/array.py
@@ -56,6 +56,7 @@
     is_datetime64tz_dtype,
     is_dtype_equal,
     is_integer,
+    is_list_like,
     is_object_dtype,
     is_scalar,
     is_string_dtype,
@@ -927,6 +928,14 @@ def __getitem__(
                 indices = np.arange(len(self), dtype=np.int32)[key]
                 return self.take(indices)
 
+        elif not is_list_like(key):
+            # e.g. "foo" or 2.5
+            # exception message copied from numpy
+            raise IndexError(
+                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
+                r"(`None`) and integer or boolean arrays are valid indices"
+            )
+
         else:
             # TODO: I think we can avoid densifying when masking a
             # boolean SparseArray with another. Need to look at the
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index 4e3bd05d2cc8d..58b4a0c9f9242 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -322,6 +322,13 @@ def __getitem__(
                 elif item[1] is Ellipsis:
                     item = item[0]
 
+        if is_scalar(item) and not is_integer(item):
+            # e.g. "foo" or 2.5
+            # exception message copied from numpy
+            raise IndexError(
+                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
+                r"(`None`) and integer or boolean arrays are valid indices"
+            )
         # We are not an array indexer, so maybe e.g. a slice or integer
         # indexer. We dispatch to pyarrow.
         value = self._data[item]
@@ -392,6 +399,11 @@ def _cmp_method(self, other, op):
         # TODO(ARROW-9429): Add a .to_numpy() to ChunkedArray
         return BooleanArray._from_sequence(result.to_pandas().values)
 
+    def insert(self, loc: int, item):
+        if not isinstance(item, str) and item is not libmissing.NA:
+            raise TypeError("Scalar must be NA or str")
+        return super().insert(loc, item)
+
     def __setitem__(self, key: int | slice | np.ndarray, value: Any) -> None:
         """Set one or more values inplace.
 
diff --git a/pandas/tests/extension/base/getitem.py b/pandas/tests/extension/base/getitem.py
index 7efd3bdb6920a..73bff29305f20 100644
--- a/pandas/tests/extension/base/getitem.py
+++ b/pandas/tests/extension/base/getitem.py
@@ -120,6 +120,33 @@ def test_getitem_scalar(self, data):
         result = pd.Series(data)[0]
         assert isinstance(result, data.dtype.type)
 
+    def test_getitem_invalid(self, data):
+        # TODO: box over scalar, [scalar], (scalar,)?
+
+        msg = (
+            r"only integers, slices \(`:`\), ellipsis \(`...`\), numpy.newaxis "
+            r"\(`None`\) and integer or boolean arrays are valid indices"
+        )
+        with pytest.raises(IndexError, match=msg):
+            data["foo"]
+        with pytest.raises(IndexError, match=msg):
+            data[2.5]
+
+        ub = len(data)
+        msg = "|".join(
+            [
+                "list index out of range",  # json
+                "index out of bounds",  # pyarrow
+                "Out of bounds access",  # Sparse
+                f"index {ub+1} is out of bounds for axis 0 with size {ub}",
+                f"index -{ub+1} is out of bounds for axis 0 with size {ub}",
+            ]
+        )
+        with pytest.raises(IndexError, match=msg):
+            data[ub + 1]
+        with pytest.raises(IndexError, match=msg):
+            data[-ub - 1]
+
     def test_getitem_scalar_na(self, data_missing, na_cmp, na_value):
         result = data_missing[0]
         assert na_cmp(result, na_value)
diff --git a/pandas/tests/extension/base/setitem.py b/pandas/tests/extension/base/setitem.py
index 0392ea794237c..a2d100db81a2c 100644
--- a/pandas/tests/extension/base/setitem.py
+++ b/pandas/tests/extension/base/setitem.py
@@ -367,3 +367,11 @@ def test_delitem_series(self, data):
         expected = ser[taker]
         del ser[1]
         self.assert_series_equal(ser, expected)
+
+    def test_setitem_invalid(self, data, invalid_scalar):
+        msg = ""  # messages vary by subclass, so we do not test it
+        with pytest.raises((ValueError, TypeError), match=msg):
+            data[0] = invalid_scalar
+
+        with pytest.raises((ValueError, TypeError), match=msg):
+            data[:] = invalid_scalar
diff --git a/pandas/tests/extension/json/array.py b/pandas/tests/extension/json/array.py
index 2eef828288e59..309d865bc7452 100644
--- a/pandas/tests/extension/json/array.py
+++ b/pandas/tests/extension/json/array.py
@@ -32,7 +32,10 @@
 from pandas._typing import type_t
 
 from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
-from pandas.core.dtypes.common import pandas_dtype
+from pandas.core.dtypes.common import (
+    is_list_like,
+    pandas_dtype,
+)
 
 import pandas as pd
 from pandas.api.extensions import (
@@ -103,6 +106,13 @@ def __getitem__(self, item):
         elif isinstance(item, slice):
             # slice
             return type(self)(self.data[item])
+        elif not is_list_like(item):
+            # e.g. "foo" or 2.5
+            # exception message copied from numpy
+            raise IndexError(
+                r"only integers, slices (`:`), ellipsis (`...`), numpy.newaxis "
+                r"(`None`) and integer or boolean arrays are valid indices"
+            )
         else:
             item = pd.api.indexers.check_array_indexer(self, item)
             if is_bool_dtype(item.dtype):
diff --git a/pandas/tests/extension/test_numpy.py b/pandas/tests/extension/test_numpy.py
index 0e3e26e7e9500..e60f7769270bd 100644
--- a/pandas/tests/extension/test_numpy.py
+++ b/pandas/tests/extension/test_numpy.py
@@ -363,6 +363,11 @@ def test_concat(self, data, in_frame):
 
 
 class TestSetitem(BaseNumPyTests, base.BaseSetitemTests):
+    @skip_nested
+    def test_setitem_invalid(self, data, invalid_scalar):
+        # object dtype can hold anything, so doesn't raise
+        super().test_setitem_invalid(data, invalid_scalar)
+
     @skip_nested
     def test_setitem_sequence_broadcasts(self, data, box_in_series):
         # ValueError: cannot set using a list-like indexer with a different
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
index 06b07968f949e..af86c359c4c00 100644
--- a/pandas/tests/extension/test_string.py
+++ b/pandas/tests/extension/test_string.py
@@ -160,13 +160,6 @@ def test_value_counts(self, all_data, dropna):
     def test_value_counts_with_normalize(self, data):
         pass
 
-    def test_insert_invalid(self, data, invalid_scalar, request):
-        if data.dtype.storage == "pyarrow":
-            mark = pytest.mark.xfail(reason="casts invalid_scalar to string")
-            request.node.add_marker(mark)
-
-        super().test_insert_invalid(data, invalid_scalar)
-
 
 class TestCasting(base.BaseCastingTests):
     pass
diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py
index 7566c17eda9e6..ea76a4b4b1cfc 100644
--- a/pandas/tests/indexes/common.py
+++ b/pandas/tests/indexes/common.py
@@ -403,6 +403,33 @@ def test_insert_base(self, index):
         # test 0th element
         assert index[0:4].equals(result.insert(0, index[0]))
 
+    def test_insert_out_of_bounds(self, index):
+        # TypeError/IndexError matches what np.insert raises in these cases
+
+        if len(index) > 0:
+            err = TypeError
+        else:
+            err = IndexError
+        if len(index) == 0:
+            # 0 vs 0.5 in error message varies with numpy version
+            msg = "index (0|0.5) is out of bounds for axis 0 with size 0"
+        else:
+            msg = "slice indices must be integers or None or have an __index__ method"
+        with pytest.raises(err, match=msg):
+            index.insert(0.5, "foo")
+
+        msg = "|".join(
+            [
+                r"index -?\d+ is out of bounds for axis 0 with size \d+",
+                "loc must be an integer between",
+            ]
+        )
+        with pytest.raises(IndexError, match=msg):
+            index.insert(len(index) + 1, 1)
+
+        with pytest.raises(IndexError, match=msg):
+            index.insert(-len(index) - 1, 1)
+
     def test_delete_base(self, index):
         if not len(index):
             return