diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx
index 31b507e9b7800..cdb53e0e838e9 100644
--- a/pandas/_libs/index.pyx
+++ b/pandas/_libs/index.pyx
@@ -87,11 +87,7 @@ cdef class IndexEngine:
             values = self.values
 
             self._check_type(val)
-            try:
-                loc = _bin_search(values, val)  # .searchsorted(val, side='left')
-            except TypeError:
-                # GH#35788 e.g. val=None with float64 values
-                raise KeyError(val)
+            loc = self._searchsorted_left(val)
             if loc >= len(values):
                 raise KeyError(val)
             if values[loc] != val:
@@ -110,6 +106,17 @@ cdef class IndexEngine:
             # GH#41775 OverflowError e.g. if we are uint64 and val is -1
             raise KeyError(val)
 
+    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
+        """
+        Return the side="left" searchsorted position of val in self.values.
+        """
+        try:
+            loc = self.values.searchsorted(val, side="left")
+        except TypeError as err:
+            # GH#35788 e.g. val=None with float64 values
+            raise KeyError(val) from err
+        return loc
+
     cdef inline _get_loc_duplicates(self, object val):
         # -> Py_ssize_t | slice | ndarray[bool]
         cdef:
@@ -373,6 +380,11 @@ cdef class IndexEngine:
 
 
 cdef Py_ssize_t _bin_search(ndarray values, object val) except -1:
+    # GH#1757 ndarray.searchsorted is not safe to use with array of tuples
+    # (treats a tuple `val` as a sequence of keys instead of a single key),
+    # so we implement something similar.
+    # This is equivalent to the stdlib's bisect.bisect_left
+
     cdef:
         Py_ssize_t mid = 0, lo = 0, hi = len(values) - 1
         object pval
@@ -405,6 +417,15 @@ cdef class ObjectEngine(IndexEngine):
     cdef _make_hash_table(self, Py_ssize_t n):
         return _hash.PyObjectHashTable(n)
 
+    cdef Py_ssize_t _searchsorted_left(self, val) except? -1:
+        # using values.searchsorted here would treat a tuple `val` as a sequence
+        # instead of a single key, so we use a different implementation
+        try:
+            loc = _bin_search(self.values, val)
+        except TypeError as err:
+            raise KeyError(val) from err
+        return loc
+
 
 cdef class DatetimeEngine(Int64Engine):
 
diff --git a/pandas/tests/indexes/base_class/test_indexing.py b/pandas/tests/indexes/base_class/test_indexing.py
index 654f5a89f1828..d961f3f416bda 100644
--- a/pandas/tests/indexes/base_class/test_indexing.py
+++ b/pandas/tests/indexes/base_class/test_indexing.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pytest
 
+import pandas as pd
 from pandas import Index
 import pandas._testing as tm
 
@@ -36,3 +37,22 @@ def test_get_indexer_non_unique_dtype_mismatch(self):
         indexes, missing = Index(["A", "B"]).get_indexer_non_unique(Index([0]))
         tm.assert_numpy_array_equal(np.array([-1], dtype=np.intp), indexes)
         tm.assert_numpy_array_equal(np.array([0], dtype=np.intp), missing)
+
+
+class TestGetLoc:
+    @pytest.mark.slow  # to_flat_index takes a while
+    def test_get_loc_tuple_monotonic_above_size_cutoff(self):
+        # Go through the libindex path for which using
+        # _bin_search vs ndarray.searchsorted makes a difference
+
+        lev = list("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
+        dti = pd.date_range("2016-01-01", periods=100)
+
+        mi = pd.MultiIndex.from_product([lev, range(10 ** 3), dti])
+        oidx = mi.to_flat_index()
+
+        loc = len(oidx) // 2
+        tup = oidx[loc]
+
+        res = oidx.get_loc(tup)
+        assert res == loc
diff --git a/pandas/tests/indexes/numeric/test_indexing.py b/pandas/tests/indexes/numeric/test_indexing.py
index 8f113491dad60..dae8a3340bcd2 100644
--- a/pandas/tests/indexes/numeric/test_indexing.py
+++ b/pandas/tests/indexes/numeric/test_indexing.py
@@ -152,6 +152,14 @@ def test_get_loc_float_index_nan_with_method(self, vals, method):
         with tm.assert_produces_warning(FutureWarning, match="deprecated"):
             idx.get_loc(np.nan, method=method)
 
+    @pytest.mark.parametrize("dtype", ["f8", "i8", "u8"])
+    def test_get_loc_numericindex_none_raises(self, dtype):
+        # case that goes through searchsorted and key is non-comparable to values
+        arr = np.arange(10 ** 7, dtype=dtype)
+        idx = Index(arr)
+        with pytest.raises(KeyError, match="None"):
+            idx.get_loc(None)
+
 
 class TestGetIndexer:
     def test_get_indexer(self):