diff --git a/asv_bench/benchmarks/series_methods.py b/asv_bench/benchmarks/series_methods.py index f7d0083b86a01..3303483c50e20 100644 --- a/asv_bench/benchmarks/series_methods.py +++ b/asv_bench/benchmarks/series_methods.py @@ -124,6 +124,25 @@ def time_dropna(self, dtype): self.s.dropna() +class SearchSorted(object): + + goal_time = 0.2 + params = ['int8', 'int16', 'int32', 'int64', + 'uint8', 'uint16', 'uint32', 'uint64', + 'float16', 'float32', 'float64', + 'str'] + param_names = ['dtype'] + + def setup(self, dtype): + N = 10**5 + data = np.array([1] * N + [2] * N + [3] * N).astype(dtype) + self.s = Series(data) + + def time_searchsorted(self, dtype): + key = '2' if dtype == 'str' else 2 + self.s.searchsorted(key) + + class Map(object): params = ['dict', 'Series'] diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 6e225185ecf84..bc4778a3cce2d 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -64,7 +64,8 @@ Performance Improvements - Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) - `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) -- +- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is + int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) .. _whatsnew_0250.bug_fixes: diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index c5c8f47ad6dba..b056a357d0a51 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -19,7 +19,7 @@ ensure_float64, ensure_int64, ensure_object, ensure_platform_int, ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, - is_datetimelike, is_extension_array_dtype, is_float_dtype, + is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype, @@ -1724,6 +1724,89 @@ def func(arr, indexer, out, fill_value=np.nan): return out +# ------------ # +# searchsorted # +# ------------ # + +def searchsorted(arr, value, side="left", sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + .. versionadded:: 0.25.0 + + Find the indices into a sorted array `arr` (a) such that, if the + corresponding elements in `value` were inserted before the indices, + the order of `arr` would be preserved. + + Assuming that `arr` is sorted: + + ====== ================================ + `side` returned index `i` satisfies + ====== ================================ + left ``arr[i-1] < value <= self[i]`` + right ``arr[i-1] <= value < self[i]`` + ====== ================================ + + Parameters + ---------- + arr: array-like + Input array. If `sorter` is None, then it must be sorted in + ascending order, otherwise `sorter` must be an array of indices + that sort it. + value : array_like + Values to insert into `arr`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort array a into ascending + order. They are typically the result of argsort. + + Returns + ------- + array of ints + Array of insertion points with the same shape as `value`. + + See Also + -------- + numpy.searchsorted : Similar method from NumPy. + """ + if sorter is not None: + sorter = ensure_platform_int(sorter) + + if isinstance(arr, np.ndarray) and is_integer_dtype(arr) and ( + is_integer(value) or is_integer_dtype(value)): + from .arrays.array_ import array + # if `arr` and `value` have different dtypes, `arr` would be + # recast by numpy, causing a slow search. + # Before searching below, we therefore try to give `value` the + # same dtype as `arr`, while guarding against integer overflows. + iinfo = np.iinfo(arr.dtype.type) + value_arr = np.array([value]) if is_scalar(value) else np.array(value) + if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): + # value within bounds, so no overflow, so can convert value dtype + # to dtype of arr + dtype = arr.dtype + else: + dtype = value_arr.dtype + + if is_scalar(value): + value = dtype.type(value) + else: + value = array(value, dtype=dtype) + elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or + is_categorical_dtype(arr)): + from pandas.core.series import Series + # E.g. if `arr` is an array with dtype='datetime64[ns]' + # and `value` is a pd.Timestamp, we may need to convert value + value_ser = Series(value)._values + value = value_ser[0] if is_scalar(value) else value_ser + + result = arr.searchsorted(value, side=side, sorter=sorter) + return result + + # ---- # # diff # # ---- # diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 7aaefef3d03e5..e770281596134 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -555,17 +555,17 @@ def searchsorted(self, value, side="left", sorter=None): .. versionadded:: 0.24.0 Find the indices into a sorted array `self` (a) such that, if the - corresponding elements in `v` were inserted before the indices, the - order of `self` would be preserved. + corresponding elements in `value` were inserted before the indices, + the order of `self` would be preserved. - Assuming that `a` is sorted: + Assuming that `self` is sorted: - ====== ============================ + ====== ================================ `side` returned index `i` satisfies - ====== ============================ - left ``self[i-1] < v <= self[i]`` - right ``self[i-1] <= v < self[i]`` - ====== ============================ + ====== ================================ + left ``self[i-1] < value <= self[i]`` + right ``self[i-1] <= value < self[i]`` + ====== ================================ Parameters ---------- @@ -581,7 +581,7 @@ def searchsorted(self, value, side="left", sorter=None): Returns ------- - indices : array of ints + array of ints Array of insertion points with the same shape as `value`. See Also diff --git a/pandas/core/arrays/numpy_.py b/pandas/core/arrays/numpy_.py index 791ff44303e96..8e2ab586cacb6 100644 --- a/pandas/core/arrays/numpy_.py +++ b/pandas/core/arrays/numpy_.py @@ -4,6 +4,7 @@ from pandas._libs import lib from pandas.compat.numpy import function as nv +from pandas.util._decorators import Appender from pandas.util._validators import validate_fillna_kwargs from pandas.core.dtypes.dtypes import ExtensionDtype @@ -12,6 +13,7 @@ from pandas import compat from pandas.core import nanops +from pandas.core.algorithms import searchsorted from pandas.core.missing import backfill_1d, pad_1d from .base import ExtensionArray, ExtensionOpsMixin @@ -423,6 +425,11 @@ def to_numpy(self, dtype=None, copy=False): return result + @Appender(ExtensionArray.searchsorted.__doc__) + def searchsorted(self, value, side='left', sorter=None): + return searchsorted(self.to_numpy(), value, + side=side, sorter=sorter) + # ------------------------------------------------------------------------ # Ops diff --git a/pandas/core/base.py b/pandas/core/base.py index 7fdc64a8d9f85..f896596dd5216 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1522,11 +1522,11 @@ def factorize(self, sort=False, na_sentinel=-1): array([3]) """) - @Substitution(klass='IndexOpsMixin') + @Substitution(klass='Index') @Appender(_shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - # needs coercion on the key (DatetimeIndex does already) - return self._values.searchsorted(value, side=side, sorter=sorter) + return algorithms.searchsorted(self._values, value, + side=side, sorter=sorter) def drop_duplicates(self, keep='first', inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') diff --git a/pandas/core/series.py b/pandas/core/series.py index a5dfe8d43c336..ad7c6af21f637 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -2392,12 +2392,8 @@ def __rmatmul__(self, other): @Substitution(klass='Series') @Appender(base._shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): - if sorter is not None: - sorter = ensure_platform_int(sorter) - result = self._values.searchsorted(Series(value)._values, - side=side, sorter=sorter) - - return result[0] if is_scalar(value) else result + return algorithms.searchsorted(self._values, value, + side=side, sorter=sorter) # ------------------------------------------------------------------- # Combination diff --git a/pandas/tests/arrays/test_array.py b/pandas/tests/arrays/test_array.py index 9fea1989e46df..b68ec2bf348b4 100644 --- a/pandas/tests/arrays/test_array.py +++ b/pandas/tests/arrays/test_array.py @@ -9,6 +9,7 @@ import pandas as pd from pandas.api.extensions import register_extension_dtype +from pandas.api.types import is_scalar from pandas.core.arrays import PandasArray, integer_array, period_array from pandas.tests.extension.decimal import ( DecimalArray, DecimalDtype, to_decimal) @@ -254,3 +255,51 @@ def test_array_not_registered(registry_without_decimal): result = pd.array(data, dtype=DecimalDtype) expected = DecimalArray._from_sequence(data) tm.assert_equal(result, expected) + + +class TestArrayAnalytics(object): + def test_searchsorted(self, string_dtype): + arr = pd.array(['a', 'b', 'c'], dtype=string_dtype) + + result = arr.searchsorted('a', side='left') + assert is_scalar(result) + assert result == 0 + + result = arr.searchsorted('a', side='right') + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_numeric_dtypes_scalar(self, any_real_dtype): + arr = pd.array([1, 3, 90], dtype=any_real_dtype) + result = arr.searchsorted(30) + assert is_scalar(result) + assert result == 2 + + result = arr.searchsorted([30]) + expected = np.array([2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype): + arr = pd.array([1, 3, 90], dtype=any_real_dtype) + result = arr.searchsorted([2, 30]) + expected = np.array([1, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize('arr, val', [ + [pd.date_range('20120101', periods=10, freq='2D'), + pd.Timestamp('20120102')], + [pd.date_range('20120101', periods=10, freq='2D', tz='Asia/Hong_Kong'), + pd.Timestamp('20120102', tz='Asia/Hong_Kong')], + [pd.timedelta_range(start='1 day', end='10 days', periods=10), + pd.Timedelta('2 days')]]) + def test_search_sorted_datetime64_scalar(self, arr, val): + arr = pd.array(arr) + result = arr.searchsorted(val) + assert is_scalar(result) + assert result == 1 + + def test_searchsorted_sorter(self, any_real_dtype): + arr = pd.array([3, 1, 2], dtype=any_real_dtype) + result = arr.searchsorted([0, 3], sorter=np.argsort(arr)) + expected = np.array([0, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected)