-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF/REF: improve performance of Series.searchsorted, PandasArray.searchsorted, collect functionality #22034
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 8 commits
6ad3f12
60742c3
672802d
c1a337c
686a0a1
ea8280e
a9905fd
9e6ed43
bcbe226
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -64,7 +64,8 @@ Performance Improvements | |
|
||
- Significant speedup in `SparseArray` initialization that benefits most operations, fixing performance regression introduced in v0.20.0 (:issue:`24985`) | ||
- `DataFrame.to_stata()` is now faster when outputting data with any string or non-native endian columns (:issue:`25045`) | ||
- | ||
- Improved performance of :meth:`Series.searchsorted`. The speedup is especially large when the dtype is | ||
int8/int16/int32 and the searched key is within the integer bounds for the dtype (:issue:`22034`) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. need a space before the parens |
||
|
||
|
||
.. _whatsnew_0250.bug_fixes: | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,7 +19,7 @@ | |
ensure_float64, ensure_int64, ensure_object, ensure_platform_int, | ||
ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, | ||
is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, | ||
is_datetimelike, is_extension_array_dtype, is_float_dtype, | ||
is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer, | ||
is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, | ||
is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, | ||
is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype, | ||
|
@@ -1724,6 +1724,88 @@ def func(arr, indexer, out, fill_value=np.nan): | |
return out | ||
|
||
|
||
# ------------ # | ||
# searchsorted # | ||
# ------------ # | ||
|
||
def searchsorted(arr, value, side="left", sorter=None): | ||
""" | ||
Find indices where elements should be inserted to maintain order. | ||
|
||
.. versionadded:: 0.25.0 | ||
|
||
Find the indices into a sorted array `arr` such that, if the | ||
corresponding elements in `value` were inserted before the indices, | ||
the order of `arr` would be preserved. | ||
|
||
Assuming that `arr` is sorted: | ||
|
||
====== ================================ | ||
`side` returned index `i` satisfies | ||
====== ================================ | ||
left ``arr[i-1] < value <= arr[i]`` | ||
right ``arr[i-1] <= value < arr[i]`` | ||
====== ================================ | ||
|
||
Parameters | ||
---------- | ||
arr : array-like | ||
Array to search in. It must be sorted in ascending order, unless | ||
`sorter` is given, in which case `sorter` must sort it. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Not sure what this is referring to. Why is this not an array-like here? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yes, this text is wrong now; it can indeed be array-like. |
||
value : array_like | ||
Values to insert into `arr`. | ||
side : {'left', 'right'}, optional | ||
If 'left', the index of the first suitable location found is given. | ||
If 'right', return the last such index. If there is no suitable | ||
index, return either 0 or N (where N is the length of `arr`). | ||
sorter : 1-D array_like, optional | ||
Optional array of integer indices that sort `arr` into ascending | ||
order. They are typically the result of argsort. | ||
|
||
Returns | ||
------- | ||
array of ints | ||
Array of insertion points with the same shape as `value`. | ||
|
||
See Also | ||
-------- | ||
numpy.searchsorted : Similar method from NumPy. | ||
""" | ||
if sorter is not None: | ||
sorter = ensure_platform_int(sorter) | ||
|
||
if isinstance(arr, np.ndarray) and is_integer_dtype(arr) and ( | ||
is_integer(value) or is_integer_dtype(value)): | ||
from .arrays.array_ import array | ||
# if `arr` and `value` have different dtypes, `arr` would be | ||
# recast by numpy, causing a slow search. | ||
# Before searching below, we therefore try to give `value` the | ||
# same dtype as `arr`, while guarding against integer overflows. | ||
iinfo = np.iinfo(arr.dtype.type) | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
value_arr = np.array([value]) if is_scalar(value) else np.array(value) | ||
if (value_arr >= iinfo.min).all() and (value_arr <= iinfo.max).all(): | ||
# value within bounds, so no overflow, so can convert value dtype | ||
# to dtype of arr | ||
dtype = arr.dtype | ||
else: | ||
dtype = value_arr.dtype | ||
|
||
if is_scalar(value): | ||
value = dtype.type(value) | ||
else: | ||
value = array(value, dtype=dtype) | ||
elif not (is_object_dtype(arr) or is_numeric_dtype(arr) or | ||
is_categorical_dtype(arr)): | ||
from pandas.core.series import Series | ||
# E.g. if `arr` is an array with dtype='datetime64[ns]' | ||
# and `value` is a pd.Timestamp, we may need to convert value | ||
value_ser = Series(value)._values | ||
value = value_ser[0] if is_scalar(value) else value_ser | ||
|
||
result = arr.searchsorted(value, side=side, sorter=sorter) | ||
return result | ||
|
||
|
||
# ---- # | ||
# diff # | ||
# ---- # | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ | |
|
||
import pandas as pd | ||
from pandas.api.extensions import register_extension_dtype | ||
from pandas.api.types import is_scalar | ||
from pandas.core.arrays import PandasArray, integer_array, period_array | ||
from pandas.tests.extension.decimal import ( | ||
DecimalArray, DecimalDtype, to_decimal) | ||
|
@@ -254,3 +255,45 @@ def test_array_not_registered(registry_without_decimal): | |
result = pd.array(data, dtype=DecimalDtype) | ||
expected = DecimalArray._from_sequence(data) | ||
tm.assert_equal(result, expected) | ||
|
||
|
||
class TestArrayAnalytics(object): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There are already tests in pandas/tests/extension/base/methods.py; are these supplemental? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Extension arrays and PandasArrays do not have the same interface, so the tests in pandas/tests/extension/base/methods.py will not work for PandasArrays. E.g. BTW, I copied these tests from pandas/tests/series/test_analytics.py. |
||
def test_searchsorted(self, string_dtype): | ||
arr = pd.array(['a', 'b', 'c'], dtype=string_dtype) | ||
|
||
result = arr.searchsorted('a', side='left') | ||
assert is_scalar(result) | ||
assert result == 0 | ||
|
||
result = arr.searchsorted('a', side='right') | ||
assert is_scalar(result) | ||
assert result == 1 | ||
|
||
def test_searchsorted_numeric_dtypes_scalar(self, any_real_dtype): | ||
arr = pd.array([1, 3, 90], dtype=any_real_dtype) | ||
result = arr.searchsorted(30) | ||
assert is_scalar(result) | ||
assert result == 2 | ||
|
||
result = arr.searchsorted([30]) | ||
expected = np.array([2], dtype=np.intp) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
def test_searchsorted_numeric_dtypes_vector(self, any_real_dtype): | ||
arr = pd.array([1, 3, 90], dtype=any_real_dtype) | ||
result = arr.searchsorted([2, 30]) | ||
expected = np.array([1, 2], dtype=np.intp) | ||
tm.assert_numpy_array_equal(result, expected) | ||
|
||
def test_search_sorted_datetime64_scalar(self): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can you test for timedelta & datetime w/tz as well |
||
arr = pd.array(pd.date_range('20120101', periods=10, freq='2D')) | ||
val = pd.Timestamp('20120102') | ||
result = arr.searchsorted(val) | ||
assert is_scalar(result) | ||
assert result == 1 | ||
|
||
def test_searchsorted_sorter(self, any_real_dtype): | ||
arr = pd.array([3, 1, 2], dtype=any_real_dtype) | ||
result = arr.searchsorted([0, 3], sorter=np.argsort(arr)) | ||
expected = np.array([0, 2], dtype=np.intp) | ||
tm.assert_numpy_array_equal(result, expected) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for a followup can add EA types here (Int8 and so on)