diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 0118dea6f8867..d9c226176c30e 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -43,6 +43,7 @@ Enhancements API changes ~~~~~~~~~~~ +- ``searchsorted`` for ``Index`` and ``TimedeltaIndex`` now accepts a ``sorter`` argument to maintain compatibility with numpy's ``searchsorted`` function (:issue:`12238`) - ``Period`` and ``PeriodIndex`` now raises ``IncompatibleFrequency`` error which inherits ``ValueError`` rather than raw ``ValueError`` (:issue:`12615`) diff --git a/pandas/core/base.py b/pandas/core/base.py index 168310b6d7da0..3ebd60d45b48d 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -7,7 +7,8 @@ from pandas.core import common as com import pandas.core.nanops as nanops import pandas.lib as lib -from pandas.util.decorators import Appender, cache_readonly, deprecate_kwarg +from pandas.util.decorators import (Appender, cache_readonly, + deprecate_kwarg, Substitution) from pandas.core.common import AbstractMethodError _shared_docs = dict() @@ -990,13 +991,73 @@ def factorize(self, sort=False, na_sentinel=-1): from pandas.core.algorithms import factorize return factorize(self, sort=sort, na_sentinel=na_sentinel) - def searchsorted(self, key, side='left'): - """ np.ndarray searchsorted compat """ + _shared_docs['searchsorted'] = ( + """Find indices where elements should be inserted to maintain order. - # FIXME in GH7447 - # needs coercion on the key (DatetimeIndex does alreay) - # needs tests/doc-string - return self.values.searchsorted(key, side=side) + Find the indices into a sorted %(klass)s `self` such that, if the + corresponding elements in `%(value)s` were inserted before the indices, the + order of `self` would be preserved. + + Parameters + ---------- + %(value)s : array_like + Values to insert into `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. 
+ If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort `self` into ascending + order. They are typically the result of ``np.argsort``. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `%(value)s`. + + See Also + -------- + numpy.searchsorted + + Notes + ----- + Binary search is used to find the required insertion points. + + Examples + -------- + >>> x = pd.Series([1, 2, 3]) + >>> x + 0 1 + 1 2 + 2 3 + dtype: int64 + >>> x.searchsorted(4) + array([3]) + >>> x.searchsorted([0, 4]) + array([0, 3]) + >>> x.searchsorted([1, 3], side='left') + array([0, 2]) + >>> x.searchsorted([1, 3], side='right') + array([1, 3]) + >>> + >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk' ]) + [apple, bread, bread, cheese, milk] + Categories (4, object): [apple < bread < cheese < milk] + >>> x.searchsorted('bread') + array([1]) # Note: an array, not a scalar + >>> x.searchsorted(['bread']) + array([1]) + >>> x.searchsorted(['bread', 'eggs']) + array([1, 4]) + >>> x.searchsorted(['bread', 'eggs'], side='right') + array([3, 4]) # eggs before milk + """) + + @Substitution(klass='IndexOpsMixin', value='key') + @Appender(_shared_docs['searchsorted']) + def searchsorted(self, key, side='left', sorter=None): + # needs coercion on the key (DatetimeIndex does already) + return self.values.searchsorted(key, side=side, sorter=sorter) _shared_docs['drop_duplicates'] = ( """Return %(klass)s with duplicate values removed diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 35fa06ce5009c..69c1adbfae574 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -8,10 +8,12 @@ from pandas.compat import u from pandas.core.algorithms import factorize -from pandas.core.base import PandasObject, PandasDelegate, NoNewAttributesMixin +from pandas.core.base 
import (PandasObject, PandasDelegate, + NoNewAttributesMixin, _shared_docs) import pandas.core.common as com from pandas.core.missing import interpolate_2d -from pandas.util.decorators import cache_readonly, deprecate_kwarg +from pandas.util.decorators import (Appender, cache_readonly, + deprecate_kwarg, Substitution) from pandas.core.common import ( ABCSeries, ABCIndexClass, ABCCategoricalIndex, isnull, notnull, @@ -1003,59 +1005,9 @@ def memory_usage(self, deep=False): """ return self._codes.nbytes + self._categories.memory_usage(deep=deep) + @Substitution(klass='Categorical', value='v') + @Appender(_shared_docs['searchsorted']) def searchsorted(self, v, side='left', sorter=None): - """Find indices where elements should be inserted to maintain order. - - Find the indices into a sorted Categorical `self` such that, if the - corresponding elements in `v` were inserted before the indices, the - order of `self` would be preserved. - - Parameters - ---------- - v : array_like - Array-like values or a scalar value, to insert/search for in - `self`. - side : {'left', 'right'}, optional - If 'left', the index of the first suitable location found is given. - If 'right', return the last such index. If there is no suitable - index, return either 0 or N (where N is the length of `a`). - sorter : 1-D array_like, optional - Optional array of integer indices that sort `self` into ascending - order. They are typically the result of ``np.argsort``. - - Returns - ------- - indices : array of ints - Array of insertion points with the same shape as `v`. - - See Also - -------- - Series.searchsorted - numpy.searchsorted - - Notes - ----- - Binary search is used to find the required insertion points. 
- - Examples - -------- - >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk' ]) - [apple, bread, bread, cheese, milk] - Categories (4, object): [apple < bread < cheese < milk] - >>> x.searchsorted('bread') - array([1]) # Note: an array, not a scalar - >>> x.searchsorted(['bread']) - array([1]) - >>> x.searchsorted(['bread', 'eggs']) - array([1, 4]) - >>> x.searchsorted(['bread', 'eggs'], side='right') - array([3, 4]) # eggs before milk - >>> x = pd.Categorical(['apple', 'bread', 'bread', 'cheese', 'milk', - 'donuts' ]) - >>> x.searchsorted(['bread', 'eggs'], side='right', - sorter=[0, 1, 2, 3, 5, 4]) - array([3, 5]) # eggs after donuts, after switching milk and donuts - """ if not self.ordered: raise ValueError("Categorical not ordered\nyou can use " ".as_ordered() to change the Categorical to an " @@ -1063,7 +1015,8 @@ def searchsorted(self, v, side='left', sorter=None): from pandas.core.series import Series values_as_codes = self.categories.values.searchsorted( - Series(v).values, side) + Series(v).values, side=side) + return self.codes.searchsorted(values_as_codes, sorter=sorter) def isnull(self): diff --git a/pandas/core/series.py b/pandas/core/series.py index 80154065f0c8f..ffbea0d5704e3 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -49,7 +49,7 @@ import pandas.core.datetools as datetools import pandas.core.format as fmt import pandas.core.nanops as nanops -from pandas.util.decorators import Appender, deprecate_kwarg +from pandas.util.decorators import Appender, deprecate_kwarg, Substitution import pandas.lib as lib import pandas.tslib as tslib @@ -1464,63 +1464,11 @@ def dot(self, other): else: # pragma: no cover raise TypeError('unsupported type: %s' % type(other)) + @Substitution(klass='Series', value='v') + @Appender(base._shared_docs['searchsorted']) def searchsorted(self, v, side='left', sorter=None): - """Find indices where elements should be inserted to maintain order. 
- - Find the indices into a sorted Series `self` such that, if the - corresponding elements in `v` were inserted before the indices, the - order of `self` would be preserved. - - Parameters - ---------- - v : array_like - Values to insert into `a`. - side : {'left', 'right'}, optional - If 'left', the index of the first suitable location found is given. - If 'right', return the last such index. If there is no suitable - index, return either 0 or N (where N is the length of `a`). - sorter : 1-D array_like, optional - Optional array of integer indices that sort `self` into ascending - order. They are typically the result of ``np.argsort``. - - Returns - ------- - indices : array of ints - Array of insertion points with the same shape as `v`. - - See Also - -------- - Series.sort_values - numpy.searchsorted - - Notes - ----- - Binary search is used to find the required insertion points. - - Examples - -------- - >>> x = pd.Series([1, 2, 3]) - >>> x - 0 1 - 1 2 - 2 3 - dtype: int64 - >>> x.searchsorted(4) - array([3]) - >>> x.searchsorted([0, 4]) - array([0, 3]) - >>> x.searchsorted([1, 3], side='left') - array([0, 2]) - >>> x.searchsorted([1, 3], side='right') - array([1, 3]) - >>> x.searchsorted([1, 2], side='right', sorter=[0, 2, 1]) - array([1, 3]) - """ - if sorter is not None: - sorter = com._ensure_platform_int(sorter) - - return self._values.searchsorted(Series(v)._values, side=side, - sorter=sorter) + return self._values.searchsorted(Series(v)._values, + side=side, sorter=sorter) # ------------------------------------------------------------------- # Combination diff --git a/pandas/tests/test_base.py b/pandas/tests/test_base.py index 99f894bfd3320..0a64bb058fbb4 100644 --- a/pandas/tests/test_base.py +++ b/pandas/tests/test_base.py @@ -972,6 +972,15 @@ def test_memory_usage(self): diff = res_deep - sys.getsizeof(o) self.assertTrue(abs(diff) < 100) + def test_searchsorted(self): + # See gh-12238 + for o in self.objs: + index = np.searchsorted(o, max(o)) + 
self.assertTrue(0 <= index <= len(o)) + + index = np.searchsorted(o, max(o), sorter=range(len(o))) + self.assertTrue(0 <= index <= len(o)) + class TestFloat64HashTable(tm.TestCase): diff --git a/pandas/tseries/index.py b/pandas/tseries/index.py index b3b43e1a5babb..8381273873dcf 100644 --- a/pandas/tseries/index.py +++ b/pandas/tseries/index.py @@ -5,6 +5,7 @@ from datetime import time, datetime from datetime import timedelta import numpy as np +from pandas.core.base import _shared_docs from pandas.core.common import (_NS_DTYPE, _INT64_DTYPE, _values_from_object, _maybe_box, is_object_dtype, is_datetime64_dtype, @@ -22,7 +23,8 @@ from pandas.tseries.offsets import DateOffset, generate_range, Tick, CDay from pandas.tseries.tools import parse_time_string, normalize_date, to_time from pandas.tseries.timedeltas import to_timedelta -from pandas.util.decorators import cache_readonly, deprecate_kwarg +from pandas.util.decorators import (Appender, cache_readonly, + deprecate_kwarg, Substitution) import pandas.core.common as com import pandas.tseries.offsets as offsets import pandas.tseries.tools as tools @@ -1629,7 +1631,9 @@ def normalize(self): return DatetimeIndex(new_values, freq='infer', name=self.name, tz=self.tz) - def searchsorted(self, key, side='left'): + @Substitution(klass='DatetimeIndex', value='key') + @Appender(_shared_docs['searchsorted']) + def searchsorted(self, key, side='left', sorter=None): if isinstance(key, (np.ndarray, Index)): key = np.array(key, dtype=_NS_DTYPE, copy=False) else: diff --git a/pandas/tseries/period.py b/pandas/tseries/period.py index 798df0b9e31bd..b34af4e62845b 100644 --- a/pandas/tseries/period.py +++ b/pandas/tseries/period.py @@ -13,13 +13,14 @@ get_period_field_arr, _validate_end_alias, _quarter_to_myear) +from pandas.core.base import _shared_docs + import pandas.core.common as com from pandas.core.common import (isnull, _INT64_DTYPE, _maybe_box, _values_from_object, ABCSeries, is_integer, is_float, is_object_dtype) from 
pandas import compat -from pandas.util.decorators import cache_readonly - +from pandas.util.decorators import Appender, cache_readonly, Substitution from pandas.lib import Timedelta import pandas.lib as lib import pandas.tslib as tslib @@ -385,7 +386,9 @@ def astype(self, dtype): return Index(self.values, dtype) raise ValueError('Cannot cast PeriodIndex to dtype %s' % dtype) - def searchsorted(self, key, side='left'): + @Substitution(klass='PeriodIndex', value='key') + @Appender(_shared_docs['searchsorted']) + def searchsorted(self, key, side='left', sorter=None): if isinstance(key, Period): if key.freq != self.freq: msg = _DIFFERENT_FREQ_INDEX.format(self.freqstr, key.freqstr) @@ -394,7 +397,7 @@ def searchsorted(self, key, side='left'): elif isinstance(key, compat.string_types): key = Period(key, freq=self.freq).ordinal - return self.values.searchsorted(key, side=side) + return self.values.searchsorted(key, side=side, sorter=sorter) @property def is_all_dates(self): diff --git a/pandas/tseries/tdi.py b/pandas/tseries/tdi.py index bea2aeb508358..6e54f1fde8a8f 100644 --- a/pandas/tseries/tdi.py +++ b/pandas/tseries/tdi.py @@ -10,7 +10,9 @@ import pandas.compat as compat from pandas.compat import u from pandas.tseries.frequencies import to_offset +from pandas.core.base import _shared_docs import pandas.core.common as com +from pandas.util.decorators import Appender, Substitution from pandas.tseries.base import TimelikeOps, DatetimeIndexOpsMixin from pandas.tseries.timedeltas import (to_timedelta, _coerce_scalar_to_timedelta_type) @@ -786,13 +788,15 @@ def _partial_td_slice(self, key, freq, use_lhs=True, use_rhs=True): # # try to find a the dates # return (lhs_mask & rhs_mask).nonzero()[0] - def searchsorted(self, key, side='left'): + @Substitution(klass='TimedeltaIndex', value='key') + @Appender(_shared_docs['searchsorted']) + def searchsorted(self, key, side='left', sorter=None): if isinstance(key, (np.ndarray, Index)): key = np.array(key, dtype=_TD_DTYPE, 
copy=False) else: key = _to_m8(key) - return self.values.searchsorted(key, side=side) + return self.values.searchsorted(key, side=side, sorter=sorter) def is_type_compatible(self, typ): return typ == self.inferred_type or typ == 'timedelta'