From ac94414a9506b485be9b5b5a7b5eb6cabf5f4481 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 19 Dec 2018 07:10:36 -0600 Subject: [PATCH 1/5] ENH: ExtensionArray.searchsorted --- doc/source/whatsnew/v0.24.0.rst | 1 + pandas/core/arrays/base.py | 49 ++++++++++++++++++++++ pandas/core/arrays/sparse.py | 10 +++++ pandas/tests/extension/base/methods.py | 20 +++++++++ pandas/tests/extension/json/test_json.py | 4 ++ pandas/tests/extension/test_categorical.py | 4 ++ pandas/tests/extension/test_sparse.py | 4 ++ 7 files changed, 92 insertions(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index fe5e4a57c557a..60227652e6e7d 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -990,6 +990,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) +- :meth:`~pandas.api.types.ExtensionArray.searchsorted` has been added (:issue:``) - An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) - Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). - The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index cf145064fd7b1..e80af43c624fe 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -64,6 +64,7 @@ class ExtensionArray(object): * unique * factorize / _values_for_factorize * argsort / _values_for_argsort + * searchsorted The remaining methods implemented on this class should be performant, as they only compose abstract methods. Still, a more efficient @@ -505,6 +506,54 @@ def unique(self): uniques = unique(self.astype(object)) return self._from_sequence(uniques, dtype=self.dtype) + def searchsorted(self, v, side="left", sorter=None): + """ + Find indices where elements should be inserted to maintain order. + + .. versionadded:: 0.25.0 + + Find the indices into a sorted array `self` (a) such that, if the + corresponding elements in `v` were inserted before the indices, the + order of `self` would be preserved. + + Assuming that `a` is sorted: + + ====== ============================ + `side` returned index `i` satisfies + ====== ============================ + left ``self[i-1] < v <= self[i]`` + right ``self[i-1] <= v < self[i]`` + ====== ============================ + + Parameters + ---------- + v : array_like + Values to insert into `self`. + side : {'left', 'right'}, optional + If 'left', the index of the first suitable location found is given. + If 'right', return the last such index. If there is no suitable + index, return either 0 or N (where N is the length of `self`). + sorter : 1-D array_like, optional + Optional array of integer indices that sort array a into ascending + order. They are typically the result of argsort. + + Returns + ------- + indices : array of ints + Array of insertion points with the same shape as `v`. + + See Also + -------- + numpy.searchsorted : Similar method from NumPy. + """ + # Note: the base tests provided by pandas only test the basics. + # We do not test + # 1. Values outside the range of the `data_for_sorting` fixture + # 2. Values between the values in the `data_for_sorting` fixture + # 3. Missing values. + arr = self.astype(object) + return arr.searchsorted(v, side=side, sorter=sorter) + def _values_for_factorize(self): # type: () -> Tuple[ndarray, Any] """ diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 9e1d2efc21b81..4ceeaa1fa9157 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1166,6 +1166,16 @@ def _take_without_fill(self, indices): return taken + def searchsorted(self, v, side="left", sorter=None): + msg = "searchsorted requires high memory usage." + warnings.warn(msg, PerformanceWarning, stacklevel=2) + if not is_scalar(v): + v = np.asarray(v) + v = np.asarray(v) + return np.asarray(self, dtype=self.dtype.subtype).searchsorted( + v, side, sorter + ) + def copy(self, deep=False): if deep: values = self.sp_values.copy() diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 4a409a84f3db4..93e6e70372478 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -231,6 +231,26 @@ def test_hash_pandas_object_works(self, data, as_frame): b = pd.util.hash_pandas_object(data) self.assert_equal(a, b) + def test_searchsorted(self, data_for_sorting): + b, c, a = data_for_sorting + arr = type(data_for_sorting)._from_sequence([a, b, c]) + assert arr.searchsorted(a) == 0 + assert arr.searchsorted(a, side="right") == 1 + + assert arr.searchsorted(b) == 1 + assert arr.searchsorted(b, side="right") == 2 + + assert arr.searchsorted(c) == 2 + assert arr.searchsorted(c, side="right") == 3 + + result = arr.searchsorted(arr.take([0, 2])) + expected = np.array([0, 2]) + tm.assert_numpy_array_equal(result, expected) + + # sorter + sorter = np.array([1, 2, 0]) + assert data_for_sorting.searchsorted(a, sorter=sorter) == 0 + @pytest.mark.parametrize("as_frame", [True, False]) def test_where_series(self, data, na_value, as_frame): assert data[0] != data[1] diff --git a/pandas/tests/extension/json/test_json.py b/pandas/tests/extension/json/test_json.py index a35997b07fd83..9ee131950f19c 100644 --- a/pandas/tests/extension/json/test_json.py +++ b/pandas/tests/extension/json/test_json.py @@ -232,6 +232,10 @@ def test_where_series(self, data, na_value): # with shapes (4,) (4,) (0,) super().test_where_series(data, na_value) + @pytest.mark.skip(reason="Can't compare dicts.") + def test_searchsorted(self, data_for_sorting): + super(TestMethods, self).test_searchsorted(data_for_sorting) + class TestCasting(BaseJSON, base.BaseCastingTests): @pytest.mark.skip(reason="failing on np.array(self, dtype=str)") diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 6106bc3d58620..c876db416470c 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -189,6 +189,10 @@ def test_combine_add(self, data_repeated): def test_fillna_length_mismatch(self, data_missing): super().test_fillna_length_mismatch(data_missing) + def test_searchsorted(self, data_for_sorting): + if not data_for_sorting.ordered: + raise pytest.skip(reason="searchsorted requires ordered data.") + class TestCasting(base.BaseCastingTests): pass diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index ea849a78cda12..7fceade674595 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -286,6 +286,10 @@ def test_combine_first(self, data): pytest.skip("TODO(SparseArray.__setitem__ will preserve dtype.") super(TestMethods, self).test_combine_first(data) + def test_searchsorted(self, data_for_sorting): + with tm.assert_produces_warning(PerformanceWarning): + super(TestMethods, self).test_searchsorted(data_for_sorting) + class TestCasting(BaseSparseTests, base.BaseCastingTests): pass From 58418abee8b1a02f9b4b1512286b0db96aaf35aa Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 19 Dec 2018 07:22:17 -0600 Subject: [PATCH 2/5] PR number --- doc/source/whatsnew/v0.24.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 60227652e6e7d..b26a30f0cd2b8 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -990,7 +990,7 @@ update the ``ExtensionDtype._metadata`` tuple to match the signature of your - ``ExtensionArray`` has gained the abstract methods ``.dropna()`` (:issue:`21185`) - ``ExtensionDtype`` has gained the ability to instantiate from string dtypes, e.g. ``decimal`` would instantiate a registered ``DecimalDtype``; furthermore the ``ExtensionDtype`` has gained the method ``construct_array_type`` (:issue:`21185`) -- :meth:`~pandas.api.types.ExtensionArray.searchsorted` has been added (:issue:``) +- :meth:`~pandas.api.types.ExtensionArray.searchsorted` has been added (:issue:`24350`) - An ``ExtensionArray`` with a boolean dtype now works correctly as a boolean indexer. :meth:`pandas.api.types.is_bool_dtype` now properly considers them boolean (:issue:`22326`) - Added ``ExtensionDtype._is_numeric`` for controlling whether an extension dtype is considered numeric (:issue:`22290`). - The ``ExtensionArray`` constructor, ``_from_sequence`` now take the keyword arg ``copy=False`` (:issue:`21185`) From fee9c1a332de0f272f9c4480915bb0686e44724b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 19 Dec 2018 08:16:32 -0600 Subject: [PATCH 3/5] 32-bit compat --- pandas/tests/extension/base/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 93e6e70372478..daa7fe05b9cc8 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -244,7 +244,7 @@ def test_searchsorted(self, data_for_sorting): assert arr.searchsorted(c, side="right") == 3 result = arr.searchsorted(arr.take([0, 2])) - expected = np.array([0, 2]) + expected = np.array([0, 2], dtype=np.intp) tm.assert_numpy_array_equal(result, expected) # sorter From ff8bbc361b5bed4c244ba38a4a49c0e2e79758e5 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Wed, 19 Dec 2018 08:28:41 -0600 Subject: [PATCH 4/5] updates --- pandas/core/arrays/base.py | 2 +- pandas/core/base.py | 2 +- pandas/tests/extension/base/methods.py | 7 ++++++- pandas/tests/extension/test_sparse.py | 6 ++++-- 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index e80af43c624fe..7226311614bc2 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -510,7 +510,7 @@ def searchsorted(self, v, side="left", sorter=None): """ Find indices where elements should be inserted to maintain order. - .. versionadded:: 0.25.0 + .. versionadded:: 0.24.0 Find the indices into a sorted array `self` (a) such that, if the corresponding elements in `v` were inserted before the indices, the diff --git a/pandas/core/base.py b/pandas/core/base.py index 928e90977f95b..2b1dc4e86663a 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -1399,7 +1399,7 @@ def factorize(self, sort=False, na_sentinel=-1): @Appender(_shared_docs['searchsorted']) def searchsorted(self, value, side='left', sorter=None): # needs coercion on the key (DatetimeIndex does already) - return self.values.searchsorted(value, side=side, sorter=sorter) + return self._values.searchsorted(value, side=side, sorter=sorter) def drop_duplicates(self, keep='first', inplace=False): inplace = validate_bool_kwarg(inplace, 'inplace') diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index daa7fe05b9cc8..00d5c18c28edd 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -231,9 +231,13 @@ def test_hash_pandas_object_works(self, data, as_frame): b = pd.util.hash_pandas_object(data) self.assert_equal(a, b) - def test_searchsorted(self, data_for_sorting): + @pytest.mark.parametrize("as_series", [True, False]) + def test_searchsorted(self, data_for_sorting, as_series): b, c, a = data_for_sorting arr = type(data_for_sorting)._from_sequence([a, b, c]) + + if as_series: + arr = pd.Series(arr) assert arr.searchsorted(a) == 0 assert arr.searchsorted(a, side="right") == 1 @@ -245,6 +249,7 @@ def test_searchsorted(self, data_for_sorting): result = arr.searchsorted(arr.take([0, 2])) expected = np.array([0, 2], dtype=np.intp) + tm.assert_numpy_array_equal(result, expected) # sorter diff --git a/pandas/tests/extension/test_sparse.py b/pandas/tests/extension/test_sparse.py index 7fceade674595..257eb44cd94fe 100644 --- a/pandas/tests/extension/test_sparse.py +++ b/pandas/tests/extension/test_sparse.py @@ -286,9 +286,11 @@ def test_combine_first(self, data): pytest.skip("TODO(SparseArray.__setitem__ will preserve dtype.") super(TestMethods, self).test_combine_first(data) - def test_searchsorted(self, data_for_sorting): + @pytest.mark.parametrize("as_series", [True, False]) + def test_searchsorted(self, data_for_sorting, as_series): with tm.assert_produces_warning(PerformanceWarning): - super(TestMethods, self).test_searchsorted(data_for_sorting) + super(TestMethods, self).test_searchsorted(data_for_sorting, + as_series=as_series) class TestCasting(BaseSparseTests, base.BaseCastingTests): From a91fcec63155c010a1f6677b61dbd6d0235b111d Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 28 Dec 2018 13:54:45 -0600 Subject: [PATCH 5/5] v -> value --- pandas/core/arrays/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index 0f4bcf41a9331..2d4f8ca9c2cee 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -519,7 +519,7 @@ def unique(self): uniques = unique(self.astype(object)) return self._from_sequence(uniques, dtype=self.dtype) - def searchsorted(self, v, side="left", sorter=None): + def searchsorted(self, value, side="left", sorter=None): """ Find indices where elements should be inserted to maintain order. @@ -540,7 +540,7 @@ def searchsorted(self, v, side="left", sorter=None): Parameters ---------- - v : array_like + value : array_like Values to insert into `self`. side : {'left', 'right'}, optional If 'left', the index of the first suitable location found is given. @@ -553,7 +553,7 @@ def searchsorted(self, v, side="left", sorter=None): Returns ------- indices : array of ints - Array of insertion points with the same shape as `v`. + Array of insertion points with the same shape as `value`. See Also -------- @@ -565,7 +565,7 @@ def searchsorted(self, v, side="left", sorter=None): # 2. Values between the values in the `data_for_sorting` fixture # 3. Missing values. arr = self.astype(object) - return arr.searchsorted(v, side=side, sorter=sorter) + return arr.searchsorted(value, side=side, sorter=sorter) def _values_for_factorize(self): # type: () -> Tuple[ndarray, Any]