
TYP: ExtensionArray delete() and searchsorted() #41513

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged (20 commits, Sep 6, 2021)

Changes from 6 commits

14 changes: 14 additions & 0 deletions pandas/_typing.py
@@ -91,6 +91,20 @@
PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"]
Scalar = Union[PythonScalar, PandasScalar]

# numpy compatible types
NumpyValueArrayLike = Union[PythonScalar, Sequence[PythonScalar]]

Member:
The typing definition of Sequence does not include numpy arrays, EAs, Series, Index etc. #28770

From numpy's type stubs:

    @overload
    def searchsorted(  # type: ignore[misc]
        self,  # >= 1D array
        v: _ScalarLike_co,  # 0D array-like
        side: _SortSide = ...,
        sorter: Optional[_ArrayLikeInt_co] = ...,
    ) -> intp: ...
    @overload
    def searchsorted(
        self,  # >= 1D array
        v: ArrayLike,
        side: _SortSide = ...,
        sorter: Optional[_ArrayLikeInt_co] = ...,
    ) -> ndarray[Any, dtype[intp]]: ...

Suggested change:
- NumpyValueArrayLike = Union[PythonScalar, Sequence[PythonScalar]]
+ NumpyValueArrayLike = npt.ArrayLike | npt._ScalarLike_co

Contributor (author):
Next commit includes that change.

NumpySorter = Union[
int,
np.integer,
bool,
np.ndarray,
Sequence[int],
Sequence[np.integer],
Sequence[bool],
None,
]

Member:
This is equivalent to Optional[npt._ArrayLikeInt_co]?

Also, use Optional[...] instead of Union[..., None]: https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#style-guidelines (although the docs need updating for the new syntax, that is not relevant for aliases).

Contributor (author):
Redefined as suggested in the next commit (a sketch of the revised aliases follows this diff).



# timestamp and timedelta convertible types

TimestampConvertibleTypes = Union[
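
A minimal sketch of the two points discussed above: Sequence[PythonScalar] does not cover numpy arrays under mypy, and the aliases can be rebuilt on numpy.typing instead. The alias spellings below are an assumption about what the next commit does, not the PR's actual code.

from typing import TYPE_CHECKING, Optional, Sequence, Union

import numpy as np

PythonScalar = Union[str, int, float, bool]

def takes_seq(values: Sequence[PythonScalar]) -> None:
    """Stand-in for an API annotated with Sequence[PythonScalar]."""

takes_seq([1, 2, 3])            # fine: a list is a Sequence
# mypy rejects the next call: ndarray is not a Sequence[PythonScalar]
takes_seq(np.array([1, 2, 3]))  # type: ignore[arg-type]

if TYPE_CHECKING:
    import numpy.typing as npt

    # Assumed redefinitions along the lines suggested in the review; the
    # private numpy aliases exist only in numpy's stubs, hence the quotes.
    NumpyValueArrayLike = Union["npt._ScalarLike_co", "npt.ArrayLike"]
    NumpySorter = Optional["npt._ArrayLikeInt_co"]
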
16 changes: 12 additions & 4 deletions pandas/core/algorithms.py
@@ -30,6 +30,8 @@
ArrayLike,
DtypeObj,
FrameOrSeriesUnion,
NumpySorter,
NumpyValueArrayLike,
Scalar,
)
from pandas.util._decorators import doc
@@ -1507,7 +1509,12 @@ def take(
# ------------ #


def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray:
def searchsorted(
arr: ArrayLike,
value: NumpyValueArrayLike,
side: Literal["left", "right"] = "left",

Member:
This is npt._SortSide? We probably don't want to use private aliases in the codebase, but it may be OK to do so in pandas._typing. Thoughts?

Contributor (author):
Can't use _SortSide from numpy, as it is only defined in their .pyi files.

sorter: NumpySorter = None,
) -> np.ndarray:

Member:
Suggested change:
- ) -> np.ndarray:
+ ) -> np.ndarray | int:

Although, if we return the value from numpy unchanged, the return type is np.intp?

The docstring also needs updating?

Contributor (author):
It is np.intp in the next commit. Will fix the docstring. (A sketch of the revised signature follows this diff.)

"""
Find indices where elements should be inserted to maintain order.

@@ -1532,7 +1539,7 @@ def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray:
Input array. If `sorter` is None, then it must be sorted in
ascending order, otherwise `sorter` must be an array of indices
that sort it.
value : array_like
value : array_like or single value
Values to insert into `arr`.
side : {'left', 'right'}, optional
If 'left', the index of the first suitable location found is given.
@@ -1573,9 +1580,10 @@
dtype = value_arr.dtype

if is_scalar(value):
value = dtype.type(value)
# We know that value is int
value = cast(int, dtype.type(value))
else:
value = pd_array(value, dtype=dtype)
value = pd_array(cast(ArrayLike, value), dtype=dtype)
elif not (
is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr)
):
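
Putting the replies above together (np.intp for scalar input, a plain Literal in place of numpy's private _SortSide), the revised signature presumably ends up roughly like the sketch below; the return annotation and alias stand-ins are assumptions, not the PR's final code.

from typing import Literal, Optional, Union

import numpy as np

# Stand-ins for the pandas aliases used in the diff above.
NumpyValueArrayLike = Union[int, float, complex, str, bool, np.ndarray]
NumpySorter = Optional[np.ndarray]

def searchsorted(
    arr: np.ndarray,
    value: NumpyValueArrayLike,
    side: Literal["left", "right"] = "left",  # plain Literal; _SortSide lives only in numpy's stubs
    sorter: NumpySorter = None,
) -> Union[np.ndarray, np.intp]:
    # numpy returns a scalar np.intp for scalar `value` and an ndarray otherwise.
    return arr.searchsorted(value, side=side, sorter=sorter)
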
17 changes: 14 additions & 3 deletions pandas/core/arrays/_mixins.py
@@ -2,6 +2,7 @@

from functools import wraps
from typing import (
TYPE_CHECKING,
Any,
Sequence,
TypeVar,
@@ -13,7 +14,9 @@
from pandas._libs import lib
from pandas._libs.arrays import NDArrayBacked
from pandas._typing import (
ArrayLike,
F,
NumpySorter,
PositionalIndexer2D,
Shape,
type_t,
@@ -45,6 +48,9 @@
"NDArrayBackedExtensionArrayT", bound="NDArrayBackedExtensionArray"
)

if TYPE_CHECKING:
from typing import Literal


def ravel_compat(meth: F) -> F:
"""
@@ -176,9 +182,14 @@ def _concat_same_type(
return to_concat[0]._from_backing_data(new_values) # type: ignore[arg-type]

@doc(ExtensionArray.searchsorted)
def searchsorted(self, value, side="left", sorter=None):
value = self._validate_searchsorted_value(value)
return self._ndarray.searchsorted(value, side=side, sorter=sorter)
def searchsorted(
self,
value: ArrayLike | object,

Member:
object -> Scalar?

Contributor (author):
That would violate the Liskov substitution principle. Our searchsorted needs to have the EAs reject an incompatible type. (See the Liskov illustration after this diff.)

Member:
I guess that would be because you've typed EA.searchsorted with object, so why not change it there?

Contributor (author):
Because in EA, we have to allow any type to be inside the EA, including non-EA types.

Member:
ArrayLike is a subtype of object, so it's not needed? Why does this not use the NumpyValueArrayLike alias?

side: Literal["left", "right"] = "left",
sorter: NumpySorter = None,
) -> np.ndarray:

Member:
also int?

Contributor (author):
Yes, np.intp in the next commit.

npvalue: np.ndarray = cast(np.ndarray, self._validate_searchsorted_value(value))
return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter)

def _validate_searchsorted_value(self, value):
return value
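
A minimal illustration, outside of pandas, of the Liskov point made in the `value: ArrayLike | object` thread above: mypy flags a subclass that narrows a parameter type accepted by its parent, which is why the subclass signatures keep the broad object type.

import numpy as np

class Base:
    def searchsorted(self, value: object) -> np.ndarray:
        raise NotImplementedError

class Narrowed(Base):
    # mypy reports: Argument 1 of "searchsorted" is incompatible with
    # supertype "Base" (a Liskov substitution principle violation).
    def searchsorted(self, value: np.ndarray) -> np.ndarray:
        return value
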
12 changes: 9 additions & 3 deletions pandas/core/arrays/base.py
@@ -26,6 +26,7 @@
ArrayLike,
Dtype,
FillnaOptions,
NumpySorter,
PositionalIndexer,
Shape,
)
@@ -812,7 +813,12 @@ def unique(self: ExtensionArrayT) -> ExtensionArrayT:
uniques = unique(self.astype(object))
return self._from_sequence(uniques, dtype=self.dtype)

def searchsorted(self, value, side="left", sorter=None):
def searchsorted(
self,
value: ArrayLike | object,
side: Literal["left", "right"] = "left",
sorter: NumpySorter = None,
) -> np.ndarray:

Member:
also int?

Contributor (author):
Yes, np.intp in next commit.

"""
Find indices where elements should be inserted to maintain order.

@@ -833,7 +839,7 @@

Parameters
----------
value : array_like
value : array_like or a single value
Values to insert into `self`.
side : {'left', 'right'}, optional
If 'left', the index of the first suitable location found is given.
@@ -1307,7 +1313,7 @@ def __hash__(self) -> int:
# ------------------------------------------------------------------------
# Non-Optimized Default Methods

def delete(self: ExtensionArrayT, loc) -> ExtensionArrayT:
def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT:

Member:
This is in the 'public' base EA class. It was added in #39405. @jbrockmendel, is this method public? Is it part of the EA interface? Does it need a docstring?

Member:
Good catch, I'll make a note to add a docstring.

indexer = np.delete(np.arange(len(self)), loc)
return self.take(indexer)

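For reference, a quick runtime illustration of the default ExtensionArray.delete() shown just above (output abridged in the comments):

import numpy as np
import pandas as pd

arr = pd.array([10, 20, 30, 40], dtype="Int64")
indexer = np.delete(np.arange(len(arr)), [1, 3])  # keep positions 0 and 2
print(arr.take(indexer))
# <IntegerArray>
# [10, 30]
# Length: 2, dtype: Int64
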
18 changes: 15 additions & 3 deletions pandas/core/arrays/period.py
@@ -7,6 +7,7 @@
Any,
Callable,
Sequence,
cast,
)

import numpy as np
@@ -39,8 +40,10 @@
)
from pandas._typing import (
AnyArrayLike,
ArrayLike,
Dtype,
NpDtype,
NumpySorter,
)
from pandas.util._decorators import (
cache_readonly,
@@ -74,6 +77,8 @@
import pandas.core.common as com

if TYPE_CHECKING:
from typing import Literal

from pandas.core.arrays import DatetimeArray

_shared_doc_kwargs = {
@@ -642,12 +647,19 @@ def astype(self, dtype, copy: bool = True):
return self.asfreq(dtype.freq)
return super().astype(dtype, copy=copy)

def searchsorted(self, value, side="left", sorter=None) -> np.ndarray:
value = self._validate_searchsorted_value(value).view("M8[ns]")
def searchsorted(
self,
value: ArrayLike | object,

Member:
I think, de facto, the scalar needs to be either Period or Period-castable.

Contributor (author):
Yes, but the ExtensionArray type has to allow any object, so the subtype does as well, and then it should raise if someone passes an incompatible object type.

Member:
> has to allow any object

Aren't we supposed to use Any for that?

> so the subtype does as well, and then it should raise if someone passes an incompatible object type

So if a subclass is more restrictive in what it accepts (without raising), we can't have that be reflected in the annotation?

Contributor (author):
> has to allow any object
>
> aren't we supposed to use Any for that?

If we use Any, that tells the type checker to skip type checking on the result. If we use object, it will continue type checking.

> so the subtype does as well, and then it should raise if someone passes an incompatible object type
>
> so if a subclass is more restrictive in what it accepts (without raising), we can't have that be reflected in the annotation?

Yes and no. The full signature has to accept the same type as the parent, and can widen that argument type, but not narrow it. However, I think we can use an overload signature to indicate that if a specific type is passed, it produces a result of that type. So the master signature could accept object, and then an overload can correspond to the narrower type. (A sketch of this overload approach follows the diff below.)

side: Literal["left", "right"] = "left",
sorter: NumpySorter = None,
) -> np.ndarray:
npvalue = cast(
np.ndarray, self._validate_searchsorted_value(value).view("M8[ns]")
)

Member:
Can we avoid the cast by typing _validate_searchsorted_value?

Contributor (author):
Removed in the next commit.


# Cast to M8 to get datetime-like NaT placement
m8arr = self._ndarray.view("M8[ns]")
return m8arr.searchsorted(value, side=side, sorter=sorter)
return m8arr.searchsorted(npvalue, side=side, sorter=sorter)

def fillna(self, value=None, method=None, limit=None) -> PeriodArray:
if method is not None:
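
A hypothetical sketch of the overload idea described in the thread above (illustrative class and types only, not the PR's code): the implementation keeps the broad object parameter so subclasses stay Liskov-compatible, while overloads advertise the narrower, well-typed case to callers.

from typing import Union, overload

import numpy as np

class SomeExtensionArray:
    @overload
    def searchsorted(self, value: int) -> np.intp: ...  # a known-good scalar type
    @overload
    def searchsorted(self, value: object) -> Union[np.intp, np.ndarray]: ...

    def searchsorted(self, value):
        # Broad runtime signature; incompatible objects would raise here.
        raise NotImplementedError
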
14 changes: 13 additions & 1 deletion pandas/core/arrays/sparse/array.py
@@ -7,6 +7,7 @@
import numbers
import operator
from typing import (
TYPE_CHECKING,
Any,
Callable,
Sequence,
@@ -25,8 +26,10 @@
)
from pandas._libs.tslibs import NaT
from pandas._typing import (
ArrayLike,
Dtype,
NpDtype,
NumpySorter,
Scalar,
)
from pandas.compat.numpy import function as nv
@@ -77,6 +80,9 @@

import pandas.io.formats.printing as printing

if TYPE_CHECKING:
from typing import Literal

# ----------------------------------------------------------------------------
# Array

@@ -996,7 +1002,13 @@ def _take_without_fill(self, indices) -> np.ndarray | SparseArray:

return taken

def searchsorted(self, v, side="left", sorter=None):
def searchsorted(
self,
v: ArrayLike | object,
side: Literal["left", "right"] = "left",
sorter: NumpySorter = None,
) -> np.ndarray:

msg = "searchsorted requires high memory usage."
warnings.warn(msg, PerformanceWarning, stacklevel=2)
if not is_scalar(v):
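
The sparse override above warns before densifying; a short demonstration of that behaviour (output shown as a comment, assuming current pandas semantics):

import warnings

import pandas as pd

sparse = pd.arrays.SparseArray([0, 0, 1, 2])
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    pos = sparse.searchsorted(1)
print(pos, [type(w.message).__name__ for w in caught])
# 2 ['PerformanceWarning']
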
9 changes: 8 additions & 1 deletion pandas/core/base.py
@@ -23,6 +23,8 @@
DtypeObj,
FrameOrSeries,
IndexLabel,
NumpySorter,
NumpyValueArrayLike,
Shape,
final,
)
@@ -1226,7 +1228,12 @@ def factorize(self, sort: bool = False, na_sentinel: int | None = -1):
"""

@doc(_shared_docs["searchsorted"], klass="Index")
def searchsorted(self, value, side="left", sorter=None) -> np.ndarray:
def searchsorted(
self,
value: NumpyValueArrayLike,
side: Literal["left", "right"] = "left",
sorter: NumpySorter = None,
) -> np.ndarray:

Member:
This can sometimes return an int?

Contributor (author):
searchsorted returns an np.ndarray according to the numpy docs and the signature in numpy 1.20.

Member:
Looks like that doc/signature is inaccurate, then:

>>> arr = np.arange(5)
>>> type(arr.searchsorted(3))
<class 'numpy.int64'>

Contributor (author):
So this seems to be an issue with numpy. This is using numpy 1.20.2:

import numpy as np
a = np.array([1, 3, 5])
# reveal_type(a)
s = a.searchsorted(4)
# reveal_type(s)
print(s, type(s), isinstance(s, np.ndarray))

Result of running this is:

2 <class 'numpy.int64'> False

If I uncomment the reveal_type lines, mypy reports the types of a and s as both np.ndarray.

So that gives us this question:

Should the pandas searchsorted API match what numpy documents (only array elements allowed), or what numpy implements?

I've submitted a numpy issue here: numpy/numpy#19160. (A short pandas-level demonstration follows this diff.)

return algorithms.searchsorted(self._values, value, side=side, sorter=sorter)

def drop_duplicates(self, keep="first"):
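
For context, the same scalar-return behaviour discussed above, seen through the pandas API (numpy/pandas of roughly that era; the exact integer class depends on the platform):

import pandas as pd

idx = pd.Index([1, 3, 5])
print(type(idx.searchsorted(4)))       # np.intp at runtime, e.g. <class 'numpy.int64'>
print(type(idx.searchsorted([2, 4])))  # <class 'numpy.ndarray'>
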
6 changes: 3 additions & 3 deletions pandas/core/indexes/base.py
@@ -3564,7 +3564,7 @@ def _get_fill_indexer_searchsorted(
"if index and target are monotonic"
)

side = "left" if method == "pad" else "right"
side: Literal["left", "right"] = "left" if method == "pad" else "right"

# find exact matches first (this simplifies the algorithm)
indexer = self.get_indexer(target)
@@ -5740,7 +5740,7 @@ def _maybe_cast_slice_bound(self, label, side: str_t, kind=no_default):

return label

def _searchsorted_monotonic(self, label, side: str_t = "left"):
def _searchsorted_monotonic(self, label, side: Literal["left", "right"] = "left"):
if self.is_monotonic_increasing:
return self.searchsorted(label, side=side)
elif self.is_monotonic_decreasing:
@@ -5754,7 +5754,7 @@ def _searchsorted_monotonic(self, label, side: str_t = "left"):

raise ValueError("index must be monotonic increasing or decreasing")

def get_slice_bound(self, label, side: str_t, kind=None) -> int:
def get_slice_bound(self, label, side: Literal["left", "right"], kind=None) -> int:
"""
Calculate slice bound that corresponds to given label.

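
The side arguments tightened to Literal["left", "right"] above behave as before at runtime; a small usage example of get_slice_bound for orientation:

import pandas as pd

idx = pd.Index([1, 3, 5, 7])
print(idx.get_slice_bound(5, side="left"))   # 2: first position where 5 could go
print(idx.get_slice_bound(5, side="right"))  # 3: position just past the existing 5
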
42 changes: 27 additions & 15 deletions pandas/core/indexes/extension.py
@@ -1,17 +1,21 @@
"""
Shared methods for Index subclasses backed by ExtensionArray.
"""
from __future__ import annotations

from typing import (
TYPE_CHECKING,
Hashable,
List,
Type,
TypeVar,
Union,
)

import numpy as np

from pandas._typing import ArrayLike
from pandas._typing import (
ArrayLike,
NumpySorter,
Scalar,
)
from pandas.compat.numpy import function as nv
from pandas.errors import AbstractMethodError
from pandas.util._decorators import (
Expand Down Expand Up @@ -45,6 +49,9 @@
from pandas.core.indexes.base import Index
from pandas.core.ops import get_op_result_name

if TYPE_CHECKING:
from typing import Literal

_T = TypeVar("_T", bound="NDArrayBackedExtensionIndex")


Expand Down Expand Up @@ -120,7 +127,7 @@ def method(self, *args, **kwargs):
return method


def inherit_names(names: List[str], delegate, cache: bool = False, wrap: bool = False):
def inherit_names(names: list[str], delegate, cache: bool = False, wrap: bool = False):
"""
Class decorator to pin attributes from an ExtensionArray to a Index subclass.

Expand Down Expand Up @@ -230,20 +237,20 @@ class ExtensionIndex(Index):
# The base class already passes through to _data:
# size, __len__, dtype

_data: Union[IntervalArray, NDArrayBackedExtensionArray]
_data: IntervalArray | NDArrayBackedExtensionArray

_data_cls: Union[
Type[Categorical],
Type[DatetimeArray],
Type[TimedeltaArray],
Type[PeriodArray],
Type[IntervalArray],
]
_data_cls: (
type[Categorical]
| type[DatetimeArray]
| type[TimedeltaArray]
| type[PeriodArray]
| type[IntervalArray]
)

@classmethod
def _simple_new(
cls,
array: Union[IntervalArray, NDArrayBackedExtensionArray],
array: IntervalArray | NDArrayBackedExtensionArray,
name: Hashable = None,
):
"""
Expand Down Expand Up @@ -292,7 +299,12 @@ def __getitem__(self, key):
deprecate_ndim_indexing(result)
return result

def searchsorted(self, value, side="left", sorter=None) -> np.ndarray:
def searchsorted(
self,
value: ArrayLike | Scalar,
side: Literal["left", "right"] = "left",
sorter: NumpySorter = None,
) -> np.ndarray:
# overriding IndexOpsMixin improves performance GH#38083
return self._data.searchsorted(value, side=side, sorter=sorter)
