Skip to content

Commit 6cbd87b

Browse files
authored
ENH: Implement fastpath for masked indexes in join (#50310)
* Implement fastpath for ea indexes in join
* Fix post processing
* Add comments
* Add whatsnew
* Add gh ref
* Refactor
1 parent d20c02b commit 6cbd87b

File tree

2 files changed

+27
-10
lines changed

2 files changed

+27
-10
lines changed

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -742,6 +742,7 @@ Performance improvements
742742
- Performance improvement in :meth:`Index.union` and :meth:`MultiIndex.union` when index contains duplicates (:issue:`48900`)
743743
- Performance improvement in :meth:`Series.rank` for pyarrow-backed dtypes (:issue:`50264`)
744744
- Performance improvement in :meth:`Series.fillna` for extension array dtypes (:issue:`49722`, :issue:`50078`)
745+
- Performance improvement in :meth:`Index.join`, :meth:`Index.intersection` and :meth:`Index.union` for masked dtypes when :class:`Index` is monotonic (:issue:`50310`)
745746
- Performance improvement for :meth:`Series.value_counts` with nullable dtype (:issue:`48338`)
746747
- Performance improvement for :class:`Series` constructor passing integer numpy array with nullable dtype (:issue:`48338`)
747748
- Performance improvement for :class:`DatetimeIndex` constructor passing a list (:issue:`48609`)

pandas/core/indexes/base.py

+26-10
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@
144144
validate_putmask,
145145
)
146146
from pandas.core.arrays import (
147+
BaseMaskedArray,
147148
Categorical,
148149
ExtensionArray,
149150
)
@@ -323,8 +324,8 @@ class Index(IndexOpsMixin, PandasObject):
323324
@final
324325
def _left_indexer_unique(self: _IndexT, other: _IndexT) -> npt.NDArray[np.intp]:
325326
# Caller is responsible for ensuring other.dtype == self.dtype
326-
sv = self._get_engine_target()
327-
ov = other._get_engine_target()
327+
sv = self._get_join_target()
328+
ov = other._get_join_target()
328329
# can_use_libjoin assures sv and ov are ndarrays
329330
sv = cast(np.ndarray, sv)
330331
ov = cast(np.ndarray, ov)
@@ -335,8 +336,8 @@ def _left_indexer(
335336
self: _IndexT, other: _IndexT
336337
) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
337338
# Caller is responsible for ensuring other.dtype == self.dtype
338-
sv = self._get_engine_target()
339-
ov = other._get_engine_target()
339+
sv = self._get_join_target()
340+
ov = other._get_join_target()
340341
# can_use_libjoin assures sv and ov are ndarrays
341342
sv = cast(np.ndarray, sv)
342343
ov = cast(np.ndarray, ov)
@@ -349,8 +350,8 @@ def _inner_indexer(
349350
self: _IndexT, other: _IndexT
350351
) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
351352
# Caller is responsible for ensuring other.dtype == self.dtype
352-
sv = self._get_engine_target()
353-
ov = other._get_engine_target()
353+
sv = self._get_join_target()
354+
ov = other._get_join_target()
354355
# can_use_libjoin assures sv and ov are ndarrays
355356
sv = cast(np.ndarray, sv)
356357
ov = cast(np.ndarray, ov)
@@ -363,8 +364,8 @@ def _outer_indexer(
363364
self: _IndexT, other: _IndexT
364365
) -> tuple[ArrayLike, npt.NDArray[np.intp], npt.NDArray[np.intp]]:
365366
# Caller is responsible for ensuring other.dtype == self.dtype
366-
sv = self._get_engine_target()
367-
ov = other._get_engine_target()
367+
sv = self._get_join_target()
368+
ov = other._get_join_target()
368369
# can_use_libjoin assures sv and ov are ndarrays
369370
sv = cast(np.ndarray, sv)
370371
ov = cast(np.ndarray, ov)
@@ -4680,8 +4681,11 @@ def _can_use_libjoin(self) -> bool:
46804681
Whether we can use the fastpaths implemented in _libs.join
46814682
"""
46824683
if type(self) is Index:
4683-
# excludes EAs
4684-
return isinstance(self.dtype, np.dtype)
4684+
# excludes EAs, but includes masked arrays; we get here with monotonic
4685+
# values only, meaning no NA
4686+
return isinstance(self.dtype, np.dtype) or isinstance(
4687+
self.values, BaseMaskedArray
4688+
)
46854689
return not is_interval_dtype(self.dtype)
46864690

46874691
# --------------------------------------------------------------------
@@ -4759,11 +4763,23 @@ def _get_engine_target(self) -> ArrayLike:
47594763
return self._values.astype(object)
47604764
return vals
47614765

4766+
def _get_join_target(self) -> ArrayLike:
4767+
"""
4768+
Get the ndarray or ExtensionArray that we can pass to the join
4769+
functions.
4770+
"""
4771+
if isinstance(self._values, BaseMaskedArray):
4772+
# This is only used if our array is monotonic, so no NAs present
4773+
return self._values._data
4774+
return self._get_engine_target()
4775+
47624776
def _from_join_target(self, result: np.ndarray) -> ArrayLike:
47634777
"""
47644778
Cast the ndarray returned from one of the libjoin.foo_indexer functions
47654779
back to type(self)._data.
47664780
"""
4781+
if isinstance(self.values, BaseMaskedArray):
4782+
return type(self.values)(result, np.zeros(result.shape, dtype=np.bool_))
47674783
return result
47684784

47694785
@doc(IndexOpsMixin._memory_usage)

0 commit comments

Comments
 (0)