Skip to content

Commit 3b484ee

Browse files
lukemanley
authored and
im-vinicius
committed
PERF: Indexing with pyarrow timestamp & duration dtypes (pandas-dev#53368)
* PERF: Indexing with pyarrow timestamp & duration dtypes
* whatsnew
1 parent 63365ac commit 3b484ee

File tree

2 files changed

+37
-2
lines changed

2 files changed

+37
-2
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -301,6 +301,7 @@ Performance improvements
301301
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.astype` when converting from a pyarrow timestamp or duration dtype to numpy (:issue:`53326`)
302302
- Performance improvement in :meth:`~arrays.ArrowExtensionArray.to_numpy` (:issue:`52525`)
303303
- Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArray` & :class:`arrays.FloatingArray` by avoiding unnecessary validation (:issue:`53013`)
304+
- Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`)
304305

305306
.. ---------------------------------------------------------------------------
306307
.. _whatsnew_210.bug_fixes:

pandas/core/indexes/base.py

+36-2
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@
151151
Categorical,
152152
ExtensionArray,
153153
)
154+
from pandas.core.arrays.datetimes import tz_to_dtype
154155
from pandas.core.arrays.string_ import StringArray
155156
from pandas.core.base import (
156157
IndexOpsMixin,
@@ -191,8 +192,11 @@
191192
MultiIndex,
192193
Series,
193194
)
194-
from pandas.core.arrays import PeriodArray
195-
195+
from pandas.core.arrays import (
196+
DatetimeArray,
197+
PeriodArray,
198+
TimedeltaArray,
199+
)
196200

197201
__all__ = ["Index"]
198202

@@ -835,6 +839,22 @@ def _engine(
835839
) -> libindex.IndexEngine | libindex.ExtensionEngine | libindex.MaskedIndexEngine:
836840
# For base class (object dtype) we get ObjectEngine
837841
target_values = self._get_engine_target()
842+
843+
if isinstance(self._values, ArrowExtensionArray) and self.dtype.kind in "Mm":
844+
import pyarrow as pa
845+
846+
pa_type = self._values._pa_array.type
847+
if pa.types.is_timestamp(pa_type):
848+
dtype = tz_to_dtype(pa_type.tz, pa_type.unit)
849+
target_values = self._values.astype(dtype)
850+
target_values = cast("DatetimeArray", target_values)
851+
return libindex.DatetimeEngine(target_values._ndarray)
852+
elif pa.types.is_duration(pa_type):
853+
dtype = np.dtype(f"m8[{pa_type.unit}]")
854+
target_values = self._values.astype(dtype)
855+
target_values = cast("TimedeltaArray", target_values)
856+
return libindex.TimedeltaEngine(target_values._ndarray)
857+
838858
if isinstance(target_values, ExtensionArray):
839859
if isinstance(target_values, (BaseMaskedArray, ArrowExtensionArray)):
840860
try:
@@ -5053,6 +5073,20 @@ def _get_engine_target(self) -> ArrayLike:
50535073
if isinstance(vals, StringArray):
50545074
# GH#45652 much more performant than ExtensionEngine
50555075
return vals._ndarray
5076+
if isinstance(vals, ArrowExtensionArray) and self.dtype.kind in "Mm":
5077+
import pyarrow as pa
5078+
5079+
pa_type = vals._pa_array.type
5080+
if pa.types.is_timestamp(pa_type):
5081+
dtype = tz_to_dtype(pa_type.tz, pa_type.unit)
5082+
vals = vals.astype(dtype)
5083+
vals = cast("DatetimeArray", vals)
5084+
return vals._ndarray.view("i8")
5085+
elif pa.types.is_duration(pa_type):
5086+
dtype = np.dtype(f"m8[{pa_type.unit}]")
5087+
vals = vals.astype(dtype)
5088+
vals = cast("TimedeltaArray", vals)
5089+
return vals._ndarray.view("i8")
50565090
if (
50575091
type(self) is Index
50585092
and isinstance(self._values, ExtensionArray)

0 commit comments

Comments
 (0)