Skip to content

Commit fd43d4b

Browse files
PERF: Do not init cache in RangeIndex.take (#53397)
* PERF: Do not init cache in RangeIndex.take
  Improve performance when passing an array to RangeIndex.take, DataFrame.loc, or DataFrame.iloc and the DataFrame is using a RangeIndex
* Explicitly raise ValueError in RangeIndex.take when allow_fill is True and fill_value is not None
* Add test_take_when_index_has_negative_step
* Override RangeIndex._find_common_type_compat
* Revert "Override RangeIndex._find_common_type_compat"
  This reverts commit 102644c.
* Change find_result_type to take a dtype instead of values
* Call _maybe_disallow_fill
* Move checks from test_cache_after_calling_loc_with_array to test_cache
1 parent 3a2ec64 commit fd43d4b

File tree

8 files changed

+127
-20
lines changed

8 files changed

+127
-20
lines changed

doc/source/whatsnew/v2.1.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -412,6 +412,7 @@ Performance improvements
412412
- Performance improvement in various :class:`MultiIndex` set and indexing operations (:issue:`53955`)
413413
- Performance improvement when doing various reshaping operations on :class:`arrays.IntegerArrays` & :class:`arrays.FloatingArray` by avoiding doing unnecessary validation (:issue:`53013`)
414414
- Performance improvement when indexing with pyarrow timestamp and duration dtypes (:issue:`53368`)
415+
- Performance improvement when passing an array to :meth:`RangeIndex.take`, :meth:`DataFrame.loc`, or :meth:`DataFrame.iloc` and the DataFrame is using a RangeIndex (:issue:`53387`)
415416
-
416417

417418
.. ---------------------------------------------------------------------------

pandas/core/dtypes/cast.py

+13-15
Original file line number | Diff line number | Diff line change
@@ -1266,17 +1266,17 @@ def _ensure_nanosecond_dtype(dtype: DtypeObj) -> None:
12661266

12671267
# TODO: other value-dependent functions to standardize here include
12681268
# Index._find_common_type_compat
1269-
def find_result_type(left: ArrayLike, right: Any) -> DtypeObj:
1269+
def find_result_type(left_dtype: DtypeObj, right: Any) -> DtypeObj:
12701270
"""
1271-
Find the type/dtype for a the result of an operation between these objects.
1271+
Find the type/dtype for the result of an operation between objects.
12721272
1273-
This is similar to find_common_type, but looks at the objects instead
1274-
of just their dtypes. This can be useful in particular when one of the
1275-
objects does not have a `dtype`.
1273+
This is similar to find_common_type, but looks at the right object instead
1274+
of just its dtype. This can be useful in particular when the right
1275+
object does not have a `dtype`.
12761276
12771277
Parameters
12781278
----------
1279-
left : np.ndarray or ExtensionArray
1279+
left_dtype : np.dtype or ExtensionDtype
12801280
right : Any
12811281
12821282
Returns
@@ -1291,26 +1291,24 @@ def find_result_type(left: ArrayLike, right: Any) -> DtypeObj:
12911291
new_dtype: DtypeObj
12921292

12931293
if (
1294-
isinstance(left, np.ndarray)
1295-
and left.dtype.kind in "iuc"
1294+
isinstance(left_dtype, np.dtype)
1295+
and left_dtype.kind in "iuc"
12961296
and (lib.is_integer(right) or lib.is_float(right))
12971297
):
12981298
# e.g. with int8 dtype and right=512, we want to end up with
12991299
# np.int16, whereas infer_dtype_from(512) gives np.int64,
13001300
# which will make us upcast too far.
1301-
if lib.is_float(right) and right.is_integer() and left.dtype.kind != "f":
1301+
if lib.is_float(right) and right.is_integer() and left_dtype.kind != "f":
13021302
right = int(right)
1303+
new_dtype = np.result_type(left_dtype, right)
13031304

1304-
new_dtype = np.result_type(left, right)
1305-
1306-
elif is_valid_na_for_dtype(right, left.dtype):
1305+
elif is_valid_na_for_dtype(right, left_dtype):
13071306
# e.g. IntervalDtype[int] and None/np.nan
1308-
new_dtype = ensure_dtype_can_hold_na(left.dtype)
1307+
new_dtype = ensure_dtype_can_hold_na(left_dtype)
13091308

13101309
else:
13111310
dtype, _ = infer_dtype_from(right)
1312-
1313-
new_dtype = find_common_type([left.dtype, dtype])
1311+
new_dtype = find_common_type([left_dtype, dtype])
13141312

13151313
return new_dtype
13161314

pandas/core/indexes/base.py

+5-2
Original file line number | Diff line number | Diff line change
@@ -1162,7 +1162,6 @@ def take(
11621162
taken = values.take(
11631163
indices, allow_fill=allow_fill, fill_value=self._na_value
11641164
)
1165-
# _constructor so RangeIndex-> Index with an int64 dtype
11661165
return self._constructor._simple_new(taken, name=self.name)
11671166

11681167
@final
@@ -5560,6 +5559,10 @@ def equals(self, other: Any) -> bool:
55605559
if not isinstance(other, Index):
55615560
return False
55625561

5562+
if len(self) != len(other):
5563+
# quickly return if the lengths are different
5564+
return False
5565+
55635566
if is_object_dtype(self.dtype) and not is_object_dtype(other.dtype):
55645567
# if other is not object, use other's logic for coercion
55655568
return other.equals(self)
@@ -6290,7 +6293,7 @@ def _find_common_type_compat(self, target) -> DtypeObj:
62906293
):
62916294
return _dtype_obj
62926295

6293-
dtype = find_result_type(self._values, target)
6296+
dtype = find_result_type(self.dtype, target)
62946297
dtype = common_dtype_categorical_compat([self, target], dtype)
62956298
return dtype
62966299

pandas/core/indexes/range.py

+42
Original file line number | Diff line number | Diff line change
@@ -50,6 +50,7 @@
5050

5151
if TYPE_CHECKING:
5252
from pandas._typing import (
53+
Axis,
5354
Dtype,
5455
NaPosition,
5556
Self,
@@ -1105,3 +1106,44 @@ def _arith_method(self, other, op):
11051106
except (ValueError, TypeError, ZeroDivisionError):
11061107
# test_arithmetic_explicit_conversions
11071108
return super()._arith_method(other, op)
1109+
1110+
def take(
1111+
self,
1112+
indices,
1113+
axis: Axis = 0,
1114+
allow_fill: bool = True,
1115+
fill_value=None,
1116+
**kwargs,
1117+
):
1118+
if kwargs:
1119+
nv.validate_take((), kwargs)
1120+
if is_scalar(indices):
1121+
raise TypeError("Expected indices to be array-like")
1122+
indices = ensure_platform_int(indices)
1123+
1124+
# raise an exception if allow_fill is True and fill_value is not None
1125+
self._maybe_disallow_fill(allow_fill, fill_value, indices)
1126+
1127+
if len(indices) == 0:
1128+
taken = np.array([], dtype=self.dtype)
1129+
else:
1130+
ind_max = indices.max()
1131+
if ind_max >= len(self):
1132+
raise IndexError(
1133+
f"index {ind_max} is out of bounds for axis 0 with size {len(self)}"
1134+
)
1135+
ind_min = indices.min()
1136+
if ind_min < -len(self):
1137+
raise IndexError(
1138+
f"index {ind_min} is out of bounds for axis 0 with size {len(self)}"
1139+
)
1140+
taken = indices.astype(self.dtype, casting="safe")
1141+
if ind_min < 0:
1142+
taken %= len(self)
1143+
if self.step != 1:
1144+
taken *= self.step
1145+
if self.start != 0:
1146+
taken += self.start
1147+
1148+
# _constructor so RangeIndex-> Index with an int64 dtype
1149+
return self._constructor._simple_new(taken, name=self.name)

pandas/core/internals/blocks.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -449,7 +449,7 @@ def coerce_to_target_dtype(self, other) -> Block:
449449
we can also safely try to coerce to the same dtype
450450
and will receive the same block
451451
"""
452-
new_dtype = find_result_type(self.values, other)
452+
new_dtype = find_result_type(self.values.dtype, other)
453453

454454
return self.astype(new_dtype, copy=False)
455455

pandas/tests/indexes/numeric/test_setops.py

+11
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,17 @@ def test_range_float_union_dtype(self):
7474
result = other.union(index)
7575
tm.assert_index_equal(result, expected)
7676

77+
def test_range_uint64_union_dtype(self):
78+
# https://github.com/pandas-dev/pandas/issues/26778
79+
index = RangeIndex(start=0, stop=3)
80+
other = Index([0, 10], dtype=np.uint64)
81+
result = index.union(other)
82+
expected = Index([0, 1, 2, 10], dtype=object)
83+
tm.assert_index_equal(result, expected)
84+
85+
result = other.union(index)
86+
tm.assert_index_equal(result, expected)
87+
7788
def test_float64_index_difference(self):
7889
# https://github.com/pandas-dev/pandas/issues/35217
7990
float_index = Index([1.0, 2, 3])

pandas/tests/indexes/ranges/test_indexing.py

+42
Original file line number | Diff line number | Diff line change
@@ -76,10 +76,52 @@ def test_take_fill_value(self):
7676
with pytest.raises(ValueError, match=msg):
7777
idx.take(np.array([1, 0, -5]), fill_value=True)
7878

79+
def test_take_raises_index_error(self):
80+
idx = RangeIndex(1, 4, name="xxx")
81+
7982
msg = "index -5 is out of bounds for (axis 0 with )?size 3"
8083
with pytest.raises(IndexError, match=msg):
8184
idx.take(np.array([1, -5]))
8285

86+
msg = "index -4 is out of bounds for (axis 0 with )?size 3"
87+
with pytest.raises(IndexError, match=msg):
88+
idx.take(np.array([1, -4]))
89+
90+
# no errors
91+
result = idx.take(np.array([1, -3]))
92+
expected = Index([2, 1], dtype=np.int64, name="xxx")
93+
tm.assert_index_equal(result, expected)
94+
95+
def test_take_accepts_empty_array(self):
96+
idx = RangeIndex(1, 4, name="foo")
97+
result = idx.take(np.array([]))
98+
expected = Index([], dtype=np.int64, name="foo")
99+
tm.assert_index_equal(result, expected)
100+
101+
# empty index
102+
idx = RangeIndex(0, name="foo")
103+
result = idx.take(np.array([]))
104+
expected = Index([], dtype=np.int64, name="foo")
105+
tm.assert_index_equal(result, expected)
106+
107+
def test_take_accepts_non_int64_array(self):
108+
idx = RangeIndex(1, 4, name="foo")
109+
result = idx.take(np.array([2, 1], dtype=np.uint32))
110+
expected = Index([3, 2], dtype=np.int64, name="foo")
111+
tm.assert_index_equal(result, expected)
112+
113+
def test_take_when_index_has_step(self):
114+
idx = RangeIndex(1, 11, 3, name="foo") # [1, 4, 7, 10]
115+
result = idx.take(np.array([1, 0, -1, -4]))
116+
expected = Index([4, 1, 10, 1], dtype=np.int64, name="foo")
117+
tm.assert_index_equal(result, expected)
118+
119+
def test_take_when_index_has_negative_step(self):
120+
idx = RangeIndex(11, -4, -2, name="foo") # [11, 9, 7, 5, 3, 1, -1, -3]
121+
result = idx.take(np.array([1, 0, -1, -8]))
122+
expected = Index([9, 11, -3, 11], dtype=np.int64, name="foo")
123+
tm.assert_index_equal(result, expected)
124+
83125

84126
class TestWhere:
85127
def test_where_putmask_range_cast(self):

pandas/tests/indexes/ranges/test_range.py

+12-2
Original file line number | Diff line number | Diff line change
@@ -210,7 +210,7 @@ def test_dtype(self, simple_index):
210210
assert index.dtype == np.int64
211211

212212
def test_cache(self):
213-
# GH 26565, GH26617, GH35432
213+
# GH 26565, GH26617, GH35432, GH53387
214214
# This test checks whether _cache has been set.
215215
# Calling RangeIndex._cache["_data"] creates an int64 array of the same length
216216
# as the RangeIndex and stores it in _cache.
@@ -264,11 +264,21 @@ def test_cache(self):
264264
df.iloc[5:10]
265265
assert idx._cache == {}
266266

267+
# after calling take, _cache may contain other keys, but not "_data"
268+
idx.take([3, 0, 1])
269+
assert "_data" not in idx._cache
270+
271+
df.loc[[50]]
272+
assert "_data" not in idx._cache
273+
274+
df.iloc[[5, 6, 7, 8, 9]]
275+
assert "_data" not in idx._cache
276+
267277
# idx._cache should contain a _data entry after call to idx._data
268278
idx._data
269279
assert isinstance(idx._data, np.ndarray)
270280
assert idx._data is idx._data # check cached value is reused
271-
assert len(idx._cache) == 1
281+
assert "_data" in idx._cache
272282
expected = np.arange(0, 100, 10, dtype="int64")
273283
tm.assert_numpy_array_equal(idx._cache["_data"], expected)
274284

0 commit comments

Comments
 (0)