Skip to content

Commit 1ced878

Browse files
PERF: further improve take (reindex/unstack) for ArrayManager (#40300)
1 parent d5fc3ac commit 1ced878

File tree

2 files changed

+57
-38
lines changed

2 files changed

+57
-38
lines changed

pandas/core/array_algos/take.py

+38 -34
Original file line number | Diff line number | Diff line change
@@ -67,6 +67,9 @@ def take_nd(
6767
This dispatches to ``take`` defined on ExtensionArrays. It does not
6868
currently dispatch to ``SparseArray.take`` for sparse ``arr``.
6969
70+
Note: this function assumes that the indexer is a valid(ated) indexer with
71+
no out of bound indices.
72+
7073
Parameters
7174
----------
7275
arr : np.ndarray or ExtensionArray
@@ -113,8 +116,13 @@ def _take_nd_ndarray(
113116
allow_fill: bool,
114117
) -> np.ndarray:
115118

119+
if indexer is None:
120+
indexer = np.arange(arr.shape[axis], dtype=np.int64)
121+
dtype, fill_value = arr.dtype, arr.dtype.type()
122+
else:
123+
indexer = ensure_int64(indexer, copy=False)
116124
indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
117-
arr, indexer, axis, out, fill_value, allow_fill
125+
arr, indexer, out, fill_value, allow_fill
118126
)
119127

120128
flip_order = False
@@ -159,13 +167,17 @@ def take_1d(
159167
allow_fill: bool = True,
160168
) -> ArrayLike:
161169
"""
162-
Specialized version for 1D arrays. Differences compared to take_nd:
170+
Specialized version for 1D arrays. Differences compared to `take_nd`:
163171
164-
- Assumes input (arr, indexer) has already been converted to numpy array / EA
172+
- Assumes input array has already been converted to numpy array / EA
173+
- Assumes indexer is already guaranteed to be int64 dtype ndarray
165174
- Only works for 1D arrays
166175
167176
To ensure the lowest possible overhead.
168177
178+
Note: similarly to `take_nd`, this function assumes that the indexer is
179+
a valid(ated) indexer with no out of bound indices.
180+
169181
TODO(ArrayManager): mainly useful for ArrayManager, otherwise can potentially
170182
be removed again if we don't end up with ArrayManager.
171183
"""
@@ -180,8 +192,11 @@ def take_1d(
180192
allow_fill=allow_fill,
181193
)
182194

195+
if not allow_fill:
196+
return arr.take(indexer)
197+
183198
indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value(
184-
arr, indexer, 0, None, fill_value, allow_fill
199+
arr, indexer, None, fill_value, allow_fill
185200
)
186201

187202
# at this point, it's guaranteed that dtype can hold both the arr values
@@ -502,43 +517,32 @@ def _take_2d_multi_object(
502517

503518
def _take_preprocess_indexer_and_fill_value(
504519
arr: np.ndarray,
505-
indexer: Optional[np.ndarray],
506-
axis: int,
520+
indexer: np.ndarray,
507521
out: Optional[np.ndarray],
508522
fill_value,
509523
allow_fill: bool,
510524
):
511525
mask_info = None
512526

513-
if indexer is None:
514-
indexer = np.arange(arr.shape[axis], dtype=np.int64)
527+
if not allow_fill:
515528
dtype, fill_value = arr.dtype, arr.dtype.type()
529+
mask_info = None, False
516530
else:
517-
indexer = ensure_int64(indexer, copy=False)
518-
if not allow_fill:
519-
dtype, fill_value = arr.dtype, arr.dtype.type()
520-
mask_info = None, False
521-
else:
522-
# check for promotion based on types only (do this first because
523-
# it's faster than computing a mask)
524-
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
525-
if dtype != arr.dtype and (out is None or out.dtype != dtype):
526-
# check if promotion is actually required based on indexer
527-
mask = indexer == -1
528-
# error: Item "bool" of "Union[Any, bool]" has no attribute "any"
529-
# [union-attr]
530-
needs_masking = mask.any() # type: ignore[union-attr]
531-
# error: Incompatible types in assignment (expression has type
532-
# "Tuple[Union[Any, bool], Any]", variable has type
533-
# "Optional[Tuple[None, bool]]")
534-
mask_info = mask, needs_masking # type: ignore[assignment]
535-
if needs_masking:
536-
if out is not None and out.dtype != dtype:
537-
raise TypeError("Incompatible type for fill_value")
538-
else:
539-
# if not, then depromote, set fill_value to dummy
540-
# (it won't be used but we don't want the cython code
541-
# to crash when trying to cast it to dtype)
542-
dtype, fill_value = arr.dtype, arr.dtype.type()
531+
# check for promotion based on types only (do this first because
532+
# it's faster than computing a mask)
533+
dtype, fill_value = maybe_promote(arr.dtype, fill_value)
534+
if dtype != arr.dtype and (out is None or out.dtype != dtype):
535+
# check if promotion is actually required based on indexer
536+
mask = indexer == -1
537+
needs_masking = mask.any()
538+
mask_info = mask, needs_masking
539+
if needs_masking:
540+
if out is not None and out.dtype != dtype:
541+
raise TypeError("Incompatible type for fill_value")
542+
else:
543+
# if not, then depromote, set fill_value to dummy
544+
# (it won't be used but we don't want the cython code
545+
# to crash when trying to cast it to dtype)
546+
dtype, fill_value = arr.dtype, arr.dtype.type()
543547

544548
return indexer, dtype, fill_value, mask_info

pandas/core/internals/array_manager.py

+19 -4
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,7 @@
3232
soft_convert_objects,
3333
)
3434
from pandas.core.dtypes.common import (
35+
ensure_int64,
3536
is_bool_dtype,
3637
is_datetime64_ns_dtype,
3738
is_dtype_equal,
@@ -1005,11 +1006,16 @@ def _reindex_indexer(
10051006

10061007
else:
10071008
validate_indices(indexer, len(self._axes[0]))
1009+
indexer = ensure_int64(indexer)
1010+
if (indexer == -1).any():
1011+
allow_fill = True
1012+
else:
1013+
allow_fill = False
10081014
new_arrays = [
10091015
take_1d(
10101016
arr,
10111017
indexer,
1012-
allow_fill=True,
1018+
allow_fill=allow_fill,
10131019
fill_value=fill_value,
10141020
# if fill_value is not None else blk.fill_value
10151021
)
@@ -1080,15 +1086,24 @@ def unstack(self, unstacker, fill_value) -> ArrayManager:
10801086
unstacked : BlockManager
10811087
"""
10821088
indexer, _ = unstacker._indexer_and_to_sort
1083-
new_indexer = np.full(unstacker.mask.shape, -1)
1084-
new_indexer[unstacker.mask] = indexer
1089+
if unstacker.mask.all():
1090+
new_indexer = indexer
1091+
allow_fill = False
1092+
else:
1093+
new_indexer = np.full(unstacker.mask.shape, -1)
1094+
new_indexer[unstacker.mask] = indexer
1095+
allow_fill = True
10851096
new_indexer2D = new_indexer.reshape(*unstacker.full_shape)
1097+
new_indexer2D = ensure_int64(new_indexer2D)
10861098

10871099
new_arrays = []
10881100
for arr in self.arrays:
10891101
for i in range(unstacker.full_shape[1]):
10901102
new_arr = take_1d(
1091-
arr, new_indexer2D[:, i], allow_fill=True, fill_value=fill_value
1103+
arr,
1104+
new_indexer2D[:, i],
1105+
allow_fill=allow_fill,
1106+
fill_value=fill_value,
10921107
)
10931108
new_arrays.append(new_arr)
10941109

0 commit comments

Comments
 (0)