-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
PERF: further improve take (reindex/unstack) for ArrayManager #40300
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
072118a
85cccb2
f8adf7b
cb76a91
5768466
f85ebea
53fef41
12adf4e
35f933f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -82,8 +82,13 @@ def _take_nd_ndarray( | |
allow_fill: bool, | ||
) -> np.ndarray: | ||
|
||
if indexer is None: | ||
indexer = np.arange(arr.shape[axis], dtype=np.int64) | ||
dtype, fill_value = arr.dtype, arr.dtype.type() | ||
else: | ||
indexer = ensure_int64(indexer, copy=False) | ||
indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( | ||
arr, indexer, axis, out, fill_value, allow_fill | ||
arr, indexer, out, fill_value, allow_fill | ||
) | ||
|
||
flip_order = False | ||
|
@@ -130,7 +135,8 @@ def take_1d( | |
""" | ||
Specialized version for 1D arrays. Differences compared to take_nd: | ||
|
||
- Assumes input (arr, indexer) has already been converted to numpy array / EA | ||
- Assumes input array has already been converted to numpy array / EA | ||
jreback marked this conversation as resolved.
Show resolved
Hide resolved
|
||
- Assumes indexer is already guaranteed to be int64 dtype ndarray | ||
- Only works for 1D arrays | ||
|
||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
To ensure the lowest possible overhead. | ||
|
@@ -142,8 +148,11 @@ def take_1d( | |
# ExtensionArray -> dispatch to their method | ||
return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) | ||
|
||
if not allow_fill: | ||
jbrockmendel marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return np.take(arr, indexer) | ||
|
||
indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( | ||
arr, indexer, 0, None, fill_value, allow_fill | ||
arr, indexer, None, fill_value, allow_fill | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. do we ever get to _take_preprocess_indexer_and_fill_value with allow_fill=False? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, that's currently possible in |
||
) | ||
|
||
# at this point, it's guaranteed that dtype can hold both the arr values | ||
|
@@ -463,37 +472,31 @@ def _take_2d_multi_object( | |
def _take_preprocess_indexer_and_fill_value( | ||
arr: np.ndarray, | ||
indexer: Optional[np.ndarray], | ||
axis: int, | ||
out: Optional[np.ndarray], | ||
fill_value, | ||
allow_fill: bool, | ||
): | ||
mask_info = None | ||
|
||
if indexer is None: | ||
indexer = np.arange(arr.shape[axis], dtype=np.int64) | ||
if not allow_fill: | ||
dtype, fill_value = arr.dtype, arr.dtype.type() | ||
mask_info = None, False | ||
else: | ||
indexer = ensure_int64(indexer, copy=False) | ||
if not allow_fill: | ||
dtype, fill_value = arr.dtype, arr.dtype.type() | ||
mask_info = None, False | ||
else: | ||
# check for promotion based on types only (do this first because | ||
# it's faster than computing a mask) | ||
dtype, fill_value = maybe_promote(arr.dtype, fill_value) | ||
if dtype != arr.dtype and (out is None or out.dtype != dtype): | ||
# check if promotion is actually required based on indexer | ||
mask = indexer == -1 | ||
needs_masking = mask.any() | ||
mask_info = mask, needs_masking | ||
if needs_masking: | ||
if out is not None and out.dtype != dtype: | ||
raise TypeError("Incompatible type for fill_value") | ||
else: | ||
# if not, then depromote, set fill_value to dummy | ||
# (it won't be used but we don't want the cython code | ||
# to crash when trying to cast it to dtype) | ||
dtype, fill_value = arr.dtype, arr.dtype.type() | ||
# check for promotion based on types only (do this first because | ||
# it's faster than computing a mask) | ||
dtype, fill_value = maybe_promote(arr.dtype, fill_value) | ||
if dtype != arr.dtype and (out is None or out.dtype != dtype): | ||
# check if promotion is actually required based on indexer | ||
mask = indexer == -1 | ||
needs_masking = mask.any() | ||
mask_info = mask, needs_masking | ||
if needs_masking: | ||
if out is not None and out.dtype != dtype: | ||
raise TypeError("Incompatible type for fill_value") | ||
else: | ||
# if not, then depromote, set fill_value to dummy | ||
# (it won't be used but we don't want the cython code | ||
# to crash when trying to cast it to dtype) | ||
dtype, fill_value = arr.dtype, arr.dtype.type() | ||
Comment on lines
+531
to
+546
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nothing changed in those lines of the code; it's only a change in indentation (which github does't seem to detect nicely). Only the first lines:
have been moved out into |
||
|
||
return indexer, dtype, fill_value, mask_info |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -34,6 +34,7 @@ | |
soft_convert_objects, | ||
) | ||
from pandas.core.dtypes.common import ( | ||
ensure_int64, | ||
is_bool_dtype, | ||
is_datetime64_ns_dtype, | ||
is_dtype_equal, | ||
|
@@ -205,7 +206,7 @@ def get_dtypes(self): | |
def __repr__(self) -> str: | ||
output = type(self).__name__ | ||
output += f"\nIndex: {self._axes[0]}" | ||
if self.ndim == 1: | ||
if self.ndim == 2: | ||
output += f"\nColumns: {self._axes[1]}" | ||
output += f"\n{len(self.arrays)} arrays:" | ||
for arr in self.arrays: | ||
|
@@ -1075,15 +1076,24 @@ def unstack(self, unstacker, fill_value) -> ArrayManager: | |
unstacked : BlockManager | ||
""" | ||
indexer, _ = unstacker._indexer_and_to_sort | ||
new_indexer = np.full(unstacker.mask.shape, -1) | ||
new_indexer[unstacker.mask] = indexer | ||
if np.all(unstacker.mask): | ||
new_indexer = indexer | ||
allow_fill = False | ||
else: | ||
new_indexer = np.full(unstacker.mask.shape, -1) | ||
new_indexer[unstacker.mask] = indexer | ||
allow_fill = True | ||
new_indexer2D = new_indexer.reshape(*unstacker.full_shape) | ||
new_indexer2D = ensure_int64(new_indexer2D) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. can this chunk of code be made to match Block._unstack (or vice-versa) more closely? e.g. on L1094 you check unstacker.mask.all() which I think is related to the comment in Block._unstack There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Assuming you are referring to the EA-specific
As far as I understand the Block version, I don't think that's related. For Block/BlockManager.unstack, that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
OK.
at some point we should make sure that one of us (active maintainers, not just the two of us) have a handle on this. |
||
|
||
new_arrays = [] | ||
for arr in self.arrays: | ||
for i in range(unstacker.full_shape[1]): | ||
new_arr = take_1d( | ||
arr, new_indexer2D[:, i], allow_fill=True, fill_value=fill_value | ||
arr, | ||
new_indexer2D[:, i], | ||
allow_fill=allow_fill, | ||
fill_value=fill_value, | ||
) | ||
new_arrays.append(new_arr) | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This part is copied from
_take_preprocess_indexer_and_fill_value
, so we can avoid doing that intake_1d