From 072118a47730c103ada0bd698d5ea38b2261aa8d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 8 Mar 2021 10:05:12 +0100 Subject: [PATCH 1/5] PERF: further improve take (reindex/unstack) for ArrayManager --- pandas/core/array_algos/take.py | 57 ++++++++++++++------------ pandas/core/internals/array_manager.py | 18 ++++++-- 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 054497089c5ab..b06145aca51d2 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -82,8 +82,13 @@ def _take_nd_ndarray( allow_fill: bool, ) -> np.ndarray: + if indexer is None: + indexer = np.arange(arr.shape[axis], dtype=np.int64) + dtype, fill_value = arr.dtype, arr.dtype.type() + else: + indexer = ensure_int64(indexer, copy=False) indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( - arr, indexer, axis, out, fill_value, allow_fill + arr, indexer, out, fill_value, allow_fill ) flip_order = False @@ -130,7 +135,8 @@ def take_1d( """ Specialized version for 1D arrays. Differences compared to take_nd: - - Assumes input (arr, indexer) has already been converted to numpy array / EA + - Assumes input array has already been converted to numpy array / EA + - Assumes indexer is already guaranteed to be int64 dtype ndarray - Only works for 1D arrays To ensure the lowest possible overhead. @@ -142,8 +148,11 @@ def take_1d( # ExtensionArray -> dispatch to their method return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + if not allow_fill: + return np.take(arr, indexer) + indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( - arr, indexer, 0, None, fill_value, allow_fill + arr, indexer, None, fill_value, allow_fill ) # at this point, it's guaranteed that dtype can hold both the arr values @@ -463,37 +472,31 @@ def _take_2d_multi_object( def _take_preprocess_indexer_and_fill_value( arr: np.ndarray, indexer: Optional[np.ndarray], - axis: int, out: Optional[np.ndarray], fill_value, allow_fill: bool, ): mask_info = None - if indexer is None: - indexer = np.arange(arr.shape[axis], dtype=np.int64) + if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() + mask_info = None, False else: - indexer = ensure_int64(indexer, copy=False) - if not allow_fill: - dtype, fill_value = arr.dtype, arr.dtype.type() - mask_info = None, False - else: - # check for promotion based on types only (do this first because - # it's faster than computing a mask) - dtype, fill_value = maybe_promote(arr.dtype, fill_value) - if dtype != arr.dtype and (out is None or out.dtype != dtype): - # check if promotion is actually required based on indexer - mask = indexer == -1 - needs_masking = mask.any() - mask_info = mask, needs_masking - if needs_masking: - if out is not None and out.dtype != dtype: - raise TypeError("Incompatible type for fill_value") - else: - # if not, then depromote, set fill_value to dummy - # (it won't be used but we don't want the cython code - # to crash when trying to cast it to dtype) - dtype, fill_value = arr.dtype, arr.dtype.type() + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype and (out is None or out.dtype != dtype): + # check if promotion is actually required based on indexer + mask = indexer == -1 + needs_masking = mask.any() + mask_info = mask, needs_masking + if needs_masking: + if out is not None and out.dtype != dtype: + raise TypeError("Incompatible type for fill_value") + else: + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() return indexer, dtype, fill_value, mask_info diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 97a2d4037bf26..f6e0c755a0bd2 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -34,6 +34,7 @@ soft_convert_objects, ) from pandas.core.dtypes.common import ( + ensure_int64, is_bool_dtype, is_datetime64_ns_dtype, is_dtype_equal, @@ -205,7 +206,7 @@ def get_dtypes(self): def __repr__(self) -> str: output = type(self).__name__ output += f"\nIndex: {self._axes[0]}" - if self.ndim == 1: + if self.ndim == 2: output += f"\nColumns: {self._axes[1]}" output += f"\n{len(self.arrays)} arrays:" for arr in self.arrays: @@ -1075,15 +1076,24 @@ def unstack(self, unstacker, fill_value) -> ArrayManager: unstacked : BlockManager """ indexer, _ = unstacker._indexer_and_to_sort - new_indexer = np.full(unstacker.mask.shape, -1) - new_indexer[unstacker.mask] = indexer + if np.all(unstacker.mask): + new_indexer = indexer + allow_fill = False + else: + new_indexer = np.full(unstacker.mask.shape, -1) + new_indexer[unstacker.mask] = indexer + allow_fill = True new_indexer2D = new_indexer.reshape(*unstacker.full_shape) + new_indexer2D = ensure_int64(new_indexer2D) new_arrays = [] for arr in self.arrays: for i in range(unstacker.full_shape[1]): new_arr = take_1d( - arr, new_indexer2D[:, i], allow_fill=True, fill_value=fill_value + arr, + new_indexer2D[:, i], + allow_fill=allow_fill, + fill_value=fill_value, ) new_arrays.append(new_arr) From 85cccb23fb902287e1e8fe219ab6b1b0dcd2204d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 8 Mar 2021 10:30:54 +0100 Subject: [PATCH 2/5] add check for reindex_indexer axis=0 as well --- pandas/core/array_algos/take.py | 2 +- pandas/core/internals/array_manager.py | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index b06145aca51d2..f7e2f7dc7fadf 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -149,7 +149,7 @@ def take_1d( return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) if not allow_fill: - return np.take(arr, indexer) + return arr.take(indexer) indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( arr, indexer, None, fill_value, allow_fill diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index f6e0c755a0bd2..5d0968abd24ff 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -1000,11 +1000,16 @@ def _reindex_indexer( else: validate_indices(indexer, len(self._axes[0])) + indexer = ensure_int64(indexer) + if (indexer == -1).any(): + allow_fill = True + else: + allow_fill = False new_arrays = [ take_1d( arr, indexer, - allow_fill=True, + allow_fill=allow_fill, fill_value=fill_value, # if fill_value is not None else blk.fill_value ) @@ -1076,7 +1081,7 @@ def unstack(self, unstacker, fill_value) -> ArrayManager: unstacked : BlockManager """ indexer, _ = unstacker._indexer_and_to_sort - if np.all(unstacker.mask): + if unstacker.mask.all(): new_indexer = indexer allow_fill = False else: From f8adf7bcaa05418b6685d288b32ca0b6eabab599 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 8 Mar 2021 11:56:58 +0100 Subject: [PATCH 3/5] typing --- pandas/core/array_algos/take.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index f7e2f7dc7fadf..e1c89a972c675 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -471,7 +471,7 @@ def _take_2d_multi_object( def _take_preprocess_indexer_and_fill_value( arr: np.ndarray, - indexer: Optional[np.ndarray], + indexer: np.ndarray, out: Optional[np.ndarray], fill_value, allow_fill: bool, From 57684663ed2a2647229e26e51bba89bd7ae4b29c Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Mar 2021 11:54:24 +0100 Subject: [PATCH 4/5] add note about validated indexer --- pandas/core/array_algos/take.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index ff00827edaa38..61463fc6c8b36 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -36,6 +36,9 @@ def take_nd( This dispatches to ``take`` defined on ExtensionArrays. It does not currently dispatch to ``SparseArray.take`` for sparse ``arr``. + Note: this function assumes that the indexer is a valid(ated) indexer with + no out of bound indices. + Parameters ---------- arr : np.ndarray or ExtensionArray @@ -133,7 +136,7 @@ def take_1d( allow_fill: bool = True, ) -> ArrayLike: """ - Specialized version for 1D arrays. Differences compared to take_nd: + Specialized version for 1D arrays. Differences compared to `take_nd`: - Assumes input array has already been converted to numpy array / EA - Assumes indexer is already guaranteed to be int64 dtype ndarray @@ -141,6 +144,9 @@ def take_1d( To ensure the lowest possible overhead. + Note: similarly to `take_nd`, this function assumes that the indexer is + a valid(ated) indexer with no out of bound indices. + TODO(ArrayManager): mainly useful for ArrayManager, otherwise can potentially be removed again if we don't end up with ArrayManager. """ From 35f933ff455f33717cf56b8c9cf830d9ab2348b6 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Mar 2021 15:57:00 +0100 Subject: [PATCH 5/5] remove typing ignores --- pandas/core/array_algos/take.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index af164d7f576ed..b1311efa718ae 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -534,13 +534,8 @@ def _take_preprocess_indexer_and_fill_value( if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer mask = indexer == -1 - # error: Item "bool" of "Union[Any, bool]" has no attribute "any" - # [union-attr] - needs_masking = mask.any() # type: ignore[union-attr] - # error: Incompatible types in assignment (expression has type - # "Tuple[Union[Any, bool], Any]", variable has type - # "Optional[Tuple[None, bool]]") - mask_info = mask, needs_masking # type: ignore[assignment] + needs_masking = mask.any() + mask_info = mask, needs_masking if needs_masking: if out is not None and out.dtype != dtype: raise TypeError("Incompatible type for fill_value")