diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 110b47a11c3a9..b1311efa718ae 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -67,6 +67,9 @@ def take_nd( This dispatches to ``take`` defined on ExtensionArrays. It does not currently dispatch to ``SparseArray.take`` for sparse ``arr``. + Note: this function assumes that the indexer is a valid(ated) indexer with + no out of bound indices. + Parameters ---------- arr : np.ndarray or ExtensionArray @@ -113,8 +116,13 @@ def _take_nd_ndarray( allow_fill: bool, ) -> np.ndarray: + if indexer is None: + indexer = np.arange(arr.shape[axis], dtype=np.int64) + dtype, fill_value = arr.dtype, arr.dtype.type() + else: + indexer = ensure_int64(indexer, copy=False) indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( - arr, indexer, axis, out, fill_value, allow_fill + arr, indexer, out, fill_value, allow_fill ) flip_order = False @@ -159,13 +167,17 @@ def take_1d( allow_fill: bool = True, ) -> ArrayLike: """ - Specialized version for 1D arrays. Differences compared to take_nd: + Specialized version for 1D arrays. Differences compared to `take_nd`: - - Assumes input (arr, indexer) has already been converted to numpy array / EA + - Assumes input array has already been converted to numpy array / EA + - Assumes indexer is already guaranteed to be int64 dtype ndarray - Only works for 1D arrays To ensure the lowest possible overhead. + Note: similarly to `take_nd`, this function assumes that the indexer is + a valid(ated) indexer with no out of bound indices. + TODO(ArrayManager): mainly useful for ArrayManager, otherwise can potentially be removed again if we don't end up with ArrayManager. """ @@ -180,8 +192,11 @@ def take_1d( allow_fill=allow_fill, ) + if not allow_fill: + return arr.take(indexer) + indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( - arr, indexer, 0, None, fill_value, allow_fill + arr, indexer, None, fill_value, allow_fill ) # at this point, it's guaranteed that dtype can hold both the arr values @@ -502,43 +517,32 @@ def _take_2d_multi_object( def _take_preprocess_indexer_and_fill_value( arr: np.ndarray, - indexer: Optional[np.ndarray], - axis: int, + indexer: np.ndarray, out: Optional[np.ndarray], fill_value, allow_fill: bool, ): mask_info = None - if indexer is None: - indexer = np.arange(arr.shape[axis], dtype=np.int64) + if not allow_fill: dtype, fill_value = arr.dtype, arr.dtype.type() + mask_info = None, False else: - indexer = ensure_int64(indexer, copy=False) - if not allow_fill: - dtype, fill_value = arr.dtype, arr.dtype.type() - mask_info = None, False - else: - # check for promotion based on types only (do this first because - # it's faster than computing a mask) - dtype, fill_value = maybe_promote(arr.dtype, fill_value) - if dtype != arr.dtype and (out is None or out.dtype != dtype): - # check if promotion is actually required based on indexer - mask = indexer == -1 - # error: Item "bool" of "Union[Any, bool]" has no attribute "any" - # [union-attr] - needs_masking = mask.any() # type: ignore[union-attr] - # error: Incompatible types in assignment (expression has type - # "Tuple[Union[Any, bool], Any]", variable has type - # "Optional[Tuple[None, bool]]") - mask_info = mask, needs_masking # type: ignore[assignment] - if needs_masking: - if out is not None and out.dtype != dtype: - raise TypeError("Incompatible type for fill_value") - else: - # if not, then depromote, set fill_value to dummy - # (it won't be used but we don't want the cython code - # to crash when trying to cast it to dtype) - dtype, fill_value = arr.dtype, arr.dtype.type() + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype and (out is None or out.dtype != dtype): + # check if promotion is actually required based on indexer + mask = indexer == -1 + needs_masking = mask.any() + mask_info = mask, needs_masking + if needs_masking: + if out is not None and out.dtype != dtype: + raise TypeError("Incompatible type for fill_value") + else: + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() return indexer, dtype, fill_value, mask_info diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 5987fdf956b68..afc5cbbe84503 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -32,6 +32,7 @@ soft_convert_objects, ) from pandas.core.dtypes.common import ( + ensure_int64, is_bool_dtype, is_datetime64_ns_dtype, is_dtype_equal, @@ -202,7 +203,7 @@ def get_dtypes(self): def __repr__(self) -> str: output = type(self).__name__ output += f"\nIndex: {self._axes[0]}" - if self.ndim == 1: + if self.ndim == 2: output += f"\nColumns: {self._axes[1]}" output += f"\n{len(self.arrays)} arrays:" for arr in self.arrays: @@ -1005,11 +1006,16 @@ def _reindex_indexer( else: validate_indices(indexer, len(self._axes[0])) + indexer = ensure_int64(indexer) + if (indexer == -1).any(): + allow_fill = True + else: + allow_fill = False new_arrays = [ take_1d( arr, indexer, - allow_fill=True, + allow_fill=allow_fill, fill_value=fill_value, # if fill_value is not None else blk.fill_value ) @@ -1085,15 +1091,24 @@ def unstack(self, unstacker, fill_value) -> ArrayManager: unstacked : BlockManager """ indexer, _ = unstacker._indexer_and_to_sort - new_indexer = np.full(unstacker.mask.shape, -1) - new_indexer[unstacker.mask] = indexer + if unstacker.mask.all(): + new_indexer = indexer + allow_fill = False + else: + new_indexer = np.full(unstacker.mask.shape, -1) + new_indexer[unstacker.mask] = indexer + allow_fill = True new_indexer2D = new_indexer.reshape(*unstacker.full_shape) + new_indexer2D = ensure_int64(new_indexer2D) new_arrays = [] for arr in self.arrays: for i in range(unstacker.full_shape[1]): new_arr = take_1d( - arr, new_indexer2D[:, i], allow_fill=True, fill_value=fill_value + arr, + new_indexer2D[:, i], + allow_fill=allow_fill, + fill_value=fill_value, ) new_arrays.append(new_arr)