diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4dac4dd557af2..24a8fbcb3bb2d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -39,7 +39,6 @@ from pandas.core.dtypes.cast import ( construct_1d_object_array_from_listlike, infer_dtype_from_array, - maybe_promote, ) from pandas.core.dtypes.common import ( ensure_float64, @@ -83,6 +82,7 @@ na_value_for_dtype, ) +from pandas.core.array_algos.take import take_nd from pandas.core.construction import ( array, ensure_wrapped_if_datetimelike, @@ -1414,219 +1414,6 @@ def get_indexer(current_indexer, other_indexer): # ---- # -def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None): - def wrapper( - arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan - ): - if arr_dtype is not None: - arr = arr.view(arr_dtype) - if out_dtype is not None: - out = out.view(out_dtype) - if fill_wrap is not None: - fill_value = fill_wrap(fill_value) - f(arr, indexer, out, fill_value=fill_value) - - return wrapper - - -def _convert_wrapper(f, conv_dtype): - def wrapper( - arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan - ): - if conv_dtype == object: - # GH#39755 avoid casting dt64/td64 to integers - arr = ensure_wrapped_if_datetimelike(arr) - arr = arr.astype(conv_dtype) - f(arr, indexer, out, fill_value=fill_value) - - return wrapper - - -def _take_2d_multi_object( - arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value, mask_info -) -> None: - # this is not ideal, performance-wise, but it's better than raising - # an exception (best to optimize in Cython to avoid getting here) - row_idx, col_idx = indexer - if mask_info is not None: - (row_mask, col_mask), (row_needs, col_needs) = mask_info - else: - row_mask = row_idx == -1 - col_mask = col_idx == -1 - row_needs = row_mask.any() - col_needs = col_mask.any() - if fill_value is not None: - if row_needs: - out[row_mask, :] = fill_value - if col_needs: - out[:, col_mask] = 
fill_value - for i in range(len(row_idx)): - u_ = row_idx[i] - for j in range(len(col_idx)): - v = col_idx[j] - out[i, j] = arr[u_, v] - - -def _take_nd_object( - arr: np.ndarray, - indexer: np.ndarray, - out: np.ndarray, - axis: int, - fill_value, - mask_info, -): - if mask_info is not None: - mask, needs_masking = mask_info - else: - mask = indexer == -1 - needs_masking = mask.any() - if arr.dtype != out.dtype: - arr = arr.astype(out.dtype) - if arr.shape[axis] > 0: - arr.take(ensure_platform_int(indexer), axis=axis, out=out) - if needs_masking: - outindexer = [slice(None)] * arr.ndim - outindexer[axis] = mask - out[tuple(outindexer)] = fill_value - - -_take_1d_dict = { - ("int8", "int8"): algos.take_1d_int8_int8, - ("int8", "int32"): algos.take_1d_int8_int32, - ("int8", "int64"): algos.take_1d_int8_int64, - ("int8", "float64"): algos.take_1d_int8_float64, - ("int16", "int16"): algos.take_1d_int16_int16, - ("int16", "int32"): algos.take_1d_int16_int32, - ("int16", "int64"): algos.take_1d_int16_int64, - ("int16", "float64"): algos.take_1d_int16_float64, - ("int32", "int32"): algos.take_1d_int32_int32, - ("int32", "int64"): algos.take_1d_int32_int64, - ("int32", "float64"): algos.take_1d_int32_float64, - ("int64", "int64"): algos.take_1d_int64_int64, - ("int64", "float64"): algos.take_1d_int64_float64, - ("float32", "float32"): algos.take_1d_float32_float32, - ("float32", "float64"): algos.take_1d_float32_float64, - ("float64", "float64"): algos.take_1d_float64_float64, - ("object", "object"): algos.take_1d_object_object, - ("bool", "bool"): _view_wrapper(algos.take_1d_bool_bool, np.uint8, np.uint8), - ("bool", "object"): _view_wrapper(algos.take_1d_bool_object, np.uint8, None), - ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( - algos.take_1d_int64_int64, np.int64, np.int64, np.int64 - ), -} - -_take_2d_axis0_dict = { - ("int8", "int8"): algos.take_2d_axis0_int8_int8, - ("int8", "int32"): algos.take_2d_axis0_int8_int32, - ("int8", "int64"): 
algos.take_2d_axis0_int8_int64, - ("int8", "float64"): algos.take_2d_axis0_int8_float64, - ("int16", "int16"): algos.take_2d_axis0_int16_int16, - ("int16", "int32"): algos.take_2d_axis0_int16_int32, - ("int16", "int64"): algos.take_2d_axis0_int16_int64, - ("int16", "float64"): algos.take_2d_axis0_int16_float64, - ("int32", "int32"): algos.take_2d_axis0_int32_int32, - ("int32", "int64"): algos.take_2d_axis0_int32_int64, - ("int32", "float64"): algos.take_2d_axis0_int32_float64, - ("int64", "int64"): algos.take_2d_axis0_int64_int64, - ("int64", "float64"): algos.take_2d_axis0_int64_float64, - ("float32", "float32"): algos.take_2d_axis0_float32_float32, - ("float32", "float64"): algos.take_2d_axis0_float32_float64, - ("float64", "float64"): algos.take_2d_axis0_float64_float64, - ("object", "object"): algos.take_2d_axis0_object_object, - ("bool", "bool"): _view_wrapper(algos.take_2d_axis0_bool_bool, np.uint8, np.uint8), - ("bool", "object"): _view_wrapper(algos.take_2d_axis0_bool_object, np.uint8, None), - ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( - algos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64 - ), -} - -_take_2d_axis1_dict = { - ("int8", "int8"): algos.take_2d_axis1_int8_int8, - ("int8", "int32"): algos.take_2d_axis1_int8_int32, - ("int8", "int64"): algos.take_2d_axis1_int8_int64, - ("int8", "float64"): algos.take_2d_axis1_int8_float64, - ("int16", "int16"): algos.take_2d_axis1_int16_int16, - ("int16", "int32"): algos.take_2d_axis1_int16_int32, - ("int16", "int64"): algos.take_2d_axis1_int16_int64, - ("int16", "float64"): algos.take_2d_axis1_int16_float64, - ("int32", "int32"): algos.take_2d_axis1_int32_int32, - ("int32", "int64"): algos.take_2d_axis1_int32_int64, - ("int32", "float64"): algos.take_2d_axis1_int32_float64, - ("int64", "int64"): algos.take_2d_axis1_int64_int64, - ("int64", "float64"): algos.take_2d_axis1_int64_float64, - ("float32", "float32"): algos.take_2d_axis1_float32_float32, - ("float32", "float64"): 
algos.take_2d_axis1_float32_float64, - ("float64", "float64"): algos.take_2d_axis1_float64_float64, - ("object", "object"): algos.take_2d_axis1_object_object, - ("bool", "bool"): _view_wrapper(algos.take_2d_axis1_bool_bool, np.uint8, np.uint8), - ("bool", "object"): _view_wrapper(algos.take_2d_axis1_bool_object, np.uint8, None), - ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( - algos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64 - ), -} - -_take_2d_multi_dict = { - ("int8", "int8"): algos.take_2d_multi_int8_int8, - ("int8", "int32"): algos.take_2d_multi_int8_int32, - ("int8", "int64"): algos.take_2d_multi_int8_int64, - ("int8", "float64"): algos.take_2d_multi_int8_float64, - ("int16", "int16"): algos.take_2d_multi_int16_int16, - ("int16", "int32"): algos.take_2d_multi_int16_int32, - ("int16", "int64"): algos.take_2d_multi_int16_int64, - ("int16", "float64"): algos.take_2d_multi_int16_float64, - ("int32", "int32"): algos.take_2d_multi_int32_int32, - ("int32", "int64"): algos.take_2d_multi_int32_int64, - ("int32", "float64"): algos.take_2d_multi_int32_float64, - ("int64", "int64"): algos.take_2d_multi_int64_int64, - ("int64", "float64"): algos.take_2d_multi_int64_float64, - ("float32", "float32"): algos.take_2d_multi_float32_float32, - ("float32", "float64"): algos.take_2d_multi_float32_float64, - ("float64", "float64"): algos.take_2d_multi_float64_float64, - ("object", "object"): algos.take_2d_multi_object_object, - ("bool", "bool"): _view_wrapper(algos.take_2d_multi_bool_bool, np.uint8, np.uint8), - ("bool", "object"): _view_wrapper(algos.take_2d_multi_bool_object, np.uint8, None), - ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( - algos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64 - ), -} - - -def _get_take_nd_function( - ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: int = 0, mask_info=None -): - if ndim <= 2: - tup = (arr_dtype.name, out_dtype.name) - if ndim == 1: - func = _take_1d_dict.get(tup, 
None) - elif ndim == 2: - if axis == 0: - func = _take_2d_axis0_dict.get(tup, None) - else: - func = _take_2d_axis1_dict.get(tup, None) - if func is not None: - return func - - tup = (out_dtype.name, out_dtype.name) - if ndim == 1: - func = _take_1d_dict.get(tup, None) - elif ndim == 2: - if axis == 0: - func = _take_2d_axis0_dict.get(tup, None) - else: - func = _take_2d_axis1_dict.get(tup, None) - if func is not None: - func = _convert_wrapper(func, out_dtype) - return func - - def func2(arr, indexer, out, fill_value=np.nan): - indexer = ensure_int64(indexer) - _take_nd_object( - arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info - ) - - return func2 - - def take( arr, indices: np.ndarray, axis: int = 0, allow_fill: bool = False, fill_value=None ): @@ -1720,190 +1507,6 @@ def take( return result -def _take_preprocess_indexer_and_fill_value( - arr: np.ndarray, - indexer: Optional[np.ndarray], - axis: int, - out: Optional[np.ndarray], - fill_value, - allow_fill: bool, -): - mask_info = None - - if indexer is None: - indexer = np.arange(arr.shape[axis], dtype=np.int64) - dtype, fill_value = arr.dtype, arr.dtype.type() - else: - indexer = ensure_int64(indexer, copy=False) - if not allow_fill: - dtype, fill_value = arr.dtype, arr.dtype.type() - mask_info = None, False - else: - # check for promotion based on types only (do this first because - # it's faster than computing a mask) - dtype, fill_value = maybe_promote(arr.dtype, fill_value) - if dtype != arr.dtype and (out is None or out.dtype != dtype): - # check if promotion is actually required based on indexer - mask = indexer == -1 - needs_masking = mask.any() - mask_info = mask, needs_masking - if needs_masking: - if out is not None and out.dtype != dtype: - raise TypeError("Incompatible type for fill_value") - else: - # if not, then depromote, set fill_value to dummy - # (it won't be used but we don't want the cython code - # to crash when trying to cast it to dtype) - dtype, fill_value = 
arr.dtype, arr.dtype.type() - - return indexer, dtype, fill_value, mask_info - - -def take_nd( - arr, - indexer, - axis: int = 0, - out: Optional[np.ndarray] = None, - fill_value=lib.no_default, - allow_fill: bool = True, -): - """ - Specialized Cython take which sets NaN values in one pass - - This dispatches to ``take`` defined on ExtensionArrays. It does not - currently dispatch to ``SparseArray.take`` for sparse ``arr``. - - Parameters - ---------- - arr : array-like - Input array. - indexer : ndarray - 1-D array of indices to take, subarrays corresponding to -1 value - indices are filed with fill_value - axis : int, default 0 - Axis to take from - out : ndarray or None, default None - Optional output array, must be appropriate type to hold input and - fill_value together, if indexer has any -1 value entries; call - maybe_promote to determine this type for any fill_value - fill_value : any, default np.nan - Fill value to replace -1 values with - allow_fill : boolean, default True - If False, indexer is assumed to contain no -1 values so no filling - will be done. This short-circuits computation of a mask. Result is - undefined if allow_fill == False and -1 is present in indexer. - - Returns - ------- - subarray : array-like - May be the same type as the input, or cast to an ndarray. 
- """ - if fill_value is lib.no_default: - fill_value = na_value_for_dtype(arr.dtype, compat=False) - - if isinstance(arr, ABCExtensionArray): - # Check for EA to catch DatetimeArray, TimedeltaArray - return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - - arr = extract_array(arr) - arr = np.asarray(arr) - - indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( - arr, indexer, axis, out, fill_value, allow_fill - ) - - flip_order = False - if arr.ndim == 2 and arr.flags.f_contiguous: - flip_order = True - - if flip_order: - arr = arr.T - axis = arr.ndim - axis - 1 - if out is not None: - out = out.T - - # at this point, it's guaranteed that dtype can hold both the arr values - # and the fill_value - if out is None: - out_shape_ = list(arr.shape) - out_shape_[axis] = len(indexer) - out_shape = tuple(out_shape_) - if arr.flags.f_contiguous and axis == arr.ndim - 1: - # minor tweak that can make an order-of-magnitude difference - # for dataframes initialized directly from 2-d ndarrays - # (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its - # f-contiguous transpose) - out = np.empty(out_shape, dtype=dtype, order="F") - else: - out = np.empty(out_shape, dtype=dtype) - - func = _get_take_nd_function( - arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info - ) - func(arr, indexer, out, fill_value) - - if flip_order: - out = out.T - return out - - -def take_2d_multi( - arr: np.ndarray, indexer: np.ndarray, fill_value=np.nan -) -> np.ndarray: - """ - Specialized Cython take which sets NaN values in one pass. - """ - # This is only called from one place in DataFrame._reindex_multi, - # so we know indexer is well-behaved. 
- assert indexer is not None - assert indexer[0] is not None - assert indexer[1] is not None - - row_idx, col_idx = indexer - - row_idx = ensure_int64(row_idx) - col_idx = ensure_int64(col_idx) - indexer = row_idx, col_idx - mask_info = None - - # check for promotion based on types only (do this first because - # it's faster than computing a mask) - dtype, fill_value = maybe_promote(arr.dtype, fill_value) - if dtype != arr.dtype: - # check if promotion is actually required based on indexer - row_mask = row_idx == -1 - col_mask = col_idx == -1 - row_needs = row_mask.any() - col_needs = col_mask.any() - mask_info = (row_mask, col_mask), (row_needs, col_needs) - - if not (row_needs or col_needs): - # if not, then depromote, set fill_value to dummy - # (it won't be used but we don't want the cython code - # to crash when trying to cast it to dtype) - dtype, fill_value = arr.dtype, arr.dtype.type() - - # at this point, it's guaranteed that dtype can hold both the arr values - # and the fill_value - out_shape = len(row_idx), len(col_idx) - out = np.empty(out_shape, dtype=dtype) - - func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) - if func is None and arr.dtype != out.dtype: - func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None) - if func is not None: - func = _convert_wrapper(func, out.dtype) - if func is None: - - def func(arr, indexer, out, fill_value=np.nan): - _take_2d_multi_object( - arr, indexer, out, fill_value=fill_value, mask_info=mask_info - ) - - func(arr, indexer, out=out, fill_value=fill_value) - return out - - # ------------ # # searchsorted # # ------------ # diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py new file mode 100644 index 0000000000000..906aea0a90982 --- /dev/null +++ b/pandas/core/array_algos/take.py @@ -0,0 +1,433 @@ +from typing import Optional + +import numpy as np + +from pandas._libs import ( + algos as libalgos, + lib, +) + +from pandas.core.dtypes.cast import 
maybe_promote +from pandas.core.dtypes.common import ( + ensure_int64, + ensure_platform_int, +) +from pandas.core.dtypes.missing import na_value_for_dtype + +from pandas.core.construction import ( + ensure_wrapped_if_datetimelike, + extract_array, +) + + +def take_nd( + arr, + indexer, + axis: int = 0, + out: Optional[np.ndarray] = None, + fill_value=lib.no_default, + allow_fill: bool = True, +): + + """ + Specialized Cython take which sets NaN values in one pass + + This dispatches to ``take`` defined on ExtensionArrays. It does not + currently dispatch to ``SparseArray.take`` for sparse ``arr``. + + Parameters + ---------- + arr : array-like + Input array. + indexer : ndarray + 1-D array of indices to take, subarrays corresponding to -1 value + indices are filled with fill_value + axis : int, default 0 + Axis to take from + out : ndarray or None, default None + Optional output array, must be appropriate type to hold input and + fill_value together, if indexer has any -1 value entries; call + maybe_promote to determine this type for any fill_value + fill_value : any, default np.nan + Fill value to replace -1 values with + allow_fill : boolean, default True + If False, indexer is assumed to contain no -1 values so no filling + will be done. This short-circuits computation of a mask. Result is + undefined if allow_fill == False and -1 is present in indexer. + + Returns + ------- + subarray : array-like + May be the same type as the input, or cast to an ndarray. + """ + if fill_value is lib.no_default: + fill_value = na_value_for_dtype(arr.dtype, compat=False) + + arr = extract_array(arr, extract_numpy=True) + + if not isinstance(arr, np.ndarray): + # i.e. 
ExtensionArray, + # includes for EA to catch DatetimeArray, TimedeltaArray + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + + arr = np.asarray(arr) + + indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( + arr, indexer, axis, out, fill_value, allow_fill + ) + + flip_order = False + if arr.ndim == 2 and arr.flags.f_contiguous: + flip_order = True + + if flip_order: + arr = arr.T + axis = arr.ndim - axis - 1 + if out is not None: + out = out.T + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + if out is None: + out_shape_ = list(arr.shape) + out_shape_[axis] = len(indexer) + out_shape = tuple(out_shape_) + if arr.flags.f_contiguous and axis == arr.ndim - 1: + # minor tweak that can make an order-of-magnitude difference + # for dataframes initialized directly from 2-d ndarrays + # (s.t. df.values is c-contiguous and df._mgr.blocks[0] is its + # f-contiguous transpose) + out = np.empty(out_shape, dtype=dtype, order="F") + else: + out = np.empty(out_shape, dtype=dtype) + + func = _get_take_nd_function( + arr.ndim, arr.dtype, out.dtype, axis=axis, mask_info=mask_info + ) + func(arr, indexer, out, fill_value) + + if flip_order: + out = out.T + return out + + +def take_2d_multi( + arr: np.ndarray, indexer: np.ndarray, fill_value=np.nan +) -> np.ndarray: + """ + Specialized Cython take which sets NaN values in one pass. + """ + # This is only called from one place in DataFrame._reindex_multi, + # so we know indexer is well-behaved. 
+ assert indexer is not None + assert indexer[0] is not None + assert indexer[1] is not None + + row_idx, col_idx = indexer + + row_idx = ensure_int64(row_idx) + col_idx = ensure_int64(col_idx) + indexer = row_idx, col_idx + mask_info = None + + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype: + # check if promotion is actually required based on indexer + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + mask_info = (row_mask, col_mask), (row_needs, col_needs) + + if not (row_needs or col_needs): + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + out_shape = len(row_idx), len(col_idx) + out = np.empty(out_shape, dtype=dtype) + + func = _take_2d_multi_dict.get((arr.dtype.name, out.dtype.name), None) + if func is None and arr.dtype != out.dtype: + func = _take_2d_multi_dict.get((out.dtype.name, out.dtype.name), None) + if func is not None: + func = _convert_wrapper(func, out.dtype) + + if func is not None: + func(arr, indexer, out=out, fill_value=fill_value) + else: + _take_2d_multi_object( + arr, indexer, out, fill_value=fill_value, mask_info=mask_info + ) + + return out + + +def _get_take_nd_function( + ndim: int, arr_dtype: np.dtype, out_dtype: np.dtype, axis: int = 0, mask_info=None +): + + if ndim <= 2: + tup = (arr_dtype.name, out_dtype.name) + if ndim == 1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + return func + + tup = (out_dtype.name, out_dtype.name) + if ndim == 
1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + func = _convert_wrapper(func, out_dtype) + return func + + def func2(arr, indexer, out, fill_value=np.nan): + indexer = ensure_int64(indexer) + _take_nd_object( + arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info + ) + + return func2 + + +def _view_wrapper(f, arr_dtype=None, out_dtype=None, fill_wrap=None): + def wrapper( + arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan + ): + if arr_dtype is not None: + arr = arr.view(arr_dtype) + if out_dtype is not None: + out = out.view(out_dtype) + if fill_wrap is not None: + fill_value = fill_wrap(fill_value) + f(arr, indexer, out, fill_value=fill_value) + + return wrapper + + +def _convert_wrapper(f, conv_dtype): + def wrapper( + arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=np.nan + ): + if conv_dtype == object: + # GH#39755 avoid casting dt64/td64 to integers + arr = ensure_wrapped_if_datetimelike(arr) + arr = arr.astype(conv_dtype) + f(arr, indexer, out, fill_value=fill_value) + + return wrapper + + +_take_1d_dict = { + ("int8", "int8"): libalgos.take_1d_int8_int8, + ("int8", "int32"): libalgos.take_1d_int8_int32, + ("int8", "int64"): libalgos.take_1d_int8_int64, + ("int8", "float64"): libalgos.take_1d_int8_float64, + ("int16", "int16"): libalgos.take_1d_int16_int16, + ("int16", "int32"): libalgos.take_1d_int16_int32, + ("int16", "int64"): libalgos.take_1d_int16_int64, + ("int16", "float64"): libalgos.take_1d_int16_float64, + ("int32", "int32"): libalgos.take_1d_int32_int32, + ("int32", "int64"): libalgos.take_1d_int32_int64, + ("int32", "float64"): libalgos.take_1d_int32_float64, + ("int64", "int64"): libalgos.take_1d_int64_int64, + ("int64", "float64"): libalgos.take_1d_int64_float64, + ("float32", "float32"): libalgos.take_1d_float32_float32, + ("float32", 
"float64"): libalgos.take_1d_float32_float64, + ("float64", "float64"): libalgos.take_1d_float64_float64, + ("object", "object"): libalgos.take_1d_object_object, + ("bool", "bool"): _view_wrapper(libalgos.take_1d_bool_bool, np.uint8, np.uint8), + ("bool", "object"): _view_wrapper(libalgos.take_1d_bool_object, np.uint8, None), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos.take_1d_int64_int64, np.int64, np.int64, np.int64 + ), +} + +_take_2d_axis0_dict = { + ("int8", "int8"): libalgos.take_2d_axis0_int8_int8, + ("int8", "int32"): libalgos.take_2d_axis0_int8_int32, + ("int8", "int64"): libalgos.take_2d_axis0_int8_int64, + ("int8", "float64"): libalgos.take_2d_axis0_int8_float64, + ("int16", "int16"): libalgos.take_2d_axis0_int16_int16, + ("int16", "int32"): libalgos.take_2d_axis0_int16_int32, + ("int16", "int64"): libalgos.take_2d_axis0_int16_int64, + ("int16", "float64"): libalgos.take_2d_axis0_int16_float64, + ("int32", "int32"): libalgos.take_2d_axis0_int32_int32, + ("int32", "int64"): libalgos.take_2d_axis0_int32_int64, + ("int32", "float64"): libalgos.take_2d_axis0_int32_float64, + ("int64", "int64"): libalgos.take_2d_axis0_int64_int64, + ("int64", "float64"): libalgos.take_2d_axis0_int64_float64, + ("float32", "float32"): libalgos.take_2d_axis0_float32_float32, + ("float32", "float64"): libalgos.take_2d_axis0_float32_float64, + ("float64", "float64"): libalgos.take_2d_axis0_float64_float64, + ("object", "object"): libalgos.take_2d_axis0_object_object, + ("bool", "bool"): _view_wrapper( + libalgos.take_2d_axis0_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos.take_2d_axis0_bool_object, np.uint8, None + ), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos.take_2d_axis0_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + +_take_2d_axis1_dict = { + ("int8", "int8"): libalgos.take_2d_axis1_int8_int8, + ("int8", "int32"): libalgos.take_2d_axis1_int8_int32, + ("int8", "int64"): 
libalgos.take_2d_axis1_int8_int64, + ("int8", "float64"): libalgos.take_2d_axis1_int8_float64, + ("int16", "int16"): libalgos.take_2d_axis1_int16_int16, + ("int16", "int32"): libalgos.take_2d_axis1_int16_int32, + ("int16", "int64"): libalgos.take_2d_axis1_int16_int64, + ("int16", "float64"): libalgos.take_2d_axis1_int16_float64, + ("int32", "int32"): libalgos.take_2d_axis1_int32_int32, + ("int32", "int64"): libalgos.take_2d_axis1_int32_int64, + ("int32", "float64"): libalgos.take_2d_axis1_int32_float64, + ("int64", "int64"): libalgos.take_2d_axis1_int64_int64, + ("int64", "float64"): libalgos.take_2d_axis1_int64_float64, + ("float32", "float32"): libalgos.take_2d_axis1_float32_float32, + ("float32", "float64"): libalgos.take_2d_axis1_float32_float64, + ("float64", "float64"): libalgos.take_2d_axis1_float64_float64, + ("object", "object"): libalgos.take_2d_axis1_object_object, + ("bool", "bool"): _view_wrapper( + libalgos.take_2d_axis1_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos.take_2d_axis1_bool_object, np.uint8, None + ), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos.take_2d_axis1_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + +_take_2d_multi_dict = { + ("int8", "int8"): libalgos.take_2d_multi_int8_int8, + ("int8", "int32"): libalgos.take_2d_multi_int8_int32, + ("int8", "int64"): libalgos.take_2d_multi_int8_int64, + ("int8", "float64"): libalgos.take_2d_multi_int8_float64, + ("int16", "int16"): libalgos.take_2d_multi_int16_int16, + ("int16", "int32"): libalgos.take_2d_multi_int16_int32, + ("int16", "int64"): libalgos.take_2d_multi_int16_int64, + ("int16", "float64"): libalgos.take_2d_multi_int16_float64, + ("int32", "int32"): libalgos.take_2d_multi_int32_int32, + ("int32", "int64"): libalgos.take_2d_multi_int32_int64, + ("int32", "float64"): libalgos.take_2d_multi_int32_float64, + ("int64", "int64"): libalgos.take_2d_multi_int64_int64, + ("int64", "float64"): 
libalgos.take_2d_multi_int64_float64, + ("float32", "float32"): libalgos.take_2d_multi_float32_float32, + ("float32", "float64"): libalgos.take_2d_multi_float32_float64, + ("float64", "float64"): libalgos.take_2d_multi_float64_float64, + ("object", "object"): libalgos.take_2d_multi_object_object, + ("bool", "bool"): _view_wrapper( + libalgos.take_2d_multi_bool_bool, np.uint8, np.uint8 + ), + ("bool", "object"): _view_wrapper( + libalgos.take_2d_multi_bool_object, np.uint8, None + ), + ("datetime64[ns]", "datetime64[ns]"): _view_wrapper( + libalgos.take_2d_multi_int64_int64, np.int64, np.int64, fill_wrap=np.int64 + ), +} + + +def _take_nd_object( + arr: np.ndarray, + indexer: np.ndarray, + out: np.ndarray, + axis: int, + fill_value, + mask_info, +): + if mask_info is not None: + mask, needs_masking = mask_info + else: + mask = indexer == -1 + needs_masking = mask.any() + if arr.dtype != out.dtype: + arr = arr.astype(out.dtype) + if arr.shape[axis] > 0: + arr.take(ensure_platform_int(indexer), axis=axis, out=out) + if needs_masking: + outindexer = [slice(None)] * arr.ndim + outindexer[axis] = mask + out[tuple(outindexer)] = fill_value + + +def _take_2d_multi_object( + arr: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value, mask_info +) -> None: + # this is not ideal, performance-wise, but it's better than raising + # an exception (best to optimize in Cython to avoid getting here) + row_idx, col_idx = indexer + if mask_info is not None: + (row_mask, col_mask), (row_needs, col_needs) = mask_info + else: + row_mask = row_idx == -1 + col_mask = col_idx == -1 + row_needs = row_mask.any() + col_needs = col_mask.any() + if fill_value is not None: + if row_needs: + out[row_mask, :] = fill_value + if col_needs: + out[:, col_mask] = fill_value + for i in range(len(row_idx)): + u_ = row_idx[i] + for j in range(len(col_idx)): + v = col_idx[j] + out[i, j] = arr[u_, v] + + +def _take_preprocess_indexer_and_fill_value( + arr: np.ndarray, + indexer: Optional[np.ndarray], 
+ axis: int, + out: Optional[np.ndarray], + fill_value, + allow_fill: bool, +): + mask_info = None + + if indexer is None: + indexer = np.arange(arr.shape[axis], dtype=np.int64) + dtype, fill_value = arr.dtype, arr.dtype.type() + else: + indexer = ensure_int64(indexer, copy=False) + if not allow_fill: + dtype, fill_value = arr.dtype, arr.dtype.type() + mask_info = None, False + else: + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) + if dtype != arr.dtype and (out is None or out.dtype != dtype): + # check if promotion is actually required based on indexer + mask = indexer == -1 + needs_masking = mask.any() + mask_info = mask, needs_masking + if needs_masking: + if out is not None and out.dtype != dtype: + raise TypeError("Incompatible type for fill_value") + else: + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() + + return indexer, dtype, fill_value, mask_info diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 7a216e411dcc2..da15fd2a7d646 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -141,6 +141,7 @@ relabel_result, transform, ) +from pandas.core.array_algos.take import take_2d_multi from pandas.core.arraylike import OpsMixin from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseFrameAccessor @@ -4186,9 +4187,7 @@ def _reindex_multi(self, axes, copy: bool, fill_value) -> DataFrame: if row_indexer is not None and col_indexer is not None: indexer = row_indexer, col_indexer - new_values = algorithms.take_2d_multi( - self.values, indexer, fill_value=fill_value - ) + new_values = take_2d_multi(self.values, indexer, fill_value=fill_value) return self._constructor(new_values, index=new_index, columns=new_columns) else: return 
self._reindex_with_indexers(