From 4512f9c45fdcdae96b7ad0e44540e327ce45f3fd Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 9 Feb 2021 10:36:31 +0100 Subject: [PATCH 01/16] PERF: optimize algos.take for repeated calls --- pandas/core/algorithms.py | 175 +++++++++++++++++-------- pandas/core/internals/array_manager.py | 2 +- 2 files changed, 120 insertions(+), 57 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index cdbef673643e8..d28d23c49a6c3 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -4,6 +4,7 @@ """ from __future__ import annotations +import functools import operator from textwrap import dedent from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union, cast @@ -73,6 +74,9 @@ _shared_docs: Dict[str, str] = {} +maybe_promote_cached = functools.lru_cache(maxsize=128)(maybe_promote) + + # --------------- # # dtype access # # --------------- # @@ -1534,40 +1538,52 @@ def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info): } +@functools.lru_cache(maxsize=128) +def __get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis): + """ + Part of _get_take_nd_function below that doesn't need the mask + and thus can be cached. + """ + tup = (arr_dtype.name, out_dtype.name) + if ndim == 1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + return func + + tup = (out_dtype.name, out_dtype.name) + if ndim == 1: + func = _take_1d_dict.get(tup, None) + elif ndim == 2: + if axis == 0: + func = _take_2d_axis0_dict.get(tup, None) + else: + func = _take_2d_axis1_dict.get(tup, None) + if func is not None: + func = _convert_wrapper(func, out_dtype) + return func + + return None + + def _get_take_nd_function( ndim: int, arr_dtype, out_dtype, axis: int = 0, mask_info=None ): - if ndim <= 2: - tup = (arr_dtype.name, out_dtype.name) - if ndim == 1: - func = _take_1d_dict.get(tup, None) - elif ndim == 2: - if axis == 0: - func = _take_2d_axis0_dict.get(tup, None) - else: - func = _take_2d_axis1_dict.get(tup, None) - if func is not None: - return func - - tup = (out_dtype.name, out_dtype.name) - if ndim == 1: - func = _take_1d_dict.get(tup, None) - elif ndim == 2: - if axis == 0: - func = _take_2d_axis0_dict.get(tup, None) - else: - func = _take_2d_axis1_dict.get(tup, None) - if func is not None: - func = _convert_wrapper(func, out_dtype) - return func + func = __get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis) - def func2(arr, indexer, out, fill_value=np.nan): - indexer = ensure_int64(indexer) - _take_nd_object( - arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info - ) + if func is None: - return func2 + def func(arr, indexer, out, fill_value=np.nan): + indexer = ensure_int64(indexer) + _take_nd_object( + arr, indexer, out, axis=axis, fill_value=fill_value, mask_info=mask_info + ) + + return func def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None): @@ -1661,6 +1677,40 @@ def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None) return result +def _take_preprocess_indexer_and_fill_value( + arr, indexer, axis, out, fill_value, allow_fill +): + mask_info = None + + if indexer is None: + indexer = np.arange(arr.shape[axis], dtype=np.int64) + dtype, fill_value = arr.dtype, arr.dtype.type() + else: + indexer = ensure_int64(indexer, copy=False) + if not allow_fill: + dtype, fill_value = arr.dtype, arr.dtype.type() + mask_info = None, False + else: + # check for promotion based on types only (do this first because + # it's faster than computing a mask) + dtype, fill_value = maybe_promote_cached(arr.dtype, fill_value) + if dtype != arr.dtype and (out is None or out.dtype != dtype): + # check if promotion is actually required based on indexer + mask = indexer == -1 + needs_masking = mask.any() + mask_info = mask, needs_masking + if needs_masking: + if out is not None and out.dtype != dtype: + raise TypeError("Incompatible type for fill_value") + else: + # if not, then depromote, set fill_value to dummy + # (it won't be used but we don't want the cython code + # to crash when trying to cast it to dtype) + dtype, fill_value = arr.dtype, arr.dtype.type() + + return indexer, dtype, fill_value, mask_info + + def take_nd( arr, indexer, @@ -1700,8 +1750,6 @@ def take_nd( subarray : array-like May be the same type as the input, or cast to an ndarray. """ - mask_info = None - if fill_value is lib.no_default: fill_value = na_value_for_dtype(arr.dtype, compat=False) @@ -1712,31 +1760,9 @@ def take_nd( arr = extract_array(arr) arr = np.asarray(arr) - if indexer is None: - indexer = np.arange(arr.shape[axis], dtype=np.int64) - dtype, fill_value = arr.dtype, arr.dtype.type() - else: - indexer = ensure_int64(indexer, copy=False) - if not allow_fill: - dtype, fill_value = arr.dtype, arr.dtype.type() - mask_info = None, False - else: - # check for promotion based on types only (do this first because - # it's faster than computing a mask) - dtype, fill_value = maybe_promote(arr.dtype, fill_value) - if dtype != arr.dtype and (out is None or out.dtype != dtype): - # check if promotion is actually required based on indexer - mask = indexer == -1 - needs_masking = mask.any() - mask_info = mask, needs_masking - if needs_masking: - if out is not None and out.dtype != dtype: - raise TypeError("Incompatible type for fill_value") - else: - # if not, then depromote, set fill_value to dummy - # (it won't be used but we don't want the cython code - # to crash when trying to cast it to dtype) - dtype, fill_value = arr.dtype, arr.dtype.type() + indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( + arr, indexer, axis, out, fill_value, allow_fill + ) flip_order = False if arr.ndim == 2 and arr.flags.f_contiguous: @@ -1776,6 +1802,43 @@ def take_nd( take_1d = take_nd +def take_1d_array( + arr: np.ndarray, + indexer: np.ndarray, + out=None, + fill_value=lib.no_default, + allow_fill: bool = True, +): + """ + Specialized version for 1D arrays. Differences compared to take_nd/take_1d: + + - Assumes input (arr, indexer) has already been converted to numpy arrays + - Only works for 1D arrays + + """ + if fill_value is lib.no_default: + fill_value = na_value_for_dtype(arr.dtype, compat=False) + + if isinstance(arr, ABCExtensionArray): + # Check for EA to catch DatetimeArray, TimedeltaArray + return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) + + indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( + arr, indexer, 0, out, fill_value, allow_fill + ) + + # at this point, it's guaranteed that dtype can hold both the arr values + # and the fill_value + out = np.empty(indexer.shape, dtype=dtype) + + func = _get_take_nd_function( + arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info + ) + func(arr, indexer, out, fill_value) + + return out + + def take_2d_multi(arr, indexer, fill_value=np.nan): """ Specialized Cython take which sets NaN values in one pass. diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 0f677ff3180be..4b1bdb1e0e2fe 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -855,7 +855,7 @@ def unstack(self, unstacker, fill_value) -> ArrayManager: new_arrays = [] for arr in self.arrays: for i in range(unstacker.full_shape[1]): - new_arr = algos.take( + new_arr = algos.take_1d_array( arr, new_indexer2D[:, i], allow_fill=True, fill_value=fill_value ) new_arrays.append(new_arr) From 36c3ed2d4203af997a8efa2278a7fc28b16d7015 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 9 Feb 2021 11:54:17 +0100 Subject: [PATCH 02/16] fix nd check + fix cache differentiation of int / bool --- pandas/core/algorithms.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d28d23c49a6c3..d823c7cc47dd4 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -74,9 +74,6 @@ _shared_docs: Dict[str, str] = {} -maybe_promote_cached = functools.lru_cache(maxsize=128)(maybe_promote) - - # --------------- # # dtype access # # --------------- # @@ -1573,7 +1570,9 @@ def __get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis): def _get_take_nd_function( ndim: int, arr_dtype, out_dtype, axis: int = 0, mask_info=None ): - func = __get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis) + func = None + if ndim <= 2: + func = __get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis) if func is None: @@ -1586,6 +1585,13 @@ def func(arr, indexer, out, fill_value=np.nan): return func +@functools.lru_cache(maxsize=128) +def _maybe_promote_cached(dtype, fill_value, fill_value_type): + # also use fill_value_type as (unused) argument to use this in the cache + # lookup -> differentiate 1 and True + return maybe_promote(dtype, fill_value) + + def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None): """ Take elements from an array. @@ -1693,7 +1699,9 @@ def _take_preprocess_indexer_and_fill_value( else: # check for promotion based on types only (do this first because # it's faster than computing a mask) - dtype, fill_value = maybe_promote_cached(arr.dtype, fill_value) + dtype, fill_value = _maybe_promote_cached( + arr.dtype, fill_value, type(fill_value) + ) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer mask = indexer == -1 From 6d529326d3d6023103e70d0174466ec3d18160ee Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 9 Feb 2021 12:07:30 +0100 Subject: [PATCH 03/16] fix non-scalar fill_value case --- pandas/core/algorithms.py | 12 +++++++++--- pandas/tests/test_take.py | 12 ++++++++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d823c7cc47dd4..5b4929bdf2885 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1592,6 +1592,14 @@ def _maybe_promote_cached(dtype, fill_value, fill_value_type): return maybe_promote(dtype, fill_value) +def _maybe_promote(dtype, fill_value): + try: + return _maybe_promote_cached(dtype, fill_value, type(fill_value)) + except TypeError: + # if fill_value is not hashable (required for caching) + return maybe_promote(dtype, fill_value) + + def take(arr, indices, axis: int = 0, allow_fill: bool = False, fill_value=None): """ Take elements from an array. @@ -1699,9 +1707,7 @@ def _take_preprocess_indexer_and_fill_value( else: # check for promotion based on types only (do this first because # it's faster than computing a mask) - dtype, fill_value = _maybe_promote_cached( - arr.dtype, fill_value, type(fill_value) - ) + dtype, fill_value = _maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer mask = indexer == -1 diff --git a/pandas/tests/test_take.py b/pandas/tests/test_take.py index 9f0632917037c..768f2e9af110a 100644 --- a/pandas/tests/test_take.py +++ b/pandas/tests/test_take.py @@ -417,6 +417,18 @@ def test_take_axis_1(self): with pytest.raises(IndexError, match="indices are out-of-bounds"): algos.take(arr, [0, 3], axis=1, allow_fill=True, fill_value=0) + def test_take_non_hashable_fill_value(self): + arr = np.array([1, 2, 3]) + indexer = np.array([1, -1]) + with pytest.raises(ValueError, match="fill_value must be a scalar"): + algos.take(arr, indexer, allow_fill=True, fill_value=[1]) + + # with object dtype it is allowed + arr = np.array([1, 2, 3], dtype=object) + result = algos.take(arr, indexer, allow_fill=True, fill_value=[1]) + expected = np.array([2, [1]], dtype=object) + tm.assert_numpy_array_equal(result, expected) + class TestExtensionTake: # The take method found in pd.api.extensions From ded773a1d9704b668c365f9bdfb796f584ee1cd0 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 9 Feb 2021 14:21:56 +0100 Subject: [PATCH 04/16] fix mypy --- pandas/core/algorithms.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5b4929bdf2885..e2fbca5bbaab2 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1594,7 +1594,11 @@ def _maybe_promote_cached(dtype, fill_value, fill_value_type): def _maybe_promote(dtype, fill_value): try: - return _maybe_promote_cached(dtype, fill_value, type(fill_value)) + # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type + # "Type[Any]"; expected "Hashable" [arg-type] + return _maybe_promote_cached( + dtype, fill_value, type(fill_value) + ) # type: ignore[override] except TypeError: # if fill_value is not hashable (required for caching) return maybe_promote(dtype, fill_value) From 2ee2543fbb0cf440f91841de525e2bc09358b165 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 9 Feb 2021 14:24:22 +0100 Subject: [PATCH 05/16] try fix mypy --- pandas/core/algorithms.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index e2fbca5bbaab2..a6d6ef80ec3aa 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1598,7 +1598,7 @@ def _maybe_promote(dtype, fill_value): # "Type[Any]"; expected "Hashable" [arg-type] return _maybe_promote_cached( dtype, fill_value, type(fill_value) - ) # type: ignore[override] + ) # type: ignore[arg-type] except TypeError: # if fill_value is not hashable (required for caching) return maybe_promote(dtype, fill_value) From f489ba5f4895191129a08cb47233d6e825402b04 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 10 Feb 2021 08:44:44 +0100 Subject: [PATCH 06/16] fix annotation --- pandas/core/algorithms.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a6d6ef80ec3aa..081da4bbce7b8 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1821,7 +1821,7 @@ def take_nd( def take_1d_array( - arr: np.ndarray, + arr: ArrayLike, indexer: np.ndarray, out=None, fill_value=lib.no_default, @@ -1830,9 +1830,10 @@ def take_1d_array( """ Specialized version for 1D arrays. Differences compared to take_nd/take_1d: - - Assumes input (arr, indexer) has already been converted to numpy arrays + - Assumes input (arr, indexer) has already been converted to numpy array / EA - Only works for 1D arrays + To ensure the lowest possible overhead. """ if fill_value is lib.no_default: fill_value = na_value_for_dtype(arr.dtype, compat=False) From 96305c5146a535b471214b9d0f5cd08c00f33dd3 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 10 Feb 2021 15:08:39 +0100 Subject: [PATCH 07/16] improve docstrings --- pandas/core/algorithms.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 081da4bbce7b8..73a27f44f5983 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1538,8 +1538,9 @@ def _take_nd_object(arr, indexer, out, axis: int, fill_value, mask_info): @functools.lru_cache(maxsize=128) def __get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis): """ - Part of _get_take_nd_function below that doesn't need the mask - and thus can be cached. + Part of _get_take_nd_function below that doesn't need `mask_info` and thus + can be cached (mask_info potentially contains a numpy ndarray which is not + hashable and thus cannot be used as argument for cached function). """ tup = (arr_dtype.name, out_dtype.name) if ndim == 1: @@ -1570,8 +1571,13 @@ def __get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis): def _get_take_nd_function( ndim: int, arr_dtype, out_dtype, axis: int = 0, mask_info=None ): + """ + Get the appropriate "take" implementation for the given dimension, axis + and dtypes. + """ func = None if ndim <= 2: + # for this part we don't need `mask_info` -> use the cached algo lookup func = __get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis) if func is None: From c70ac4d261214d137bb06e284057de506a8e9011 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 10 Feb 2021 22:14:07 +0100 Subject: [PATCH 08/16] faster EA check --- pandas/core/algorithms.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 73a27f44f5983..276c6dc816238 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1844,8 +1844,8 @@ def take_1d_array( if fill_value is lib.no_default: fill_value = na_value_for_dtype(arr.dtype, compat=False) - if isinstance(arr, ABCExtensionArray): - # Check for EA to catch DatetimeArray, TimedeltaArray + if not isinstance(arr, np.ndarray): + # ExtensionArray -> dispatch to their method return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( From 5273cd5cf95c91f09f2c88b41a9e5f8e0272b63e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Feb 2021 16:43:41 +0100 Subject: [PATCH 09/16] rename take_1d_array to take_1d --- pandas/core/algorithms.py | 4 ++-- pandas/core/internals/array_manager.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index a6a64299bdb0d..edae32b5a7bca 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1823,7 +1823,7 @@ def take_nd( return out -def take_1d_array( +def take_1d( arr: ArrayLike, indexer: np.ndarray, out=None, @@ -1831,7 +1831,7 @@ def take_1d_array( allow_fill: bool = True, ): """ - Specialized version for 1D arrays. Differences compared to take_nd/take_1d: + Specialized version for 1D arrays. Differences compared to take_nd: - Assumes input (arr, indexer) has already been converted to numpy array / EA - Only works for 1D arrays diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 26d47aeea76a4..06c8bf1553f94 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -862,7 +862,7 @@ def unstack(self, unstacker, fill_value) -> ArrayManager: new_arrays = [] for arr in self.arrays: for i in range(unstacker.full_shape[1]): - new_arr = algos.take_1d_array( + new_arr = algos.take_1d( arr, new_indexer2D[:, i], allow_fill=True, fill_value=fill_value ) new_arrays.append(new_arr) From d3dd4e4b7f8db65a1d83069bc3e488ced6c466c5 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Feb 2021 17:12:18 +0100 Subject: [PATCH 10/16] add comment about being useful for array manager --- pandas/core/algorithms.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index edae32b5a7bca..5311fe563cd30 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1837,6 +1837,9 @@ def take_1d( - Only works for 1D arrays To ensure the lowest possible overhead. + + TODO(ArrayManager): mainly useful for ArrayManager, otherwise can potentially + be removed again if we don't end up with ArrayManager. """ if fill_value is lib.no_default: fill_value = na_value_for_dtype(arr.dtype, compat=False) From ca3048743027b4fef36c02dd9edcf975642f2b68 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 2 Mar 2021 17:48:38 +0100 Subject: [PATCH 11/16] use take_nd for now --- pandas/core/array_algos/take.py | 41 -------------------------- pandas/core/internals/array_manager.py | 4 +-- 2 files changed, 2 insertions(+), 43 deletions(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index 4c656e7ee33c7..a4db610afa9b0 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -121,47 +121,6 @@ def _take_nd_ndarray( return out -def take_1d( - arr: ArrayLike, - indexer: np.ndarray, - out=None, - fill_value=lib.no_default, - allow_fill: bool = True, -): - """ - Specialized version for 1D arrays. Differences compared to take_nd: - - - Assumes input (arr, indexer) has already been converted to numpy array / EA - - Only works for 1D arrays - - To ensure the lowest possible overhead. - - TODO(ArrayManager): mainly useful for ArrayManager, otherwise can potentially - be removed again if we don't end up with ArrayManager. - """ - if fill_value is lib.no_default: - fill_value = na_value_for_dtype(arr.dtype, compat=False) - - if not isinstance(arr, np.ndarray): - # ExtensionArray -> dispatch to their method - return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - - indexer, dtype, fill_value, mask_info = _take_preprocess_indexer_and_fill_value( - arr, indexer, 0, out, fill_value, allow_fill - ) - - # at this point, it's guaranteed that dtype can hold both the arr values - # and the fill_value - out = np.empty(indexer.shape, dtype=dtype) - - func = _get_take_nd_function( - arr.ndim, arr.dtype, out.dtype, axis=0, mask_info=mask_info - ) - func(arr, indexer, out, fill_value) - - return out - - def take_2d_multi( arr: np.ndarray, indexer: np.ndarray, fill_value=np.nan ) -> np.ndarray: diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index ed3296cc31e2c..de3d7b8817116 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -53,7 +53,7 @@ ) import pandas.core.algorithms as algos -from pandas.core.array_algos.take import take_1d +from pandas.core.array_algos.take import take_nd from pandas.core.arrays import ExtensionArray from pandas.core.arrays.sparse import SparseDtype from pandas.core.construction import ( @@ -997,7 +997,7 @@ def unstack(self, unstacker, fill_value) -> ArrayManager: new_arrays = [] for arr in self.arrays: for i in range(unstacker.full_shape[1]): - new_arr = take_1d( + new_arr = take_nd( arr, new_indexer2D[:, i], allow_fill=True, fill_value=fill_value ) new_arrays.append(new_arr) From 05b6b878dbdb4fd7bae799100d3aac7af5c09a98 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 2 Mar 2021 22:07:34 +0100 Subject: [PATCH 12/16] move caching of maybe_promote to cast.py --- pandas/core/array_algos/take.py | 21 +-------------------- pandas/core/dtypes/cast.py | 28 +++++++++++++++++++++++++++- 2 files changed, 28 insertions(+), 21 deletions(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index a4db610afa9b0..fe3d28410ab07 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -423,25 +423,6 @@ def _take_2d_multi_object( out[i, j] = arr[u_, v] -@functools.lru_cache(maxsize=128) -def _maybe_promote_cached(dtype, fill_value, fill_value_type): - # also use fill_value_type as (unused) argument to use this in the cache - # lookup -> differentiate 1 and True - return maybe_promote(dtype, fill_value) - - -def _maybe_promote(dtype, fill_value): - try: - # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type - # "Type[Any]"; expected "Hashable" [arg-type] - return _maybe_promote_cached( - dtype, fill_value, type(fill_value) - ) # type: ignore[arg-type] - except TypeError: - # if fill_value is not hashable (required for caching) - return maybe_promote(dtype, fill_value) - - def _take_preprocess_indexer_and_fill_value( arr: np.ndarray, indexer: Optional[np.ndarray], @@ -463,7 +444,7 @@ def _take_preprocess_indexer_and_fill_value( else: # check for promotion based on types only (do this first because # it's faster than computing a mask) - dtype, fill_value = _maybe_promote(arr.dtype, fill_value) + dtype, fill_value = maybe_promote(arr.dtype, fill_value) if dtype != arr.dtype and (out is None or out.dtype != dtype): # check if promotion is actually required based on indexer mask = indexer == -1 diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index d456f9c56e309..affbde909cbdb 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -10,6 +10,7 @@ datetime, timedelta, ) +import functools import inspect from typing import ( TYPE_CHECKING, @@ -572,6 +573,31 @@ def maybe_promote(dtype: np.dtype, fill_value=np.nan): ValueError If fill_value is a non-scalar and dtype is not object. """ + # for performance, we are using a cached version of the actual implementation + # of the function in _maybe_promote. However, this doesn't always work (in case + # of non-hashable arguments), so we fallback to the actual implementation if needed + try: + # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type + # "Type[Any]"; expected "Hashable" [arg-type] + return _maybe_promote_cached( + dtype, fill_value, type(fill_value) + ) # type: ignore[arg-type] + except TypeError: + # if fill_value is not hashable (required for caching) + return maybe_promote(dtype, fill_value) + + +@functools.lru_cache(maxsize=128) +def _maybe_promote_cached(dtype, fill_value, fill_value_type): + # The cached version of _maybe_promote below + # This also use fill_value_type as (unused) argument to use this in the + # cache lookup -> to differentiate 1 and True + return _maybe_promote(dtype, fill_value) + + +def _maybe_promote(dtype: np.dtype, fill_value=np.nan): + # The actual implementation of the function, use `maybe_promote` above for + # a cached version. if not is_scalar(fill_value): # with object dtype there is nothing to promote, and the user can # pass pretty much any weird fill_value they like @@ -622,7 +648,7 @@ def maybe_promote(dtype: np.dtype, fill_value=np.nan): "dtype is deprecated. In a future version, this will be cast " "to object dtype. Pass `fill_value=Timestamp(date_obj)` instead.", FutureWarning, - stacklevel=7, + stacklevel=9, ) return dtype, fv elif isinstance(fill_value, str): From 2284813de1ae23b733abe8b70736814413563d60 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 2 Mar 2021 22:23:56 +0100 Subject: [PATCH 13/16] move type comment --- pandas/core/dtypes/cast.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index affbde909cbdb..009d3c21a8dfa 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -580,8 +580,8 @@ def maybe_promote(dtype: np.dtype, fill_value=np.nan): # error: Argument 3 to "__call__" of "_lru_cache_wrapper" has incompatible type # "Type[Any]"; expected "Hashable" [arg-type] return _maybe_promote_cached( - dtype, fill_value, type(fill_value) - ) # type: ignore[arg-type] + dtype, fill_value, type(fill_value) # type: ignore[arg-type] + ) except TypeError: # if fill_value is not hashable (required for caching) return maybe_promote(dtype, fill_value) From a41ee6b713b109dafb5b3979d403646da46c0485 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Tue, 2 Mar 2021 22:55:19 +0100 Subject: [PATCH 14/16] typo --- pandas/core/dtypes/cast.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 009d3c21a8dfa..bf2223b0971f1 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -584,7 +584,7 @@ def maybe_promote(dtype: np.dtype, fill_value=np.nan): ) except TypeError: # if fill_value is not hashable (required for caching) - return maybe_promote(dtype, fill_value) + return _maybe_promote(dtype, fill_value) @functools.lru_cache(maxsize=128) From 76371cf5ee57ad3ab53119afe0c95d68e2e700e9 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 4 Mar 2021 08:14:45 +0100 Subject: [PATCH 15/16] ensure deprecation warning is always raised --- pandas/core/dtypes/cast.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 35a07ab443944..c33e567478475 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -569,6 +569,10 @@ def maybe_promote(dtype: np.dtype, fill_value=np.nan): ValueError If fill_value is a non-scalar and dtype is not object. """ + # TODO(2.0): need to directly use the non-cached version as long as we + # possibly raise a deprecation warning for datetime dtype + if dtype.kind == "M": + return _maybe_promote(dtype, fill_value) # for performance, we are using a cached version of the actual implementation # of the function in _maybe_promote. However, this doesn't always work (in case # of non-hashable arguments), so we fallback to the actual implementation if needed @@ -644,7 +648,7 @@ def _maybe_promote(dtype: np.dtype, fill_value=np.nan): "dtype is deprecated. In a future version, this will be cast " "to object dtype. Pass `fill_value=Timestamp(date_obj)` instead.", FutureWarning, - stacklevel=9, + stacklevel=8, ) return dtype, fv elif isinstance(fill_value, str): From 2faf70b4624b258e733ba1bff168b88120c32cbf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 4 Mar 2021 08:15:19 +0100 Subject: [PATCH 16/16] single underscore --- pandas/core/array_algos/take.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/array_algos/take.py b/pandas/core/array_algos/take.py index fe3d28410ab07..2179085d9ad9b 100644 --- a/pandas/core/array_algos/take.py +++ b/pandas/core/array_algos/take.py @@ -179,7 +179,7 @@ def take_2d_multi( @functools.lru_cache(maxsize=128) -def __get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis): +def _get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis): """ Part of _get_take_nd_function below that doesn't need `mask_info` and thus can be cached (mask_info potentially contains a numpy ndarray which is not @@ -221,7 +221,7 @@ def _get_take_nd_function( func = None if ndim <= 2: # for this part we don't need `mask_info` -> use the cached algo lookup - func = __get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis) + func = _get_take_nd_function_cached(ndim, arr_dtype, out_dtype, axis) if func is None: