From 0e612c26cf46f598ffda4c13308a2e5ee5f86c6b Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Nov 2021 15:46:06 +0100 Subject: [PATCH 1/9] [ArrayManager] Array version of putmask logic --- pandas/core/array_algos/putmask.py | 93 +++++++++++++++++++++++++- pandas/core/internals/array_manager.py | 35 ++++++++-- 2 files changed, 121 insertions(+), 7 deletions(-) diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 77e38e6c6e3fc..df777b3b11ed0 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -15,6 +15,7 @@ ) from pandas.core.dtypes.cast import ( + can_hold_element, convert_scalar_for_putitemlike, find_common_type, infer_dtype_from, @@ -22,11 +23,22 @@ from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, + is_interval_dtype, is_list_like, ) -from pandas.core.dtypes.missing import isna_compat +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + isna_compat, + na_value_for_dtype, +) from pandas.core.arrays import ExtensionArray +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray def putmask_inplace(values: ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None: @@ -225,3 +237,82 @@ def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other): other = list(other) return other + + +def putmask_flexible_ndarray(array: np.ndarray, mask, new): + """ + Putmask implementation for ArrayManager putmask for ndarray. + + Flexible version that will upcast if needed. 
+ """ + mask, noop = validate_putmask(array, mask) + assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame)) + + # if we are passed a scalar None, convert it here + if not array.dtype == "object" and is_valid_na_for_dtype(new, array.dtype): + new = na_value_for_dtype(array.dtype, compat=False) + + if can_hold_element(array, new): + putmask_without_repeat(array, mask, new) + return array + + elif noop: + return array + + dtype, _ = infer_dtype_from(new) + if dtype.kind in ["m", "M"]: + array = array.astype(object) + # convert to list to avoid numpy coercing datetimelikes to integers + new = setitem_datetimelike_compat( + array, mask.sum(), new # type: ignore[arg-type] + ) + # putmask_smart below converts it back to array + np.putmask(array, mask, new) + return array + + new_values = putmask_smart(array, mask, new) + return new_values + + +def _coerce_to_target_dtype(array, new): + dtype, _ = infer_dtype_from(new, pandas_dtype=True) + new_dtype = find_common_type([array.dtype, dtype]) + return array.astype(new_dtype, copy=False) + + +def putmask_flexible_ea(array: ExtensionArray, mask, new): + """ + Putmask implementation for ArrayManager putmask for EA. + + Flexible version that will upcast if needed. + """ + mask = extract_bool_array(mask) + + if isinstance(array, NDArrayBackedExtensionArray): + + if not can_hold_element(array, new): + array = _coerce_to_target_dtype(array, new) + return putmask_flexible_ndarray(array, mask, new) + + array.putmask(mask, new) + return array + + if isinstance(new, (np.ndarray, ExtensionArray)) and len(new) == len(mask): + new = new[mask] + + try: + array[mask] = new + except TypeError: + if not is_interval_dtype(array.dtype): + # Discussion about what we want to support in the general + # case GH#39584 + raise + + array = _coerce_to_target_dtype(array, new) + if array.dtype == np.dtype("object"): + # For now at least, only support casting e.g. 
+ # Interval[int64]->Interval[float64], + raise + return putmask_flexible_ea(array, mask, new) + + return array diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 543b2ea26f750..12c3b7708cd03 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -54,6 +54,10 @@ ) import pandas.core.algorithms as algos +from pandas.core.array_algos.putmask import ( + putmask_flexible_ea, + putmask_flexible_ndarray, +) from pandas.core.array_algos.quantile import quantile_compat from pandas.core.array_algos.take import take_1d from pandas.core.arrays import ( @@ -352,12 +356,31 @@ def putmask(self, mask, new, align: bool = True): align_keys = ["mask"] new = extract_array(new, extract_numpy=True) - return self.apply_with_block( - "putmask", - align_keys=align_keys, - mask=mask, - new=new, - ) + kwargs = {"mask": mask, "new": new} + aligned_kwargs = {k: kwargs[k] for k in align_keys} + + for i, arr in enumerate(self.arrays): + + for k, obj in aligned_kwargs.items(): + if isinstance(obj, (ABCSeries, ABCDataFrame)): + # The caller is responsible for ensuring that + # obj.axes[-1].equals(self.items) + if obj.ndim == 1: + kwargs[k] = obj._values + else: + kwargs[k] = obj.iloc[:, i]._values + else: + # otherwise we have an ndarray + if self.ndim == 2: + kwargs[k] = obj[i] + + if isinstance(arr, np.ndarray): + new = putmask_flexible_ndarray(arr, **kwargs) + else: + new = putmask_flexible_ea(arr, **kwargs) + self.arrays[i] = new + + return self def diff(self: T, n: int, axis: int) -> T: if axis == 1: From a9f520a8cb545cbbce537a60a46a54396968b5bf Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 11 Nov 2021 21:35:47 +0100 Subject: [PATCH 2/9] simplify now EA._putmask exists --- pandas/core/array_algos/putmask.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index 
df777b3b11ed0..bf626bb70c39c 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -289,19 +289,12 @@ def putmask_flexible_ea(array: ExtensionArray, mask, new): mask = extract_bool_array(mask) if isinstance(array, NDArrayBackedExtensionArray): - if not can_hold_element(array, new): array = _coerce_to_target_dtype(array, new) return putmask_flexible_ndarray(array, mask, new) - array.putmask(mask, new) - return array - - if isinstance(new, (np.ndarray, ExtensionArray)) and len(new) == len(mask): - new = new[mask] - try: - array[mask] = new + array._putmask(mask, new) except TypeError: if not is_interval_dtype(array.dtype): # Discussion about what we want to support in the general From 4193142fb24712b52c5be32e587286a2fdf77dcc Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Nov 2021 21:05:29 +0100 Subject: [PATCH 3/9] remove align logic from apply --- pandas/core/internals/array_manager.py | 26 +------------------------- 1 file changed, 1 insertion(+), 25 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 12c3b7708cd03..d5e2baa187c32 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -191,13 +191,7 @@ def __repr__(self) -> str: output += f"\n{arr.dtype}" return output - def apply( - self: T, - f, - align_keys: list[str] | None = None, - ignore_failures: bool = False, - **kwargs, - ) -> T: + def apply(self: T, f, ignore_failures: bool = False, **kwargs) -> T: """ Iterate over the arrays, collect and create a new ArrayManager. 
@@ -216,32 +210,14 @@ def apply( """ assert "filter" not in kwargs - align_keys = align_keys or [] result_arrays: list[np.ndarray] = [] result_indices: list[int] = [] # fillna: Series/DataFrame is responsible for making sure value is aligned - aligned_args = {k: kwargs[k] for k in align_keys} - if f == "apply": f = kwargs.pop("func") for i, arr in enumerate(self.arrays): - - if aligned_args: - - for k, obj in aligned_args.items(): - if isinstance(obj, (ABCSeries, ABCDataFrame)): - # The caller is responsible for ensuring that - # obj.axes[-1].equals(self.items) - if obj.ndim == 1: - kwargs[k] = obj.iloc[i] - else: - kwargs[k] = obj.iloc[:, i]._values - else: - # otherwise we have an array-like - kwargs[k] = obj[i] - try: if callable(f): applied = f(arr, **kwargs) From ad113c14b405f7a8dc8824d0234d69c27b346e58 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Nov 2021 21:28:50 +0100 Subject: [PATCH 4/9] move to internals.methods --- pandas/core/array_algos/putmask.py | 86 +------------------------- pandas/core/internals/array_manager.py | 10 +-- 2 files changed, 3 insertions(+), 93 deletions(-) diff --git a/pandas/core/array_algos/putmask.py b/pandas/core/array_algos/putmask.py index bf626bb70c39c..77e38e6c6e3fc 100644 --- a/pandas/core/array_algos/putmask.py +++ b/pandas/core/array_algos/putmask.py @@ -15,7 +15,6 @@ ) from pandas.core.dtypes.cast import ( - can_hold_element, convert_scalar_for_putitemlike, find_common_type, infer_dtype_from, @@ -23,22 +22,11 @@ from pandas.core.dtypes.common import ( is_float_dtype, is_integer_dtype, - is_interval_dtype, is_list_like, ) -from pandas.core.dtypes.generic import ( - ABCDataFrame, - ABCIndex, - ABCSeries, -) -from pandas.core.dtypes.missing import ( - is_valid_na_for_dtype, - isna_compat, - na_value_for_dtype, -) +from pandas.core.dtypes.missing import isna_compat from pandas.core.arrays import ExtensionArray -from pandas.core.arrays._mixins import NDArrayBackedExtensionArray def putmask_inplace(values: 
ArrayLike, mask: npt.NDArray[np.bool_], value: Any) -> None: @@ -237,75 +225,3 @@ def setitem_datetimelike_compat(values: np.ndarray, num_set: int, other): other = list(other) return other - - -def putmask_flexible_ndarray(array: np.ndarray, mask, new): - """ - Putmask implementation for ArrayManager putmask for ndarray. - - Flexible version that will upcast if needed. - """ - mask, noop = validate_putmask(array, mask) - assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame)) - - # if we are passed a scalar None, convert it here - if not array.dtype == "object" and is_valid_na_for_dtype(new, array.dtype): - new = na_value_for_dtype(array.dtype, compat=False) - - if can_hold_element(array, new): - putmask_without_repeat(array, mask, new) - return array - - elif noop: - return array - - dtype, _ = infer_dtype_from(new) - if dtype.kind in ["m", "M"]: - array = array.astype(object) - # convert to list to avoid numpy coercing datetimelikes to integers - new = setitem_datetimelike_compat( - array, mask.sum(), new # type: ignore[arg-type] - ) - # putmask_smart below converts it back to array - np.putmask(array, mask, new) - return array - - new_values = putmask_smart(array, mask, new) - return new_values - - -def _coerce_to_target_dtype(array, new): - dtype, _ = infer_dtype_from(new, pandas_dtype=True) - new_dtype = find_common_type([array.dtype, dtype]) - return array.astype(new_dtype, copy=False) - - -def putmask_flexible_ea(array: ExtensionArray, mask, new): - """ - Putmask implementation for ArrayManager putmask for EA. - - Flexible version that will upcast if needed. 
- """ - mask = extract_bool_array(mask) - - if isinstance(array, NDArrayBackedExtensionArray): - if not can_hold_element(array, new): - array = _coerce_to_target_dtype(array, new) - return putmask_flexible_ndarray(array, mask, new) - - try: - array._putmask(mask, new) - except TypeError: - if not is_interval_dtype(array.dtype): - # Discussion about what we want to support in the general - # case GH#39584 - raise - - array = _coerce_to_target_dtype(array, new) - if array.dtype == np.dtype("object"): - # For now at least, only support casting e.g. - # Interval[int64]->Interval[float64], - raise - return putmask_flexible_ea(array, mask, new) - - return array diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index d5e2baa187c32..bc7aa65efd560 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -54,10 +54,6 @@ ) import pandas.core.algorithms as algos -from pandas.core.array_algos.putmask import ( - putmask_flexible_ea, - putmask_flexible_ndarray, -) from pandas.core.array_algos.quantile import quantile_compat from pandas.core.array_algos.take import take_1d from pandas.core.arrays import ( @@ -93,6 +89,7 @@ new_block, to_native_types, ) +from pandas.core.internals.methods import putmask_flexible if TYPE_CHECKING: from pandas import Float64Index @@ -350,10 +347,7 @@ def putmask(self, mask, new, align: bool = True): if self.ndim == 2: kwargs[k] = obj[i] - if isinstance(arr, np.ndarray): - new = putmask_flexible_ndarray(arr, **kwargs) - else: - new = putmask_flexible_ea(arr, **kwargs) + new = putmask_flexible(arr, **kwargs) self.arrays[i] = new return self From ee29fed02a7befef9d56a64361cb5415273a3a0a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Nov 2021 21:30:31 +0100 Subject: [PATCH 5/9] actually add new internals/methods.py file --- pandas/core/internals/methods.py | 114 +++++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) create mode 100644 
pandas/core/internals/methods.py diff --git a/pandas/core/internals/methods.py b/pandas/core/internals/methods.py new file mode 100644 index 0000000000000..a44302ce7716e --- /dev/null +++ b/pandas/core/internals/methods.py @@ -0,0 +1,114 @@ +""" +Wrappers around array_algos with internals-specific logic +""" +from __future__ import annotations + +import numpy as np + +from pandas.core.dtypes.cast import ( + can_hold_element, + find_common_type, + infer_dtype_from, +) +from pandas.core.dtypes.common import is_interval_dtype +from pandas.core.dtypes.generic import ( + ABCDataFrame, + ABCIndex, + ABCSeries, +) +from pandas.core.dtypes.missing import ( + is_valid_na_for_dtype, + na_value_for_dtype, +) + +from pandas.core.array_algos.putmask import ( + extract_bool_array, + putmask_smart, + putmask_without_repeat, + setitem_datetimelike_compat, + validate_putmask, +) +from pandas.core.arrays import ExtensionArray +from pandas.core.arrays._mixins import NDArrayBackedExtensionArray + + +def putmask_flexible(array: np.ndarray | ExtensionArray, mask, new): + """ + Putmask implementation for ArrayManager putmask for ndarray. + + Flexible version that will upcast if needed. + """ + if isinstance(array, np.ndarray): + return putmask_flexible_ndarray(array, mask=mask, new=new) + else: + return putmask_flexible_ea(array, mask=mask, new=new) + + +def putmask_flexible_ndarray(array: np.ndarray, mask, new): + """ + Putmask implementation for ArrayManager putmask for ndarray. + + Flexible version that will upcast if needed. 
+ """ + mask, noop = validate_putmask(array, mask) + assert not isinstance(new, (ABCIndex, ABCSeries, ABCDataFrame)) + + # if we are passed a scalar None, convert it here + if not array.dtype == "object" and is_valid_na_for_dtype(new, array.dtype): + new = na_value_for_dtype(array.dtype, compat=False) + + if can_hold_element(array, new): + putmask_without_repeat(array, mask, new) + return array + + elif noop: + return array + + dtype, _ = infer_dtype_from(new) + if dtype.kind in ["m", "M"]: + array = array.astype(object) + # convert to list to avoid numpy coercing datetimelikes to integers + new = setitem_datetimelike_compat(array, mask.sum(), new) + # np.putmask below converts the list back to an array + np.putmask(array, mask, new) + return array + + new_values = putmask_smart(array, mask, new) + return new_values + + +def _coerce_to_target_dtype(array, new): + dtype, _ = infer_dtype_from(new, pandas_dtype=True) + new_dtype = find_common_type([array.dtype, dtype]) + return array.astype(new_dtype, copy=False) + + +def putmask_flexible_ea(array: ExtensionArray, mask, new): + """ + Putmask implementation for ArrayManager putmask for EA. + + Flexible version that will upcast if needed. + """ + mask = extract_bool_array(mask) + + if isinstance(array, NDArrayBackedExtensionArray): + if not can_hold_element(array, new): + array = _coerce_to_target_dtype(array, new) + return putmask_flexible(array, mask, new) + + try: + array._putmask(mask, new) + except TypeError: + if not is_interval_dtype(array.dtype): + # Discussion about what we want to support in the general + # case GH#39584 + raise + + array = _coerce_to_target_dtype(array, new) + if array.dtype == np.dtype("object"): + # For now at least, only support casting e.g. 
+ # Interval[int64]->Interval[float64], + raise + return putmask_flexible(array, mask, new) + + return array From 3c9a8a71c8ae51bb2ab2b5d6a661cda1dadf2d33 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Nov 2021 21:31:21 +0100 Subject: [PATCH 6/9] update docstring --- pandas/core/internals/methods.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/internals/methods.py b/pandas/core/internals/methods.py index a44302ce7716e..086973f002b43 100644 --- a/pandas/core/internals/methods.py +++ b/pandas/core/internals/methods.py @@ -34,7 +34,7 @@ def putmask_flexible(array: np.ndarray | ExtensionArray, mask, new): """ - Putmask implementation for ArrayManager putmask for ndarray. + Putmask implementation for ArrayManager.putmask. Flexible version that will upcast if needed. """ From 2a4e23ede579407bf28fcf538ddf20c807947d19 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Fri, 12 Nov 2021 22:19:39 +0100 Subject: [PATCH 7/9] update test_namespace --- pandas/tests/internals/test_api.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/internals/test_api.py b/pandas/tests/internals/test_api.py index c759cc163106d..3afcf8917cea8 100644 --- a/pandas/tests/internals/test_api.py +++ b/pandas/tests/internals/test_api.py @@ -19,6 +19,7 @@ def test_namespace(): "blocks", "concat", "managers", + "methods", "construction", "array_manager", "base", From cf3486080730a20c02128e627fd97bfcc7f09c63 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Sat, 13 Nov 2021 20:29:05 +0100 Subject: [PATCH 8/9] share EA putmask version between array and block manager --- pandas/core/internals/blocks.py | 26 ++------------------------ pandas/core/internals/methods.py | 6 +++++- 2 files changed, 7 insertions(+), 25 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 66a40b962e183..33896b65cd9f2 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ 
-110,6 +110,7 @@ is_empty_indexer, is_scalar_indexer, ) +from pandas.core.internals.methods import putmask_flexible_ea import pandas.core.missing as missing if TYPE_CHECKING: @@ -1411,30 +1412,7 @@ def putmask(self, mask, new) -> list[Block]: """ See Block.putmask.__doc__ """ - mask = extract_bool_array(mask) - - new_values = self.values - - if mask.ndim == new_values.ndim + 1: - # TODO(EA2D): unnecessary with 2D EAs - mask = mask.reshape(new_values.shape) - - try: - # Caller is responsible for ensuring matching lengths - new_values._putmask(mask, new) - except TypeError: - if not is_interval_dtype(self.dtype): - # Discussion about what we want to support in the general - # case GH#39584 - raise - - blk = self.coerce_to_target_dtype(new) - if blk.dtype == _dtype_obj: - # For now at least, only support casting e.g. - # Interval[int64]->Interval[float64], - raise - return blk.putmask(mask, new) - + new_values = putmask_flexible_ea(self.values, mask, new) nb = type(self)(new_values, placement=self._mgr_locs, ndim=self.ndim) return [nb] diff --git a/pandas/core/internals/methods.py b/pandas/core/internals/methods.py index 086973f002b43..e844c905accd3 100644 --- a/pandas/core/internals/methods.py +++ b/pandas/core/internals/methods.py @@ -91,6 +91,10 @@ def putmask_flexible_ea(array: ExtensionArray, mask, new): """ mask = extract_bool_array(mask) + if mask.ndim == array.ndim + 1: + # TODO(EA2D): unnecessary with 2D EAs + mask = mask.reshape(array.shape) + if isinstance(array, NDArrayBackedExtensionArray): if not can_hold_element(array, new): array = _coerce_to_target_dtype(array, new) @@ -109,6 +113,6 @@ def putmask_flexible_ea(array: ExtensionArray, mask, new): # For now at least, only support casting e.g. 
# Interval[int64]->Interval[float64], raise - return putmask_flexible(array, mask, new) + return putmask_flexible_ea(array, mask, new) return array From 78017a05fc482d0689267f17442526b20f4a298e Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Nov 2021 09:17:51 +0100 Subject: [PATCH 9/9] fixup signature for mypy --- pandas/core/internals/array_manager.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 3b958db827c7c..e318659d9f355 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -188,7 +188,13 @@ def __repr__(self) -> str: output += f"\n{arr.dtype}" return output - def apply(self: T, f, ignore_failures: bool = False, **kwargs) -> T: + def apply( + self: T, + f, + align_keys: list[str] | None = None, # not used for ArrayManager + ignore_failures: bool = False, + **kwargs, + ) -> T: """ Iterate over the arrays, collect and create a new ArrayManager. @@ -196,7 +202,6 @@ def apply(self: T, f, ignore_failures: bool = False, **kwargs) -> T: ---------- f : str or callable Name of the Array method to apply. - align_keys: List[str] or None, default None ignore_failures: bool, default False **kwargs Keywords to pass to `f`