From dce234171c53befb76789505723d5aeccea184be Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Thu, 22 Apr 2021 14:38:35 +0200 Subject: [PATCH 1/6] [ArrayManager] Array version of fillna logic --- pandas/core/internals/array_manager.py | 5 +- pandas/core/missing.py | 116 +++++++++++++++++++++- pandas/tests/frame/methods/test_fillna.py | 6 -- 3 files changed, 118 insertions(+), 9 deletions(-) diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 8c9902d330eee..180446b4d3136 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -88,6 +88,7 @@ new_block, to_native_types, ) +from pandas.core.missing import fillna_array if TYPE_CHECKING: from pandas import Float64Index @@ -462,8 +463,8 @@ def shift(self: T, periods: int, axis: int, fill_value) -> T: ) def fillna(self: T, value, limit, inplace: bool, downcast) -> T: - return self.apply_with_block( - "fillna", value=value, limit=limit, inplace=inplace, downcast=downcast + return self.apply( + fillna_array, value=value, limit=limit, inplace=inplace, downcast=downcast ) def downcast(self: T) -> T: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 8849eb0670faa..c2c003da5ffbe 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -26,9 +26,17 @@ ) from pandas.compat._optional import import_optional_dependency -from pandas.core.dtypes.cast import infer_dtype_from +from pandas.core.dtypes.cast import ( + astype_array_safe, + can_hold_element, + find_common_type, + infer_dtype_from, + maybe_downcast_to_dtype, + soft_convert_objects, +) from pandas.core.dtypes.common import ( is_array_like, + is_categorical_dtype, is_numeric_v_string_like, needs_i8_conversion, ) @@ -38,6 +46,8 @@ na_value_for_dtype, ) +from pandas.core.construction import extract_array + if TYPE_CHECKING: from pandas import Index @@ -964,3 +974,107 @@ def _rolling_window(a: np.ndarray, window: int): shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) strides = a.strides + (a.strides[-1],) return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) + + +def _can_hold_element(values, element: Any) -> bool: + """ + Expanded version of core.dtypes.cast.can_hold_element + """ + from pandas.core.arrays import ( + ExtensionArray, + FloatingArray, + IntegerArray, + ) + + if isinstance(values, ExtensionArray): + if hasattr(values, "_validate_setitem_value"): + # NDArrayBackedExtensionArray + try: + # error: "Callable[..., Any]" has no attribute "_validate_setitem_value" + values._validate_setitem_value(element) # type: ignore[attr-defined] + return True + except (ValueError, TypeError): + return False + else: + # other ExtensionArrays + return True + else: + element = extract_array(element, extract_numpy=True) + if isinstance(element, (IntegerArray, FloatingArray)): + if element._mask.any(): + return False + return can_hold_element(values, element) + + +def coerce_to_target_dtype(values, other): + """ + Coerce the values to a dtype compat for other. This will always + return values, possibly object dtype, and not raise. + """ + dtype, _ = infer_dtype_from(other, pandas_dtype=True) + new_dtype = find_common_type([values.dtype, dtype]) + + return astype_array_safe(values, new_dtype, copy=False) + + +def fillna_array(values, value, limit=None, inplace: bool = False, downcast=None): + from pandas.core.array_algos.putmask import ( + putmask_inplace, + validate_putmask, + ) + from pandas.core.arrays import ExtensionArray + + # inplace = validate_bool_kwarg(inplace, "inplace") + + if isinstance(values, ExtensionArray): + if ( + not _can_hold_element(values, value) + and values.dtype.kind != "m" + and not is_categorical_dtype(values.dtype) + ): + # We support filling a DatetimeTZ with a `value` whose timezone + # is different by coercing to object. + # TODO: don't special-case td64 + values = values.astype(object) + return fillna_array( + values, value, limit=limit, inplace=True, downcast=downcast + ) + + values = values if inplace else values.copy() + return values.fillna(value, limit=limit) + + mask = isna(values) + mask, noop = validate_putmask(values, mask) + + if limit is not None: + limit = algos.validate_limit(None, limit=limit) + mask[mask.cumsum(values.ndim - 1) > limit] = False + + if values.dtype.kind in ["b", "i", "u"]: + # those dtypes can never hold NAs + if inplace: + return values + else: + return values.copy() + + if _can_hold_element(values, value): + values = values if inplace else values.copy() + putmask_inplace(values, mask, value) + + if values.dtype == np.dtype(object): + if downcast is None: + values = soft_convert_objects(values, datetime=True, numeric=False) + + if downcast is None and values.dtype.kind not in ["f", "m", "M"]: + downcast = "infer" + if downcast: + values = maybe_downcast_to_dtype(values, downcast) + return values + + if noop: + # we can't process the value, but nothing to do + return values if inplace else values.copy() + else: + values = coerce_to_target_dtype(values, value) + # bc we have already cast, inplace=True may avoid an extra copy + return fillna_array(values, value, limit=limit, inplace=True, downcast=None) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index b827547b0753e..e8b5a8ebc7f52 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -1,8 +1,6 @@ import numpy as np import pytest -import pandas.util._test_decorators as td - from pandas import ( Categorical, DataFrame, @@ -232,7 +230,6 @@ def test_fillna_categorical_nan(self): df = DataFrame({"a": Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=NaT), df) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) implement downcast def test_fillna_downcast(self): # GH#15277 # infer int64 from float64 @@ -247,7 +244,6 @@ def test_fillna_downcast(self): expected = DataFrame({"a": [1, 0]}) tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) object upcasting def test_fillna_dtype_conversion(self): # make sure that fillna on an empty frame works df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5]) @@ -265,7 +261,6 @@ def test_fillna_dtype_conversion(self): expected = DataFrame("nan", index=range(3), columns=["A", "B"]) tm.assert_frame_equal(result, expected) - @td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) object upcasting @pytest.mark.parametrize("val", ["", 1, np.nan, 1.0]) def test_fillna_dtype_conversion_equiv_replace(self, val): df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]}) @@ -273,7 +268,6 @@ def test_fillna_dtype_conversion_equiv_replace(self, val): result = df.fillna(val) tm.assert_frame_equal(result, expected) - @td.skip_array_manager_invalid_test def test_fillna_datetime_columns(self): # GH#7095 df = DataFrame( From a1b3b6a802dca004f83cea1c4c3e264f73d0966d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 7 Jul 2021 21:24:56 +0200 Subject: [PATCH 2/6] simplify can_hold_element --- pandas/core/missing.py | 26 ++------------------------ 1 file changed, 2 insertions(+), 24 deletions(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 6797efda3130b..11da3699bd710 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -980,30 +980,8 @@ def _can_hold_element(values, element: Any) -> bool: """ Expanded version of core.dtypes.cast.can_hold_element """ - from pandas.core.arrays import ( - ExtensionArray, - FloatingArray, - IntegerArray, - ) - - if isinstance(values, ExtensionArray): - if hasattr(values, "_validate_setitem_value"): - # NDArrayBackedExtensionArray - try: - # error: "Callable[..., Any]" has no attribute "_validate_setitem_value" - values._validate_setitem_value(element) # type: ignore[attr-defined] - return True - except (ValueError, TypeError): - return False - else: - # other ExtensionArrays - return True - else: - element = extract_array(element, extract_numpy=True) - if isinstance(element, (IntegerArray, FloatingArray)): - if element._mask.any(): - return False - return can_hold_element(values, element) + element = extract_array(element, extract_numpy=True) + return can_hold_element(values, element) def coerce_to_target_dtype(values, other): From 3226ef53c9f7639895573feeddbb66411464a176 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Nov 2021 10:06:26 +0100 Subject: [PATCH 3/6] share EA fillna logic between Block and Array manager --- pandas/core/internals/blocks.py | 36 ++++++++++++--------------------- pandas/core/missing.py | 33 +++++++++++++++--------------- 2 files changed, 30 insertions(+), 39 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 46e5b5b9c53ad..776399e31f290 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1358,6 +1358,19 @@ def interpolate( new_values = values.fillna(value=fill_value, method=method, limit=limit) return self.make_block_same_class(new_values) + def fillna( + self, value, limit=None, inplace: bool = False, downcast=None + ) -> list[Block]: + + res_values = missing.fillna_ea_array( + self.values.ravel(), value, limit=limit, inplace=inplace, downcast=downcast + ) + res_values = ensure_block_shape(res_values, self.ndim) + + if res_values.dtype == object: + return [self.make_block(values=res_values)] + return [self.make_block_same_class(values=res_values)] + class ExtensionBlock(libinternals.Block, EABackedBlock): """ @@ -1564,12 +1577,6 @@ def getitem_block_index(self, slicer: slice) -> ExtensionBlock: new_values = self.values[slicer] return type(self)(new_values, self._mgr_locs, ndim=self.ndim) - def fillna( - self, value, limit=None, inplace: bool = False, downcast=None - ) -> list[Block]: - values = self.values.fillna(value=value, limit=limit) - return [self.make_block_same_class(values=values)] - def diff(self, n: int, axis: int = 1) -> list[Block]: if axis == 0 and n != 0: # n==0 case will be a no-op so let is fall through @@ -1754,23 +1761,6 @@ def shift(self, periods: int, axis: int = 0, fill_value: Any = None) -> list[Blo new_values = values.shift(periods, fill_value=fill_value, axis=axis) return [self.make_block_same_class(new_values)] - def fillna( - self, value, limit=None, inplace: bool = False, downcast=None - ) -> list[Block]: - - if not self._can_hold_element(value) and self.dtype.kind != "m": - # We support filling a DatetimeTZ with a `value` whose timezone - # is different by coercing to object. - # TODO: don't special-case td64 - return self.coerce_to_target_dtype(value).fillna( - value, limit, inplace, downcast - ) - - values = self.values - values = values if inplace else values.copy() - new_values = values.fillna(value=value, limit=limit) - return [self.make_block_same_class(values=new_values)] - class DatetimeLikeBlock(NDArrayBackedExtensionBlock): """Block for datetime64[ns], timedelta64[ns].""" diff --git a/pandas/core/missing.py b/pandas/core/missing.py index ac97dddff551e..c011d20c4d24c 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -37,7 +37,6 @@ ) from pandas.core.dtypes.common import ( is_array_like, - is_categorical_dtype, is_numeric_v_string_like, needs_i8_conversion, ) @@ -1004,6 +1003,20 @@ def coerce_to_target_dtype(values, other): return astype_array_safe(values, new_dtype, copy=False) +def fillna_ea_array(values, value, limit=None, inplace: bool = False, downcast=None): + + if not _can_hold_element(values, value) and values.dtype.kind == "M": + # We support filling a DatetimeTZ with a `value` whose timezone + # is different by coercing to object. + # TODO: don't special-case td64 + values = values.astype(object) + # breakpoint() + return fillna_array(values, value, limit=limit, inplace=True, downcast=downcast) + + values = values if inplace else values.copy() + return values.fillna(value, limit=limit) + + def fillna_array(values, value, limit=None, inplace: bool = False, downcast=None): from pandas.core.array_algos.putmask import ( putmask_inplace, @@ -1014,21 +1027,9 @@ def fillna_array(values, value, limit=None, inplace: bool = False, downcast=None # inplace = validate_bool_kwarg(inplace, "inplace") if isinstance(values, ExtensionArray): - if ( - not _can_hold_element(values, value) - and values.dtype.kind != "m" - and not is_categorical_dtype(values.dtype) - ): - # We support filling a DatetimeTZ with a `value` whose timezone - # is different by coercing to object. - # TODO: don't special-case td64 - values = values.astype(object) - return fillna_array( - values, value, limit=limit, inplace=True, downcast=downcast - ) - - values = values if inplace else values.copy() - return values.fillna(value, limit=limit) + return fillna_ea_array( + values, value, limit=limit, inplace=inplace, downcast=downcast + ) mask = isna(values) mask, noop = validate_putmask(values, mask) From 643b6a50b3e699c072509d3bf2275cb34a74370a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Nov 2021 10:13:17 +0100 Subject: [PATCH 4/6] add docstrings --- pandas/core/missing.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/core/missing.py b/pandas/core/missing.py index c011d20c4d24c..1fefbc7b107c2 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -1004,13 +1004,18 @@ def coerce_to_target_dtype(values, other): def fillna_ea_array(values, value, limit=None, inplace: bool = False, downcast=None): + """ + Fillna logic for ExtensionArrays. + Dispatches to the EA.fillna method (in which case downcast is currently + ignored), except for datetime64 in which case fallback to object dtype + is currently allowed. + """ if not _can_hold_element(values, value) and values.dtype.kind == "M": # We support filling a DatetimeTZ with a `value` whose timezone # is different by coercing to object. # TODO: don't special-case td64 values = values.astype(object) - # breakpoint() return fillna_array(values, value, limit=limit, inplace=True, downcast=downcast) values = values if inplace else values.copy() @@ -1018,6 +1023,11 @@ def fillna_ea_array(values, value, limit=None, inplace: bool = False, downcast=N def fillna_array(values, value, limit=None, inplace: bool = False, downcast=None): + """ + Fillna logic for np.ndarray/ExtensionArray. + + This includes the logic for downcasting if needed. + """ from pandas.core.array_algos.putmask import ( putmask_inplace, validate_putmask, From e0de7b86d135aecddb102d509315c770ae9aee25 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 15 Nov 2021 10:14:52 +0100 Subject: [PATCH 5/6] move coerce_to_target_dtype to dtypes.cast --- pandas/core/dtypes/cast.py | 11 +++++++++++ pandas/core/missing.py | 14 +------------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 9cd67ad293f63..bc076a5d1080d 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -1849,6 +1849,17 @@ def find_common_type(types: list[DtypeObj]) -> DtypeObj: return np.find_common_type(types, []) # type: ignore[arg-type] +def coerce_to_target_dtype(values, other): + """ + Coerce the values to a dtype compat for other. This will always + return values, possibly object dtype, and not raise. + """ + dtype, _ = infer_dtype_from(other, pandas_dtype=True) + new_dtype = find_common_type([values.dtype, dtype]) + + return astype_array_safe(values, new_dtype, copy=False) + + def construct_2d_arraylike_from_scalar( value: Scalar, length: int, width: int, dtype: np.dtype, copy: bool ) -> np.ndarray: diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 1fefbc7b107c2..0bcf5bb8f181a 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -28,9 +28,8 @@ from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.cast import ( - astype_array_safe, can_hold_element, - find_common_type, + coerce_to_target_dtype, infer_dtype_from, maybe_downcast_to_dtype, soft_convert_objects, @@ -992,17 +991,6 @@ def _can_hold_element(values, element: Any) -> bool: return can_hold_element(values, element) -def coerce_to_target_dtype(values, other): - """ - Coerce the values to a dtype compat for other. This will always - return values, possibly object dtype, and not raise. - """ - dtype, _ = infer_dtype_from(other, pandas_dtype=True) - new_dtype = find_common_type([values.dtype, dtype]) - - return astype_array_safe(values, new_dtype, copy=False) - - def fillna_ea_array(values, value, limit=None, inplace: bool = False, downcast=None): """ Fillna logic for ExtensionArrays. From bdf9c5ac4d56780ef7ef54ba32c51ddccff1996a Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Wed, 24 Nov 2021 20:00:04 +0100 Subject: [PATCH 6/6] correct shape in Block.fillna --- pandas/core/internals/blocks.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index c584010132abc..43ed2d4eebdc9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -1408,7 +1408,7 @@ def fillna( res_values = missing.fillna_ea_array( self.values.ravel(), value, limit=limit, inplace=inplace, downcast=downcast ) - res_values = ensure_block_shape(res_values, self.ndim) + res_values = ensure_block_shape(res_values, self.ndim, self.shape) if res_values.dtype == object: return [self.make_block(values=res_values)] @@ -2026,7 +2026,9 @@ def extend_blocks(result, blocks=None) -> list[Block]: return blocks -def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: +def ensure_block_shape( + values: ArrayLike, ndim: int = 1, shape: tuple = (1, -1) +) -> ArrayLike: """ Reshape if possible to have values.ndim == ndim. """ @@ -2037,7 +2039,7 @@ def ensure_block_shape(values: ArrayLike, ndim: int = 1) -> ArrayLike: # block.shape is incorrect for "2D" ExtensionArrays # We can't, and don't need to, reshape. values = cast("np.ndarray | DatetimeArray | TimedeltaArray", values) - values = values.reshape(1, -1) + values = values.reshape(*shape) return values