Skip to content

[ArrayManager] Array version of fillna logic #41104

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
5 changes: 3 additions & 2 deletions pandas/core/internals/array_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@
new_block,
to_native_types,
)
from pandas.core.missing import fillna_array

if TYPE_CHECKING:
from pandas import Float64Index
Expand Down Expand Up @@ -462,8 +463,8 @@ def shift(self: T, periods: int, axis: int, fill_value) -> T:
)

def fillna(self: T, value, limit, inplace: bool, downcast) -> T:
return self.apply_with_block(
"fillna", value=value, limit=limit, inplace=inplace, downcast=downcast
return self.apply(
fillna_array, value=value, limit=limit, inplace=inplace, downcast=downcast
)

def downcast(self: T) -> T:
Expand Down
116 changes: 115 additions & 1 deletion pandas/core/missing.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,17 @@
)
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.cast import infer_dtype_from
from pandas.core.dtypes.cast import (
astype_array_safe,
can_hold_element,
find_common_type,
infer_dtype_from,
maybe_downcast_to_dtype,
soft_convert_objects,
)
from pandas.core.dtypes.common import (
is_array_like,
is_categorical_dtype,
is_numeric_v_string_like,
needs_i8_conversion,
)
Expand All @@ -38,6 +46,8 @@
na_value_for_dtype,
)

from pandas.core.construction import extract_array

if TYPE_CHECKING:
from pandas import Index

Expand Down Expand Up @@ -964,3 +974,107 @@ def _rolling_window(a: np.ndarray, window: int):
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)


def _can_hold_element(values, element: Any) -> bool:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there a version of this that makes the existing can_hold_element more accurate instead of implementing another function?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not fully sure what I was thinking in April / if that was already the case, but so right now the existing can_hold_element actually fully covers what I needed here.

It only doesn't do extract_array compared to Block._can_hold_element, so kept that here for now. But probably that could be moved to can_hold_element as well (will check later).

"""
Expanded version of core.dtypes.cast.can_hold_element
"""
from pandas.core.arrays import (
ExtensionArray,
FloatingArray,
IntegerArray,
)

if isinstance(values, ExtensionArray):
if hasattr(values, "_validate_setitem_value"):
# NDArrayBackedExtensionArray
try:
# error: "Callable[..., Any]" has no attribute "_validate_setitem_value"
values._validate_setitem_value(element) # type: ignore[attr-defined]
return True
except (ValueError, TypeError):
return False
else:
# other ExtensionArrays
return True
else:
element = extract_array(element, extract_numpy=True)
if isinstance(element, (IntegerArray, FloatingArray)):
if element._mask.any():
return False
return can_hold_element(values, element)


def coerce_to_target_dtype(values, other):
"""
Coerce the values to a dtype compat for other. This will always
return values, possibly object dtype, and not raise.
"""
dtype, _ = infer_dtype_from(other, pandas_dtype=True)
new_dtype = find_common_type([values.dtype, dtype])

return astype_array_safe(values, new_dtype, copy=False)


def fillna_array(values, value, limit=None, inplace: bool = False, downcast=None):
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

could be used for Block.fillna too?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't directly see an easy way to do that. The Block.fillna additionally has the "split" logic, which I personally don't want to put in here. And it's not directly straightforward to split a part of Block.fillna.

I could for example let this function return some sentinel like NotImplemented or None to signal to Block.fillna that the filling hasn't been done yet, and it should split the block and call fillna again. In which case most of the splitting logic stays in the internals. I could try something like this if you want, but it's also a bit of a hack.

Copy link
Member

@jbrockmendel jbrockmendel Nov 16, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The Block.fillna additionally has the "split" logic, which I personally don't want to put in here

100% agree

I've been looking at simplifying the downcasting in a way that would make the splitting much easier to separate the splitting from non-splitting (not just for fillna) (xref #44241, #40988). It would be an API change so wouldn't be implement in time to simplify this (and your putmask PR), but would eventually allow for some cleanup.

from pandas.core.array_algos.putmask import (
putmask_inplace,
validate_putmask,
)
from pandas.core.arrays import ExtensionArray

# inplace = validate_bool_kwarg(inplace, "inplace")

if isinstance(values, ExtensionArray):
if (
not _can_hold_element(values, value)
and values.dtype.kind != "m"
and not is_categorical_dtype(values.dtype)
):
# We support filling a DatetimeTZ with a `value` whose timezone
# is different by coercing to object.
# TODO: don't special-case td64
values = values.astype(object)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK if you want to keep this copy/pasted version, but i think using coerce_to_target_dtype would be more Technically Correct here

return fillna_array(
values, value, limit=limit, inplace=True, downcast=downcast
)

values = values if inplace else values.copy()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i think EA.fillna is never inplace, so this may be making an unnecessary copy (or not being inplace even when the user specifies it) (applicable to status quo too)

return values.fillna(value, limit=limit)

mask = isna(values)
mask, noop = validate_putmask(values, mask)

if limit is not None:
limit = algos.validate_limit(None, limit=limit)
mask[mask.cumsum(values.ndim - 1) > limit] = False

if values.dtype.kind in ["b", "i", "u"]:
# those dtypes can never hold NAs
if inplace:
return values
else:
return values.copy()

if _can_hold_element(values, value):
values = values if inplace else values.copy()
putmask_inplace(values, mask, value)

if values.dtype == np.dtype(object):
if downcast is None:
values = soft_convert_objects(values, datetime=True, numeric=False)

if downcast is None and values.dtype.kind not in ["f", "m", "M"]:
downcast = "infer"
if downcast:
values = maybe_downcast_to_dtype(values, downcast)
return values

if noop:
# we can't process the value, but nothing to do
return values if inplace else values.copy()
else:
values = coerce_to_target_dtype(values, value)
# bc we have already cast, inplace=True may avoid an extra copy
return fillna_array(values, value, limit=limit, inplace=True, downcast=None)
6 changes: 0 additions & 6 deletions pandas/tests/frame/methods/test_fillna.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import numpy as np
import pytest

import pandas.util._test_decorators as td

from pandas import (
Categorical,
DataFrame,
Expand Down Expand Up @@ -232,7 +230,6 @@ def test_fillna_categorical_nan(self):
df = DataFrame({"a": Categorical(idx)})
tm.assert_frame_equal(df.fillna(value=NaT), df)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) implement downcast
def test_fillna_downcast(self):
# GH#15277
# infer int64 from float64
Expand All @@ -247,7 +244,6 @@ def test_fillna_downcast(self):
expected = DataFrame({"a": [1, 0]})
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) object upcasting
def test_fillna_dtype_conversion(self):
# make sure that fillna on an empty frame works
df = DataFrame(index=["A", "B", "C"], columns=[1, 2, 3, 4, 5])
Expand All @@ -265,15 +261,13 @@ def test_fillna_dtype_conversion(self):
expected = DataFrame("nan", index=range(3), columns=["A", "B"])
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_not_yet_implemented # TODO(ArrayManager) object upcasting
@pytest.mark.parametrize("val", ["", 1, np.nan, 1.0])
def test_fillna_dtype_conversion_equiv_replace(self, val):
df = DataFrame({"A": [1, np.nan], "B": [1.0, 2.0]})
expected = df.replace(np.nan, val)
result = df.fillna(val)
tm.assert_frame_equal(result, expected)

@td.skip_array_manager_invalid_test
def test_fillna_datetime_columns(self):
# GH#7095
df = DataFrame(
Expand Down