-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
[ArrayManager] Array version of fillna logic #41104
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
dce2341
d0e00b3
a1b3b6a
c645017
3226ef5
643b6a5
e0de7b8
a6f74db
bdf9c5a
e67faa1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,9 +26,17 @@ | |
) | ||
from pandas.compat._optional import import_optional_dependency | ||
|
||
from pandas.core.dtypes.cast import infer_dtype_from | ||
from pandas.core.dtypes.cast import ( | ||
astype_array_safe, | ||
can_hold_element, | ||
find_common_type, | ||
infer_dtype_from, | ||
maybe_downcast_to_dtype, | ||
soft_convert_objects, | ||
) | ||
from pandas.core.dtypes.common import ( | ||
is_array_like, | ||
is_categorical_dtype, | ||
is_numeric_v_string_like, | ||
needs_i8_conversion, | ||
) | ||
|
@@ -38,6 +46,8 @@ | |
na_value_for_dtype, | ||
) | ||
|
||
from pandas.core.construction import extract_array | ||
|
||
if TYPE_CHECKING: | ||
from pandas import Index | ||
|
||
|
@@ -964,3 +974,107 @@ def _rolling_window(a: np.ndarray, window: int): | |
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) | ||
strides = a.strides + (a.strides[-1],) | ||
return np.lib.stride_tricks.as_strided(a, shape=shape, strides=strides) | ||
|
||
|
||
def _can_hold_element(values, element: Any) -> bool: | ||
""" | ||
Expanded version of core.dtypes.cast.can_hold_element | ||
""" | ||
from pandas.core.arrays import ( | ||
ExtensionArray, | ||
FloatingArray, | ||
IntegerArray, | ||
) | ||
|
||
if isinstance(values, ExtensionArray): | ||
if hasattr(values, "_validate_setitem_value"): | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
# NDArrayBackedExtensionArray | ||
try: | ||
# error: "Callable[..., Any]" has no attribute "_validate_setitem_value" | ||
values._validate_setitem_value(element) # type: ignore[attr-defined] | ||
return True | ||
except (ValueError, TypeError): | ||
return False | ||
else: | ||
# other ExtensionArrays | ||
return True | ||
else: | ||
element = extract_array(element, extract_numpy=True) | ||
if isinstance(element, (IntegerArray, FloatingArray)): | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if element._mask.any(): | ||
return False | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
return can_hold_element(values, element) | ||
|
||
|
||
def coerce_to_target_dtype(values, other): | ||
jorisvandenbossche marked this conversation as resolved.
Show resolved
Hide resolved
|
||
""" | ||
Coerce the values to a dtype compat for other. This will always | ||
return values, possibly object dtype, and not raise. | ||
""" | ||
dtype, _ = infer_dtype_from(other, pandas_dtype=True) | ||
new_dtype = find_common_type([values.dtype, dtype]) | ||
|
||
return astype_array_safe(values, new_dtype, copy=False) | ||
|
||
|
||
def fillna_array(values, value, limit=None, inplace: bool = False, downcast=None): | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could be used for Block.fillna too? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't directly see an easy way to do that. The Block.fillna additionally has the "split" logic, which I personally don't want to put in here. And it's not directly straightforward to split a part of Block.fillna. I could for example let this function return some sentinel like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
100% agree I've been looking at simplifying the downcasting in a way that would make the splitting much easier to separate the splitting from non-splitting (not just for fillna) (xref #44241, #40988). It would be an API change so wouldn't be implement in time to simplify this (and your putmask PR), but would eventually allow for some cleanup. |
||
from pandas.core.array_algos.putmask import ( | ||
putmask_inplace, | ||
validate_putmask, | ||
) | ||
from pandas.core.arrays import ExtensionArray | ||
|
||
# inplace = validate_bool_kwarg(inplace, "inplace") | ||
|
||
if isinstance(values, ExtensionArray): | ||
if ( | ||
not _can_hold_element(values, value) | ||
and values.dtype.kind != "m" | ||
and not is_categorical_dtype(values.dtype) | ||
): | ||
# We support filling a DatetimeTZ with a `value` whose timezone | ||
# is different by coercing to object. | ||
# TODO: don't special-case td64 | ||
values = values.astype(object) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK if you want to keep this copy/pasted version, but i think using coerce_to_target_dtype would be more Technically Correct here |
||
return fillna_array( | ||
values, value, limit=limit, inplace=True, downcast=downcast | ||
) | ||
|
||
values = values if inplace else values.copy() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think EA.fillna is never inplace, so this may be making an unnecessary copy (or not being inplace even when the user specifies it) (applicable to status quo too) |
||
return values.fillna(value, limit=limit) | ||
|
||
mask = isna(values) | ||
mask, noop = validate_putmask(values, mask) | ||
|
||
if limit is not None: | ||
limit = algos.validate_limit(None, limit=limit) | ||
mask[mask.cumsum(values.ndim - 1) > limit] = False | ||
|
||
if values.dtype.kind in ["b", "i", "u"]: | ||
# those dtypes can never hold NAs | ||
if inplace: | ||
return values | ||
else: | ||
return values.copy() | ||
|
||
if _can_hold_element(values, value): | ||
values = values if inplace else values.copy() | ||
putmask_inplace(values, mask, value) | ||
|
||
if values.dtype == np.dtype(object): | ||
if downcast is None: | ||
values = soft_convert_objects(values, datetime=True, numeric=False) | ||
|
||
if downcast is None and values.dtype.kind not in ["f", "m", "M"]: | ||
downcast = "infer" | ||
if downcast: | ||
values = maybe_downcast_to_dtype(values, downcast) | ||
return values | ||
|
||
if noop: | ||
# we can't process the value, but nothing to do | ||
return values if inplace else values.copy() | ||
else: | ||
values = coerce_to_target_dtype(values, value) | ||
# bc we have already cast, inplace=True may avoid an extra copy | ||
return fillna_array(values, value, limit=limit, inplace=True, downcast=None) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
is there a version of this that makes the existing can_hold_element more accurate instead of implementing another function?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not fully sure what I was thinking in April / if that was already the case, but so right now the existing
can_hold_element
actually fully covers what I needed here.It only doesn't do
extract_array
compared toBlock._can_hold_element
, so kept that here for now. But probably that could be moved tocan_hold_element
as well (will check later).