-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: ExtensionArray.fillna #19909
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
ENH: ExtensionArray.fillna #19909
Changes from 12 commits
74614cb
67a19ba
280ac94
6705712
69a0f9b
4d6846b
f3b81dc
70efbb8
8fc3851
39096e5
9e44f88
e902f18
17126e6
1106ef2
744c381
9a3fa55
2bc43bc
f22fd7b
c26d261
b342efe
a609f48
a35f93c
3f78dec
1160e15
c9c7a48
05fced6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -216,6 +216,70 @@ def isna(self): | |
""" | ||
raise AbstractMethodError(self) | ||
|
||
def fillna(self, value=None, method=None, limit=None): | ||
""" Fill NA/NaN values using the specified method. | ||
|
||
Parameters | ||
---------- | ||
value : scalar, array-like | ||
If a scalar value is passed it is used to fill all missing values. | ||
Alternatively, an array-like 'value' can be given. It's expected | ||
that the array-like have the same length as 'self'. | ||
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None | ||
Method to use for filling holes in reindexed Series | ||
pad / ffill: propagate last valid observation forward to next valid | ||
backfill / bfill: use NEXT valid observation to fill gap | ||
limit : int, default None | ||
If method is specified, this is the maximum number of consecutive | ||
NaN values to forward/backward fill. In other words, if there is | ||
a gap with more than this number of consecutive NaNs, it will only | ||
be partially filled. If method is not specified, this is the | ||
maximum number of entries along the entire axis where NaNs will be | ||
filled. | ||
|
||
Returns | ||
------- | ||
filled : ExtensionArray with NA/NaN filled | ||
""" | ||
from pandas.api.types import is_scalar | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. A I'll see what I can do to simplify this. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How would that be different with |
||
from pandas.util._validators import validate_fillna_kwargs | ||
from pandas.core.missing import pad_1d, backfill_1d | ||
from pandas.core.dtypes.common import _ensure_platform_int | ||
from pandas._libs.tslib import iNaT | ||
|
||
value, method = validate_fillna_kwargs(value, method) | ||
|
||
mask = self.isna() | ||
|
||
if not is_scalar(value): | ||
if len(value) != len(self): | ||
raise ValueError("Length of 'value' does not match. Got ({}) " | ||
" expected {}".format(len(value), len(self))) | ||
value = value[mask] | ||
|
||
if mask.any(): | ||
if method is not None: | ||
# ffill / bfill | ||
# The basic idea is to create an array of integer positions. | ||
# Internally, we use iNaT and the datetime filling routines | ||
# to avoid floating-point NaN. Once filled, we take on `self` | ||
# to get the actual values. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am still not clear why you are writing another implementation of fill here. We already have one for object dtypes. I would suggest using that one. Yes I get that you want one for the extension classes, but if that is the case then I would replace the algos we have with THIS one. There is no reason to have 2 floating around. Then this will be fully tested and you can look at performance and such. I don't really have a preference to the these 2 options. The extension classes implementation MUST be integrated if its in the base class. |
||
func = pad_1d if method == 'pad' else backfill_1d | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, by |
||
idx = np.arange(len(self), dtype='int64') | ||
idx[mask] = iNaT | ||
idx = _ensure_platform_int(func(idx, mask=mask, | ||
limit=limit, | ||
dtype='datetime64[ns]')) | ||
idx[idx == iNaT] = -1 # missing value marker for take. | ||
new_values = self.take(idx) | ||
else: | ||
# fill with value | ||
new_values = self.copy() | ||
new_values[mask] = value | ||
else: | ||
new_values = self.copy() | ||
return new_values | ||
|
||
# ------------------------------------------------------------------------ | ||
# Indexing methods | ||
# ------------------------------------------------------------------------ | ||
|
@@ -253,6 +317,7 @@ def take(self, indexer, allow_fill=True, fill_value=None): | |
.. code-block:: python | ||
|
||
def take(self, indexer, allow_fill=True, fill_value=None): | ||
indexer = np.asarray(indexer) | ||
mask = indexer == -1 | ||
result = self.data.take(indexer) | ||
result[mask] = np.nan # NA for this type | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -13,7 +13,8 @@ | |
from pandas.core.dtypes.missing import isna, notna | ||
from pandas.core.dtypes.cast import ( | ||
maybe_infer_to_datetimelike, | ||
coerce_indexer_dtype) | ||
coerce_indexer_dtype, | ||
tolist) | ||
from pandas.core.dtypes.dtypes import CategoricalDtype | ||
from pandas.core.dtypes.common import ( | ||
_ensure_int64, | ||
|
@@ -475,9 +476,7 @@ def tolist(self): | |
(for str, int, float) or a pandas scalar | ||
(for Timestamp/Timedelta/Interval/Period) | ||
""" | ||
if is_datetimelike(self.categories): | ||
return [com._maybe_box_datetimelike(x) for x in self] | ||
return np.array(self).tolist() | ||
return tolist(self) | ||
|
||
@property | ||
def base(self): | ||
|
@@ -1587,16 +1586,16 @@ def fillna(self, value=None, method=None, limit=None): | |
|
||
Parameters | ||
---------- | ||
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None | ||
Method to use for filling holes in reindexed Series | ||
pad / ffill: propagate last valid observation forward to next valid | ||
backfill / bfill: use NEXT valid observation to fill gap | ||
value : scalar, dict, Series | ||
If a scalar value is passed it is used to fill all missing values. | ||
Alternatively, a Series or dict can be used to fill in different | ||
values for each index. The value should not be a list. The | ||
value(s) passed should either be in the categories or should be | ||
NaN. | ||
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None | ||
Method to use for filling holes in reindexed Series | ||
pad / ffill: propagate last valid observation forward to next valid | ||
backfill / bfill: use NEXT valid observation to fill gap | ||
limit : int, default None | ||
(Not implemented yet for Categorical!) | ||
If method is specified, this is the maximum number of consecutive | ||
|
@@ -1712,7 +1711,7 @@ def __len__(self): | |
|
||
def __iter__(self): | ||
"""Returns an Iterator over the values of this Categorical.""" | ||
return iter(self.get_values()) | ||
return iter(self.get_values().tolist()) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a reason this does There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think to avoid an infinite loop.
Need to do There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It just feels that we are doing to much work. To iterate, we first create an array, iterate through it to create a list, and then iterate through the list .. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So get_values either returns a array or a Datetime(like)Index. For the array we can just iterate over it, but for the Datetime(like)Index I think as well . The tolist implementation there is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Sorry, that previous comment was of course wrong, as that is exactly checking the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, yes, therefore we need the tolist. It's all just complex with the different unboxing depending on the type :-) There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Simplified slightly if you want to take one last look. Basically, we didn't have to worry about the different unboxing for different types, since There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Looks good! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is the whole There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Any issues with the current implementation? Currently there's a tradeoff between how expensive Somewhere, we have to go from numpy scalars to Python scalars. The fastest way to do that is with
so that |
||
|
||
def _tidy_repr(self, max_vals=10, footer=True): | ||
""" a short repr displaying only max_vals and an optional (but default | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -1963,6 +1963,23 @@ def concat_same_type(self, to_concat, placement=None): | |
return self.make_block_same_class(values, ndim=self.ndim, | ||
placement=placement) | ||
|
||
def fillna(self, value, limit=None, inplace=False, downcast=None, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was moved from Categorical.fillna with changes 1.) Removed check for limit, since that's done in |
||
mgr=None): | ||
values = self.values if inplace else self.values.copy() | ||
values = values.fillna(value=value, limit=limit) | ||
return [self.make_block_same_class(values=values, | ||
placement=self.mgr_locs, | ||
ndim=self.ndim)] | ||
|
||
def interpolate(self, method='pad', axis=0, inplace=False, limit=None, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was just a move from |
||
fill_value=None, **kwargs): | ||
|
||
values = self.values if inplace else self.values.copy() | ||
return self.make_block_same_class( | ||
values=values.fillna(value=fill_value, method=method, | ||
limit=limit), | ||
placement=self.mgr_locs) | ||
|
||
|
||
class NumericBlock(Block): | ||
__slots__ = () | ||
|
@@ -2522,27 +2539,6 @@ def _try_coerce_result(self, result): | |
|
||
return result | ||
|
||
def fillna(self, value, limit=None, inplace=False, downcast=None, | ||
mgr=None): | ||
# we may need to upcast our fill to match our dtype | ||
if limit is not None: | ||
raise NotImplementedError("specifying a limit for 'fillna' has " | ||
"not been implemented yet") | ||
|
||
values = self.values if inplace else self.values.copy() | ||
values = self._try_coerce_result(values.fillna(value=value, | ||
limit=limit)) | ||
return [self.make_block(values=values)] | ||
|
||
def interpolate(self, method='pad', axis=0, inplace=False, limit=None, | ||
fill_value=None, **kwargs): | ||
|
||
values = self.values if inplace else self.values.copy() | ||
return self.make_block_same_class( | ||
values=values.fillna(fill_value=fill_value, method=method, | ||
limit=limit), | ||
placement=self.mgr_locs) | ||
|
||
def shift(self, periods, axis=0, mgr=None): | ||
return self.make_block_same_class(values=self.values.shift(periods), | ||
placement=self.mgr_locs) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
value before method ? (order of signature)