Skip to content

Commit c52bf0a

Browse files
authored
REF: share Categorical.fillna with NDArrayBackedExtensionArray (#40383)
1 parent: b92526b; commit: c52bf0a

File tree

6 files changed

+32 -84 lines changed

pandas/core/arrays/_mixins.py

+7 -1
Original file line number | Diff line number | Diff line change
@@ -282,7 +282,9 @@ def __getitem__(
282282
def fillna(
283283
self: NDArrayBackedExtensionArrayT, value=None, method=None, limit=None
284284
) -> NDArrayBackedExtensionArrayT:
285-
value, method = validate_fillna_kwargs(value, method)
285+
value, method = validate_fillna_kwargs(
286+
value, method, validate_scalar_dict_value=False
287+
)
286288

287289
mask = self.isna()
288290
# error: Argument 2 to "check_value_size" has incompatible type
@@ -306,6 +308,10 @@ def fillna(
306308
new_values = self.copy()
307309
new_values[mask] = value
308310
else:
311+
# We validate the fill_value even if there is nothing to fill
312+
if value is not None:
313+
self._validate_setitem_value(value)
314+
309315
new_values = self.copy()
310316
return new_values
311317

pandas/core/arrays/categorical.py

+1 -66
Original file line number | Diff line number | Diff line change
@@ -40,10 +40,7 @@
4040
cache_readonly,
4141
deprecate_kwarg,
4242
)
43-
from pandas.util._validators import (
44-
validate_bool_kwarg,
45-
validate_fillna_kwargs,
46-
)
43+
from pandas.util._validators import validate_bool_kwarg
4744

4845
from pandas.core.dtypes.cast import (
4946
coerce_indexer_dtype,
@@ -105,7 +102,6 @@
105102
sanitize_array,
106103
)
107104
from pandas.core.indexers import deprecate_ndim_indexing
108-
from pandas.core.missing import interpolate_2d
109105
from pandas.core.ops.common import unpack_zerodim_and_defer
110106
from pandas.core.sorting import nargsort
111107
from pandas.core.strings.object_array import ObjectStringArrayMixin
@@ -1752,67 +1748,6 @@ def to_dense(self):
17521748
)
17531749
return np.asarray(self)
17541750

1755-
def fillna(self, value=None, method=None, limit=None):
1756-
"""
1757-
Fill NA/NaN values using the specified method.
1758-
1759-
Parameters
1760-
----------
1761-
value : scalar, dict, Series
1762-
If a scalar value is passed it is used to fill all missing values.
1763-
Alternatively, a Series or dict can be used to fill in different
1764-
values for each index. The value should not be a list. The
1765-
value(s) passed should either be in the categories or should be
1766-
NaN.
1767-
method : {'backfill', 'bfill', 'pad', 'ffill', None}, default None
1768-
Method to use for filling holes in reindexed Series
1769-
pad / ffill: propagate last valid observation forward to next valid
1770-
backfill / bfill: use NEXT valid observation to fill gap
1771-
limit : int, default None
1772-
(Not implemented yet for Categorical!)
1773-
If method is specified, this is the maximum number of consecutive
1774-
NaN values to forward/backward fill. In other words, if there is
1775-
a gap with more than this number of consecutive NaNs, it will only
1776-
be partially filled. If method is not specified, this is the
1777-
maximum number of entries along the entire axis where NaNs will be
1778-
filled.
1779-
1780-
Returns
1781-
-------
1782-
filled : Categorical with NA/NaN filled
1783-
"""
1784-
value, method = validate_fillna_kwargs(
1785-
value, method, validate_scalar_dict_value=False
1786-
)
1787-
value = extract_array(value, extract_numpy=True)
1788-
1789-
if value is None:
1790-
value = np.nan
1791-
if limit is not None:
1792-
raise NotImplementedError(
1793-
"specifying a limit for fillna has not been implemented yet"
1794-
)
1795-
1796-
if method is not None:
1797-
# pad / bfill
1798-
1799-
# TODO: dispatch when self.categories is EA-dtype
1800-
values = np.asarray(self).reshape(-1, len(self))
1801-
values = interpolate_2d(values, method, 0, None).astype(
1802-
self.categories.dtype
1803-
)[0]
1804-
codes = _get_codes_for_values(values, self.categories)
1805-
1806-
else:
1807-
# We copy even if there is nothing to fill
1808-
codes = self._ndarray.copy()
1809-
mask = self.isna()
1810-
1811-
new_codes = self._validate_setitem_value(value)
1812-
np.putmask(codes, mask, new_codes)
1813-
1814-
return self._from_backing_data(codes)
1815-
18161751
# ------------------------------------------------------------------
18171752
# NDArrayBackedExtensionArray compat
18181753

pandas/core/indexes/category.py

+9 -1
Original file line number | Diff line number | Diff line change
@@ -375,7 +375,15 @@ def __contains__(self, key: Any) -> bool:
375375
@doc(Index.fillna)
376376
def fillna(self, value, downcast=None):
377377
value = self._require_scalar(value)
378-
cat = self._data.fillna(value)
378+
try:
379+
cat = self._data.fillna(value)
380+
except (ValueError, TypeError):
381+
# invalid fill_value
382+
if not self.isna().any():
383+
# nothing to fill, we can get away without casting
384+
return self.copy()
385+
return self.astype(object).fillna(value, downcast=downcast)
386+
379387
return type(self)._simple_new(cat, name=self.name)
380388

381389
@doc(Index.unique)

pandas/tests/extension/test_numpy.py

-10
Original file line number | Diff line number | Diff line change
@@ -304,16 +304,6 @@ class TestBooleanReduce(BaseNumPyTests, base.BaseBooleanReduceTests):
304304

305305

306306
class TestMissing(BaseNumPyTests, base.BaseMissingTests):
307-
@skip_nested
308-
def test_fillna_scalar(self, data_missing):
309-
# Non-scalar "scalar" values.
310-
super().test_fillna_scalar(data_missing)
311-
312-
@skip_nested
313-
def test_fillna_no_op_returns_copy(self, data):
314-
# Non-scalar "scalar" values.
315-
super().test_fillna_no_op_returns_copy(data)
316-
317307
@skip_nested
318308
def test_fillna_series(self, data_missing):
319309
# Non-scalar "scalar" values.

pandas/tests/indexes/categorical/test_fillna.py

+11 -4
Original file line number | Diff line number | Diff line change
@@ -13,10 +13,16 @@ def test_fillna_categorical(self):
1313
exp = CategoricalIndex([1.0, 1.0, 3.0, 1.0], name="x")
1414
tm.assert_index_equal(idx.fillna(1.0), exp)
1515

16-
# fill by value not in categories raises ValueError
16+
cat = idx._data
17+
18+
# fill by value not in categories raises ValueError on EA, casts on CI
1719
msg = "Cannot setitem on a Categorical with a new category"
1820
with pytest.raises(ValueError, match=msg):
19-
idx.fillna(2.0)
21+
cat.fillna(2.0)
22+
23+
result = idx.fillna(2.0)
24+
expected = idx.astype(object).fillna(2.0)
25+
tm.assert_index_equal(result, expected)
2026

2127
def test_fillna_copies_with_no_nas(self):
2228
# Nothing to fill, should still get a copy
@@ -37,8 +43,9 @@ def test_fillna_validates_with_no_nas(self):
3743
cat = ci._data
3844

3945
msg = "Cannot setitem on a Categorical with a new category"
40-
with pytest.raises(ValueError, match=msg):
41-
ci.fillna(False)
46+
res = ci.fillna(False)
47+
# nothing to fill, so we don't cast
48+
tm.assert_index_equal(res, ci)
4249

4350
# Same check directly on the Categorical
4451
with pytest.raises(ValueError, match=msg):

pandas/tests/series/methods/test_fillna.py

+4 -2
Original file line number | Diff line number | Diff line change
@@ -671,13 +671,15 @@ def test_fillna_categorical_with_new_categories(self, fill_value, expected_outpu
671671
def test_fillna_categorical_raises(self):
672672
data = ["a", np.nan, "b", np.nan, np.nan]
673673
ser = Series(Categorical(data, categories=["a", "b"]))
674+
cat = ser._values
674675

675676
msg = "Cannot setitem on a Categorical with a new category"
676677
with pytest.raises(ValueError, match=msg):
677678
ser.fillna("d")
678679

679-
with pytest.raises(ValueError, match=msg):
680-
ser.fillna(Series("d"))
680+
msg2 = "Length of 'value' does not match."
681+
with pytest.raises(ValueError, match=msg2):
682+
cat.fillna(Series("d"))
681683

682684
with pytest.raises(ValueError, match=msg):
683685
ser.fillna({1: "d", 3: "a"})

0 commit comments

Comments
 (0)