Skip to content

Commit 103ea6f

Browse files
reidy-p authored and jreback committed
BUG: Accept dict or Series in fillna for categorical Series (pandas-dev#18293)
1 parent b6b9f3f commit 103ea6f

File tree

6 files changed

+165
-90
lines changed

6 files changed

+165
-90
lines changed

doc/source/whatsnew/v0.22.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -28,13 +28,14 @@ Other Enhancements
2828
- :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in output (:issue:`14194`)
2929
- :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`)
3030
- Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`)
31+
- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
3132

3233
.. _whatsnew_0220.api_breaking:
3334

3435
Backwards incompatible API changes
3536
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
3637

37-
-
38+
- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`)
3839
-
3940
-
4041

pandas/core/categorical.py

+32-11
Original file line numberDiff line numberDiff line change
@@ -1623,8 +1623,12 @@ def fillna(self, value=None, method=None, limit=None):
16231623
Method to use for filling holes in reindexed Series
16241624
pad / ffill: propagate last valid observation forward to next valid
16251625
backfill / bfill: use NEXT valid observation to fill gap
1626-
value : scalar
1627-
Value to use to fill holes (e.g. 0)
1626+
value : scalar, dict, Series
1627+
If a scalar value is passed it is used to fill all missing values.
1628+
Alternatively, a Series or dict can be used to fill in different
1629+
values for each index. The value should not be a list. The
1630+
value(s) passed should either be in the categories or should be
1631+
NaN.
16281632
limit : int, default None
16291633
(Not implemented yet for Categorical!)
16301634
If method is specified, this is the maximum number of consecutive
@@ -1665,16 +1669,33 @@ def fillna(self, value=None, method=None, limit=None):
16651669

16661670
else:
16671671

1668-
if not isna(value) and value not in self.categories:
1669-
raise ValueError("fill value must be in categories")
1672+
# If value is a dict or a Series (a dict value has already
1673+
# been converted to a Series)
1674+
if isinstance(value, ABCSeries):
1675+
if not value[~value.isin(self.categories)].isna().all():
1676+
raise ValueError("fill value must be in categories")
1677+
1678+
values_codes = _get_codes_for_values(value, self.categories)
1679+
indexer = np.where(values_codes != -1)
1680+
values[indexer] = values_codes[values_codes != -1]
1681+
1682+
# If value is not a dict or Series it should be a scalar
1683+
elif is_scalar(value):
1684+
if not isna(value) and value not in self.categories:
1685+
raise ValueError("fill value must be in categories")
1686+
1687+
mask = values == -1
1688+
if mask.any():
1689+
values = values.copy()
1690+
if isna(value):
1691+
values[mask] = -1
1692+
else:
1693+
values[mask] = self.categories.get_loc(value)
16701694

1671-
mask = values == -1
1672-
if mask.any():
1673-
values = values.copy()
1674-
if isna(value):
1675-
values[mask] = -1
1676-
else:
1677-
values[mask] = self.categories.get_loc(value)
1695+
else:
1696+
raise TypeError('"value" parameter must be a scalar, dict '
1697+
'or Series, but you passed a '
1698+
'"{0}"'.format(type(value).__name__))
16781699

16791700
return self._constructor(values, categories=self.categories,
16801701
ordered=self.ordered, fastpath=True)

pandas/core/generic.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -4304,8 +4304,9 @@ def fillna(self, value=None, method=None, axis=None, inplace=False,
43044304
elif not is_list_like(value):
43054305
pass
43064306
else:
4307-
raise ValueError("invalid fill value with a %s" %
4308-
type(value))
4307+
raise TypeError('"value" parameter must be a scalar, dict '
4308+
'or Series, but you passed a '
4309+
'"{0}"'.format(type(value).__name__))
43094310

43104311
new_data = self._data.fillna(value=value, limit=limit,
43114312
inplace=inplace,

pandas/tests/frame/test_missing.py

+77-2
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010

1111
from pandas.compat import lrange
1212
from pandas import (DataFrame, Series, Timestamp,
13-
date_range)
13+
date_range, Categorical)
1414
import pandas as pd
1515

1616
from pandas.util.testing import assert_series_equal, assert_frame_equal
@@ -270,6 +270,81 @@ def test_fillna(self):
270270
pd.Timestamp('2012-11-11 00:00:00+01:00')]})
271271
assert_frame_equal(df.fillna(method='bfill'), exp)
272272

273+
def test_na_actions_categorical(self):
274+
275+
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
276+
vals = ["a", "b", np.nan, "d"]
277+
df = DataFrame({"cats": cat, "vals": vals})
278+
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
279+
vals2 = ["a", "b", "b", "d"]
280+
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
281+
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
282+
vals3 = ["a", "b", np.nan]
283+
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
284+
cat4 = Categorical([1, 2], categories=[1, 2, 3])
285+
vals4 = ["a", "b"]
286+
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
287+
288+
# fillna
289+
res = df.fillna(value={"cats": 3, "vals": "b"})
290+
tm.assert_frame_equal(res, df_exp_fill)
291+
292+
with tm.assert_raises_regex(ValueError, "fill value must be "
293+
"in categories"):
294+
df.fillna(value={"cats": 4, "vals": "c"})
295+
296+
res = df.fillna(method='pad')
297+
tm.assert_frame_equal(res, df_exp_fill)
298+
299+
# dropna
300+
res = df.dropna(subset=["cats"])
301+
tm.assert_frame_equal(res, df_exp_drop_cats)
302+
303+
res = df.dropna()
304+
tm.assert_frame_equal(res, df_exp_drop_all)
305+
306+
# make sure that fillna takes missing values into account
307+
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
308+
df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]})
309+
310+
cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
311+
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
312+
313+
res = df.fillna("a")
314+
tm.assert_frame_equal(res, df_exp)
315+
316+
def test_fillna_categorical_nan(self):
317+
# GH 14021
318+
# np.nan should always be a valid filler
319+
cat = Categorical([np.nan, 2, np.nan])
320+
val = Categorical([np.nan, np.nan, np.nan])
321+
df = DataFrame({"cats": cat, "vals": val})
322+
res = df.fillna(df.median())
323+
v_exp = [np.nan, np.nan, np.nan]
324+
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
325+
dtype='category')
326+
tm.assert_frame_equal(res, df_exp)
327+
328+
result = df.cats.fillna(np.nan)
329+
tm.assert_series_equal(result, df.cats)
330+
result = df.vals.fillna(np.nan)
331+
tm.assert_series_equal(result, df.vals)
332+
333+
idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
334+
'2011-01-01 09:00', pd.NaT, pd.NaT])
335+
df = DataFrame({'a': Categorical(idx)})
336+
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
337+
338+
idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01',
339+
pd.NaT, pd.NaT], freq='M')
340+
df = DataFrame({'a': Categorical(idx)})
341+
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
342+
343+
idx = pd.TimedeltaIndex(['1 days', '2 days',
344+
'1 days', pd.NaT, pd.NaT])
345+
df = DataFrame({'a': Categorical(idx)})
346+
tm.assert_frame_equal(df.fillna(value=pd.NaT), df)
347+
273348
def test_fillna_downcast(self):
274349
# GH 15277
275350
# infer int64 from float64
@@ -489,7 +564,7 @@ def test_fillna_invalid_value(self):
489564
# tuple
490565
pytest.raises(TypeError, self.frame.fillna, (1, 2))
491566
# frame with series
492-
pytest.raises(ValueError, self.frame.iloc[:, 0].fillna, self.frame)
567+
pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame)
493568

494569
def test_fillna_col_reordering(self):
495570
cols = ["COL." + str(i) for i in range(5, 0, -1)]

pandas/tests/series/test_missing.py

+51-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
import pandas as pd
1313

1414
from pandas import (Series, DataFrame, isna, date_range,
15-
MultiIndex, Index, Timestamp, NaT, IntervalIndex)
15+
MultiIndex, Index, Timestamp, NaT, IntervalIndex,
16+
Categorical)
1617
from pandas.compat import range
1718
from pandas._libs.tslib import iNaT
1819
from pandas.core.series import remove_na
@@ -363,6 +364,55 @@ def test_fillna_raise(self):
363364
with pytest.raises(ValueError):
364365
s.fillna(1, limit=limit, method=method)
365366

367+
@pytest.mark.parametrize('fill_value, expected_output', [
368+
('a', ['a', 'a', 'b', 'a', 'a']),
369+
({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']),
370+
({1: 'a'}, ['a', 'a', 'b', np.nan, np.nan]),
371+
({1: 'a', 3: 'b'}, ['a', 'a', 'b', 'b', np.nan]),
372+
(Series('a'), ['a', np.nan, 'b', np.nan, np.nan]),
373+
(Series('a', index=[1]), ['a', 'a', 'b', np.nan, np.nan]),
374+
(Series({1: 'a', 3: 'b'}), ['a', 'a', 'b', 'b', np.nan]),
375+
(Series(['a', 'b'], index=[3, 4]), ['a', np.nan, 'b', 'a', 'b'])
376+
])
377+
def test_fillna_categorical(self, fill_value, expected_output):
378+
# GH 17033
379+
# Test fillna for a Categorical series
380+
data = ['a', np.nan, 'b', np.nan, np.nan]
381+
s = Series(Categorical(data, categories=['a', 'b']))
382+
exp = Series(Categorical(expected_output, categories=['a', 'b']))
383+
tm.assert_series_equal(s.fillna(fill_value), exp)
384+
385+
def test_fillna_categorical_raise(self):
386+
data = ['a', np.nan, 'b', np.nan, np.nan]
387+
s = Series(Categorical(data, categories=['a', 'b']))
388+
389+
with tm.assert_raises_regex(ValueError,
390+
"fill value must be in categories"):
391+
s.fillna('d')
392+
393+
with tm.assert_raises_regex(ValueError,
394+
"fill value must be in categories"):
395+
s.fillna(Series('d'))
396+
397+
with tm.assert_raises_regex(ValueError,
398+
"fill value must be in categories"):
399+
s.fillna({1: 'd', 3: 'a'})
400+
401+
with tm.assert_raises_regex(TypeError,
402+
'"value" parameter must be a scalar or '
403+
'dict, but you passed a "list"'):
404+
s.fillna(['a', 'b'])
405+
406+
with tm.assert_raises_regex(TypeError,
407+
'"value" parameter must be a scalar or '
408+
'dict, but you passed a "tuple"'):
409+
s.fillna(('a', 'b'))
410+
411+
with tm.assert_raises_regex(TypeError,
412+
'"value" parameter must be a scalar, dict '
413+
'or Series, but you passed a "DataFrame"'):
414+
s.fillna(DataFrame({1: ['a'], 3: ['b']}))
415+
366416
def test_fillna_nat(self):
367417
series = Series([0, 1, 2, iNaT], dtype='M8[ns]')
368418

pandas/tests/test_categorical.py

-73
Original file line numberDiff line numberDiff line change
@@ -4496,79 +4496,6 @@ def test_numpy_reshape(self):
44964496
tm.assert_raises_regex(ValueError, msg, np.reshape,
44974497
cat, cat.shape, order='F')
44984498

4499-
def test_na_actions(self):
4500-
4501-
cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3])
4502-
vals = ["a", "b", np.nan, "d"]
4503-
df = DataFrame({"cats": cat, "vals": vals})
4504-
cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3])
4505-
vals2 = ["a", "b", "b", "d"]
4506-
df_exp_fill = DataFrame({"cats": cat2, "vals": vals2})
4507-
cat3 = Categorical([1, 2, 3], categories=[1, 2, 3])
4508-
vals3 = ["a", "b", np.nan]
4509-
df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3})
4510-
cat4 = Categorical([1, 2], categories=[1, 2, 3])
4511-
vals4 = ["a", "b"]
4512-
df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4})
4513-
4514-
# fillna
4515-
res = df.fillna(value={"cats": 3, "vals": "b"})
4516-
tm.assert_frame_equal(res, df_exp_fill)
4517-
4518-
def f():
4519-
df.fillna(value={"cats": 4, "vals": "c"})
4520-
4521-
pytest.raises(ValueError, f)
4522-
4523-
res = df.fillna(method='pad')
4524-
tm.assert_frame_equal(res, df_exp_fill)
4525-
4526-
res = df.dropna(subset=["cats"])
4527-
tm.assert_frame_equal(res, df_exp_drop_cats)
4528-
4529-
res = df.dropna()
4530-
tm.assert_frame_equal(res, df_exp_drop_all)
4531-
4532-
# make sure that fillna takes missing values into account
4533-
c = Categorical([np.nan, "b", np.nan], categories=["a", "b"])
4534-
df = DataFrame({"cats": c, "vals": [1, 2, 3]})
4535-
4536-
cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"])
4537-
df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]})
4538-
4539-
res = df.fillna("a")
4540-
tm.assert_frame_equal(res, df_exp)
4541-
4542-
# GH 14021
4543-
# np.nan should always be a is a valid filler
4544-
cat = Categorical([np.nan, 2, np.nan])
4545-
val = Categorical([np.nan, np.nan, np.nan])
4546-
df = DataFrame({"cats": cat, "vals": val})
4547-
res = df.fillna(df.median())
4548-
v_exp = [np.nan, np.nan, np.nan]
4549-
df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp},
4550-
dtype='category')
4551-
tm.assert_frame_equal(res, df_exp)
4552-
4553-
result = df.cats.fillna(np.nan)
4554-
tm.assert_series_equal(result, df.cats)
4555-
result = df.vals.fillna(np.nan)
4556-
tm.assert_series_equal(result, df.vals)
4557-
4558-
idx = DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45',
4559-
'2011-01-01 09:00', NaT, NaT])
4560-
df = DataFrame({'a': Categorical(idx)})
4561-
tm.assert_frame_equal(df.fillna(value=NaT), df)
4562-
4563-
idx = PeriodIndex(
4564-
['2011-01', '2011-01', '2011-01', NaT, NaT], freq='M')
4565-
df = DataFrame({'a': Categorical(idx)})
4566-
tm.assert_frame_equal(df.fillna(value=NaT), df)
4567-
4568-
idx = TimedeltaIndex(['1 days', '2 days', '1 days', NaT, NaT])
4569-
df = DataFrame({'a': Categorical(idx)})
4570-
tm.assert_frame_equal(df.fillna(value=NaT), df)
4571-
45724499
def test_astype_to_other(self):
45734500

45744501
s = self.cat['value_group']

0 commit comments

Comments
 (0)