diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index e868c73914b6e..d5a136c5ab0c7 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -28,13 +28,14 @@ Other Enhancements - :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in ouptut (:issue:`14194`) - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) +- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) .. _whatsnew_0220.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- +- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) - - diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index d0851e3ab4f96..03bf09352862b 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1623,8 +1623,12 @@ def fillna(self, value=None, method=None, limit=None): Method to use for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid backfill / bfill: use NEXT valid observation to fill gap - value : scalar - Value to use to fill holes (e.g. 0) + value : scalar, dict, Series + If a scalar value is passed it is used to fill all missing values. + Alternatively, a Series or dict can be used to fill in different + values for each index. The value should not be a list. The + value(s) passed should either be in the categories or should be + NaN. limit : int, default None (Not implemented yet for Categorical!) If method is specified, this is the maximum number of consecutive @@ -1665,16 +1669,33 @@ def fillna(self, value=None, method=None, limit=None): else: - if not isna(value) and value not in self.categories: - raise ValueError("fill value must be in categories") + # If value is a dict or a Series (a dict value has already + # been converted to a Series) + if isinstance(value, ABCSeries): + if not value[~value.isin(self.categories)].isna().all(): + raise ValueError("fill value must be in categories") + + values_codes = _get_codes_for_values(value, self.categories) + indexer = np.where(values_codes != -1) + values[indexer] = values_codes[values_codes != -1] + + # If value is not a dict or Series it should be a scalar + elif is_scalar(value): + if not isna(value) and value not in self.categories: + raise ValueError("fill value must be in categories") + + mask = values == -1 + if mask.any(): + values = values.copy() + if isna(value): + values[mask] = -1 + else: + values[mask] = self.categories.get_loc(value) - mask = values == -1 - if mask.any(): - values = values.copy() - if isna(value): - values[mask] = -1 - else: - values[mask] = self.categories.get_loc(value) + else: + raise TypeError('"value" parameter must be a scalar, dict ' + 'or Series, but you passed a ' + '"{0}"'.format(type(value).__name__)) return self._constructor(values, categories=self.categories, ordered=self.ordered, fastpath=True) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d93fe52d5ca9c..e497679e2f3d9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4304,8 +4304,9 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, elif not is_list_like(value): pass else: - raise ValueError("invalid fill value with a %s" % - type(value)) + raise TypeError('"value" parameter must be a scalar, dict ' + 'or Series, but you passed a ' + '"{0}"'.format(type(value).__name__)) new_data = self._data.fillna(value=value, limit=limit, inplace=inplace, diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index ebd15b3180a33..d566c92e7738e 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -10,7 +10,7 @@ from pandas.compat import lrange from pandas import (DataFrame, Series, Timestamp, - date_range) + date_range, Categorical) import pandas as pd from pandas.util.testing import assert_series_equal, assert_frame_equal @@ -270,6 +270,81 @@ def test_fillna(self): pd.Timestamp('2012-11-11 00:00:00+01:00')]}) assert_frame_equal(df.fillna(method='bfill'), exp) + def test_na_actions_categorical(self): + + cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) + vals = ["a", "b", np.nan, "d"] + df = DataFrame({"cats": cat, "vals": vals}) + cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3]) + vals2 = ["a", "b", "b", "d"] + df_exp_fill = DataFrame({"cats": cat2, "vals": vals2}) + cat3 = Categorical([1, 2, 3], categories=[1, 2, 3]) + vals3 = ["a", "b", np.nan] + df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3}) + cat4 = Categorical([1, 2], categories=[1, 2, 3]) + vals4 = ["a", "b"] + df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4}) + + # fillna + res = df.fillna(value={"cats": 3, "vals": "b"}) + tm.assert_frame_equal(res, df_exp_fill) + + with tm.assert_raises_regex(ValueError, "fill value must be " + "in categories"): + df.fillna(value={"cats": 4, "vals": "c"}) + + res = df.fillna(method='pad') + tm.assert_frame_equal(res, df_exp_fill) + + # dropna + res = df.dropna(subset=["cats"]) + tm.assert_frame_equal(res, df_exp_drop_cats) + + res = df.dropna() + tm.assert_frame_equal(res, df_exp_drop_all) + + # make sure that fillna takes missing values into account + c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) + df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) + + cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) + df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) + + res = df.fillna("a") + tm.assert_frame_equal(res, df_exp) + + def test_fillna_categorical_nan(self): + # GH 14021 + # np.nan should always be a valid filler + cat = Categorical([np.nan, 2, np.nan]) + val = Categorical([np.nan, np.nan, np.nan]) + df = DataFrame({"cats": cat, "vals": val}) + res = df.fillna(df.median()) + v_exp = [np.nan, np.nan, np.nan] + df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, + dtype='category') + tm.assert_frame_equal(res, df_exp) + + result = df.cats.fillna(np.nan) + tm.assert_series_equal(result, df.cats) + result = df.vals.fillna(np.nan) + tm.assert_series_equal(result, df.vals) + + idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45', + '2011-01-01 09:00', pd.NaT, pd.NaT]) + df = DataFrame({'a': Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + + idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01', + pd.NaT, pd.NaT], freq='M') + df = DataFrame({'a': Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + + idx = pd.TimedeltaIndex(['1 days', '2 days', + '1 days', pd.NaT, pd.NaT]) + df = DataFrame({'a': Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + def test_fillna_downcast(self): # GH 15277 # infer int64 from float64 @@ -489,7 +564,7 @@ def test_fillna_invalid_value(self): # tuple pytest.raises(TypeError, self.frame.fillna, (1, 2)) # frame with series - pytest.raises(ValueError, self.frame.iloc[:, 0].fillna, self.frame) + pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame) def test_fillna_col_reordering(self): cols = ["COL." + str(i) for i in range(5, 0, -1)] diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 5ca4eba4da13b..2dbce45317639 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -12,7 +12,8 @@ import pandas as pd from pandas import (Series, DataFrame, isna, date_range, - MultiIndex, Index, Timestamp, NaT, IntervalIndex) + MultiIndex, Index, Timestamp, NaT, IntervalIndex, + Categorical) from pandas.compat import range from pandas._libs.tslib import iNaT from pandas.core.series import remove_na @@ -363,6 +364,55 @@ def test_fillna_raise(self): with pytest.raises(ValueError): s.fillna(1, limit=limit, method=method) + @pytest.mark.parametrize('fill_value, expected_output', [ + ('a', ['a', 'a', 'b', 'a', 'a']), + ({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']), + ({1: 'a'}, ['a', 'a', 'b', np.nan, np.nan]), + ({1: 'a', 3: 'b'}, ['a', 'a', 'b', 'b', np.nan]), + (Series('a'), ['a', np.nan, 'b', np.nan, np.nan]), + (Series('a', index=[1]), ['a', 'a', 'b', np.nan, np.nan]), + (Series({1: 'a', 3: 'b'}), ['a', 'a', 'b', 'b', np.nan]), + (Series(['a', 'b'], index=[3, 4]), ['a', np.nan, 'b', 'a', 'b']) + ]) + def test_fillna_categorical(self, fill_value, expected_output): + # GH 17033 + # Test fillna for a Categorical series + data = ['a', np.nan, 'b', np.nan, np.nan] + s = Series(Categorical(data, categories=['a', 'b'])) + exp = Series(Categorical(expected_output, categories=['a', 'b'])) + tm.assert_series_equal(s.fillna(fill_value), exp) + + def test_fillna_categorical_raise(self): + data = ['a', np.nan, 'b', np.nan, np.nan] + s = Series(Categorical(data, categories=['a', 'b'])) + + with tm.assert_raises_regex(ValueError, + "fill value must be in categories"): + s.fillna('d') + + with tm.assert_raises_regex(ValueError, + "fill value must be in categories"): + s.fillna(Series('d')) + + with tm.assert_raises_regex(ValueError, + "fill value must be in categories"): + s.fillna({1: 'd', 3: 'a'}) + + with tm.assert_raises_regex(TypeError, + '"value" parameter must be a scalar or ' + 'dict, but you passed a "list"'): + s.fillna(['a', 'b']) + + with tm.assert_raises_regex(TypeError, + '"value" parameter must be a scalar or ' + 'dict, but you passed a "tuple"'): + s.fillna(('a', 'b')) + + with tm.assert_raises_regex(TypeError, + '"value" parameter must be a scalar, dict ' + 'or Series, but you passed a "DataFrame"'): + s.fillna(DataFrame({1: ['a'], 3: ['b']})) + def test_fillna_nat(self): series = Series([0, 1, 2, iNaT], dtype='M8[ns]') diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 7988d9ca72568..b570672124976 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4496,79 +4496,6 @@ def test_numpy_reshape(self): tm.assert_raises_regex(ValueError, msg, np.reshape, cat, cat.shape, order='F') - def test_na_actions(self): - - cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - vals = ["a", "b", np.nan, "d"] - df = DataFrame({"cats": cat, "vals": vals}) - cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3]) - vals2 = ["a", "b", "b", "d"] - df_exp_fill = DataFrame({"cats": cat2, "vals": vals2}) - cat3 = Categorical([1, 2, 3], categories=[1, 2, 3]) - vals3 = ["a", "b", np.nan] - df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3}) - cat4 = Categorical([1, 2], categories=[1, 2, 3]) - vals4 = ["a", "b"] - df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4}) - - # fillna - res = df.fillna(value={"cats": 3, "vals": "b"}) - tm.assert_frame_equal(res, df_exp_fill) - - def f(): - df.fillna(value={"cats": 4, "vals": "c"}) - - pytest.raises(ValueError, f) - - res = df.fillna(method='pad') - tm.assert_frame_equal(res, df_exp_fill) - - res = df.dropna(subset=["cats"]) - tm.assert_frame_equal(res, df_exp_drop_cats) - - res = df.dropna() - tm.assert_frame_equal(res, df_exp_drop_all) - - # make sure that fillna takes missing values into account - c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) - df = DataFrame({"cats": c, "vals": [1, 2, 3]}) - - cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) - df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) - - res = df.fillna("a") - tm.assert_frame_equal(res, df_exp) - - # GH 14021 - # np.nan should always be a is a valid filler - cat = Categorical([np.nan, 2, np.nan]) - val = Categorical([np.nan, np.nan, np.nan]) - df = DataFrame({"cats": cat, "vals": val}) - res = df.fillna(df.median()) - v_exp = [np.nan, np.nan, np.nan] - df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, - dtype='category') - tm.assert_frame_equal(res, df_exp) - - result = df.cats.fillna(np.nan) - tm.assert_series_equal(result, df.cats) - result = df.vals.fillna(np.nan) - tm.assert_series_equal(result, df.vals) - - idx = DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45', - '2011-01-01 09:00', NaT, NaT]) - df = DataFrame({'a': Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=NaT), df) - - idx = PeriodIndex( - ['2011-01', '2011-01', '2011-01', NaT, NaT], freq='M') - df = DataFrame({'a': Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=NaT), df) - - idx = TimedeltaIndex(['1 days', '2 days', '1 days', NaT, NaT]) - df = DataFrame({'a': Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=NaT), df) - def test_astype_to_other(self): s = self.cat['value_group']