From 46de7e992b584be2bcf6462b9bac3365aa95f5ff Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Tue, 14 Nov 2017 21:39:50 +0000 Subject: [PATCH 01/11] BUG: Accept dict or Series in fillna for categorical Series --- pandas/core/categorical.py | 28 +++++++++++++++-------- pandas/tests/test_categorical.py | 39 ++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 9 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index d0851e3ab4f96..113df4fe9682d 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1665,16 +1665,26 @@ def fillna(self, value=None, method=None, limit=None): else: - if not isna(value) and value not in self.categories: - raise ValueError("fill value must be in categories") + if isinstance(value, ABCSeries): + if not value[~value.isin(self.categories)].isna().all(): + raise ValueError("fill value must be in categories") - mask = values == -1 - if mask.any(): - values = values.copy() - if isna(value): - values[mask] = -1 - else: - values[mask] = self.categories.get_loc(value) + values_codes = _get_codes_for_values(value, self.categories) + indexer = np.where(values_codes != -1) + values[indexer] = values_codes[values_codes != -1] + + # Scalar value + else: + if not isna(value) and value not in self.categories: + raise ValueError("fill value must be in categories") + + mask = values == -1 + if mask.any(): + values = values.copy() + if isna(value): + values[mask] = -1 + else: + values[mask] = self.categories.get_loc(value) return self._constructor(values, categories=self.categories, ordered=self.ordered, fastpath=True) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 7988d9ca72568..e14852b15fe69 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4569,6 +4569,45 @@ def f(): df = DataFrame({'a': Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=NaT), df) + @pytest.mark.parametrize('fill_value expected_output', [ + ('a', ['a', 'a', 'b', 'a', 'a']), + ({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']), + ({1: 'a'}, ['a', 'a', 'b', np.nan, np.nan]), + ({1: 'a', 3: 'b'}, ['a', 'a', 'b', 'b', np.nan]), + (pd.Series('a'), ['a', np.nan, 'b', np.nan, np.nan]), + (pd.Series('a', index=[1]), ['a', 'a', 'b', np.nan, np.nan]), + (pd.Series({1: 'a', 3: 'b'}), ['a', 'a', 'b', 'b', np.nan]), + (pd.Series(['a', 'b'], index=[3, 4])) + ]) + def fillna_series_categorical(self, fill_value, expected_output): + # GH 17033 + # Test fillna for a Categorical series + data = ['a', np.nan, 'b', np.nan, np.nan] + s = pd.Series(pd.Categorical(data, categories=['a', 'b'])) + exp = pd.Series(pd.Categorical(expected_output, categories=['a', 'b'])) + tm.assert_series_equal(s.fillna(fill_value), exp) + + def fillna_series_categorical_errormsg(self): + data = ['a', np.nan, 'b', np.nan, np.nan] + s = pd.Series(pd.Categorical(data, categories=['a', 'b'])) + + with tm.assert_raises_regex(ValueError, + "fill value must be in categories"): + s.fillna('d') + + with tm.assert_raises_regex(ValueError, + "fill value must be in categories"): + s.fillna(pd.Series('d')) + + with tm.assert_raises_regex(ValueError, + "fill value must be in categories"): + s.fillna({1: 'd', 3: 'a'}) + + with tm.assert_raises_regex(TypeError, + '"value" parameter must be a scalar or ' + 'dict but you passed a "list"'): + s.fillna(['a', 'b']) + def test_astype_to_other(self): s = self.cat['value_group'] From 302411818f115f45cf2867c52e02374e0bd664d5 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Tue, 14 Nov 2017 22:29:09 +0000 Subject: [PATCH 02/11] Fix problems with new tests --- pandas/tests/test_categorical.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index e14852b15fe69..85d1de0dbf39d 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4569,7 +4569,7 @@ def f(): df = DataFrame({'a': Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=NaT), df) - @pytest.mark.parametrize('fill_value expected_output', [ + @pytest.mark.parametrize('fill_value, expected_output', [ ('a', ['a', 'a', 'b', 'a', 'a']), ({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']), ({1: 'a'}, ['a', 'a', 'b', np.nan, np.nan]), @@ -4577,9 +4577,9 @@ def f(): (pd.Series('a'), ['a', np.nan, 'b', np.nan, np.nan]), (pd.Series('a', index=[1]), ['a', 'a', 'b', np.nan, np.nan]), (pd.Series({1: 'a', 3: 'b'}), ['a', 'a', 'b', 'b', np.nan]), - (pd.Series(['a', 'b'], index=[3, 4])) + (pd.Series(['a', 'b'], index=[3, 4]), ['a', np.nan, 'b', 'a', 'b']) ]) - def fillna_series_categorical(self, fill_value, expected_output): + def test_fillna_series_categorical(self, fill_value, expected_output): # GH 17033 # Test fillna for a Categorical series data = ['a', np.nan, 'b', np.nan, np.nan] @@ -4587,7 +4587,7 @@ def fillna_series_categorical(self, fill_value, expected_output): exp = pd.Series(pd.Categorical(expected_output, categories=['a', 'b'])) tm.assert_series_equal(s.fillna(fill_value), exp) - def fillna_series_categorical_errormsg(self): + def test_fillna_series_categorical_errormsg(self): data = ['a', np.nan, 'b', np.nan, np.nan] s = pd.Series(pd.Categorical(data, categories=['a', 'b'])) @@ -4605,7 +4605,7 @@ def fillna_series_categorical_errormsg(self): with tm.assert_raises_regex(TypeError, '"value" parameter must be a scalar or ' - 'dict but you passed a "list"'): + 'dict, but you passed a "list"'): s.fillna(['a', 'b']) def test_astype_to_other(self): @@ -4941,3 +4941,4 @@ def test_map(self): assert isinstance(res, tm.SubclassedCategorical) exp = Categorical(['A', 'B', 'C']) tm.assert_categorical_equal(res, exp) + From 2ef544440d2bbe931267f158b2820ce7007a1a6c Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Tue, 14 Nov 2017 22:30:58 +0000 Subject: [PATCH 03/11] pep8 issue --- pandas/tests/test_categorical.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 85d1de0dbf39d..833e8647924f5 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4941,4 +4941,3 @@ def test_map(self): assert isinstance(res, tm.SubclassedCategorical) exp = Categorical(['A', 'B', 'C']) tm.assert_categorical_equal(res, exp) - From 5780c4303295297709c648784d22be456e8dc0b4 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Wed, 15 Nov 2017 23:17:39 +0000 Subject: [PATCH 04/11] move tests and add whatsnew --- doc/source/whatsnew/v0.22.0.txt | 1 + pandas/tests/frame/test_missing.py | 76 ++++++++++++++++++++++++++++- pandas/tests/series/test_missing.py | 47 +++++++++++++++++- pandas/tests/test_categorical.py | 3 ++ 4 files changed, 125 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index e868c73914b6e..2e13d9d721dbf 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -28,6 +28,7 @@ Other Enhancements - :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in ouptut (:issue:`14194`) - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) +- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` (:issue:`17033`) .. _whatsnew_0220.api_breaking: diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index ebd15b3180a33..6a3c412d57382 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -10,7 +10,7 @@ from pandas.compat import lrange from pandas import (DataFrame, Series, Timestamp, - date_range) + date_range, Categorical) import pandas as pd from pandas.util.testing import assert_series_equal, assert_frame_equal @@ -270,6 +270,80 @@ def test_fillna(self): pd.Timestamp('2012-11-11 00:00:00+01:00')]}) assert_frame_equal(df.fillna(method='bfill'), exp) + def test_na_actions(self): + + cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) + vals = ["a", "b", np.nan, "d"] + df = pd.DataFrame({"cats": cat, "vals": vals}) + cat2 = pd.Categorical([1, 2, 3, 3], categories=[1, 2, 3]) + vals2 = ["a", "b", "b", "d"] + df_exp_fill = pd.DataFrame({"cats": cat2, "vals": vals2}) + cat3 = pd.Categorical([1, 2, 3], categories=[1, 2, 3]) + vals3 = ["a", "b", np.nan] + df_exp_drop_cats = pd.DataFrame({"cats": cat3, "vals": vals3}) + cat4 = pd.Categorical([1, 2], categories=[1, 2, 3]) + vals4 = ["a", "b"] + df_exp_drop_all = pd.DataFrame({"cats": cat4, "vals": vals4}) + + # fillna + res = df.fillna(value={"cats": 3, "vals": "b"}) + tm.assert_frame_equal(res, df_exp_fill) + + def f(): + df.fillna(value={"cats": 4, "vals": "c"}) + + pytest.raises(ValueError, f) + + res = df.fillna(method='pad') + tm.assert_frame_equal(res, df_exp_fill) + + res = df.dropna(subset=["cats"]) + tm.assert_frame_equal(res, df_exp_drop_cats) + + res = df.dropna() + tm.assert_frame_equal(res, df_exp_drop_all) + + # make sure that fillna takes missing values into account + c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) + df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) + + cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) + df_exp = pd.DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) + + res = df.fillna("a") + tm.assert_frame_equal(res, df_exp) + + # GH 14021 + # np.nan should always be a is a valid filler + cat = Categorical([np.nan, 2, np.nan]) + val = Categorical([np.nan, np.nan, np.nan]) + df = DataFrame({"cats": cat, "vals": val}) + res = df.fillna(df.median()) + v_exp = [np.nan, np.nan, np.nan] + df_exp = pd.DataFrame({"cats": [2, 2, 2], "vals": v_exp}, + dtype='category') + tm.assert_frame_equal(res, df_exp) + + result = df.cats.fillna(np.nan) + tm.assert_series_equal(result, df.cats) + result = df.vals.fillna(np.nan) + tm.assert_series_equal(result, df.vals) + + idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45', + '2011-01-01 09:00', pd.NaT, pd.NaT]) + df = DataFrame({'a': pd.Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + + idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01', + pd.NaT, pd.NaT], freq='M') + df = DataFrame({'a': pd.Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + + idx = pd.TimedeltaIndex(['1 days', '2 days', + '1 days', pd.NaT, pd.NaT]) + df = pd.DataFrame({'a': pd.Categorical(idx)}) + tm.assert_frame_equal(df.fillna(value=pd.NaT), df) + def test_fillna_downcast(self): # GH 15277 # infer int64 from float64 diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index 5ca4eba4da13b..de6229b342948 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -12,7 +12,8 @@ import pandas as pd from pandas import (Series, DataFrame, isna, date_range, - MultiIndex, Index, Timestamp, NaT, IntervalIndex) + MultiIndex, Index, Timestamp, NaT, IntervalIndex, + Categorical) from pandas.compat import range from pandas._libs.tslib import iNaT from pandas.core.series import remove_na @@ -363,6 +364,50 @@ def test_fillna_raise(self): with pytest.raises(ValueError): s.fillna(1, limit=limit, method=method) + @pytest.mark.parametrize('fill_value, expected_output', [ + ('a', ['a', 'a', 'b', 'a', 'a']), + ({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']), + ({1: 'a'}, ['a', 'a', 'b', np.nan, np.nan]), + ({1: 'a', 3: 'b'}, ['a', 'a', 'b', 'b', np.nan]), + (Series('a'), ['a', np.nan, 'b', np.nan, np.nan]), + (Series('a', index=[1]), ['a', 'a', 'b', np.nan, np.nan]), + (Series({1: 'a', 3: 'b'}), ['a', 'a', 'b', 'b', np.nan]), + (Series(['a', 'b'], index=[3, 4]), ['a', np.nan, 'b', 'a', 'b']) + ]) + def test_fillna_categorical(self, fill_value, expected_output): + # GH 17033 + # Test fillna for a Categorical series + data = ['a', np.nan, 'b', np.nan, np.nan] + s = Series(Categorical(data, categories=['a', 'b'])) + exp = Series(Categorical(expected_output, categories=['a', 'b'])) + tm.assert_series_equal(s.fillna(fill_value), exp) + + def test_fillna_categorical_raise(self): + data = ['a', np.nan, 'b', np.nan, np.nan] + s = Series(Categorical(data, categories=['a', 'b'])) + + with tm.assert_raises_regex(ValueError, + "fill value must be in categories"): + s.fillna('d') + + with tm.assert_raises_regex(ValueError, + "fill value must be in categories"): + s.fillna(Series('d')) + + with tm.assert_raises_regex(ValueError, + "fill value must be in categories"): + s.fillna({1: 'd', 3: 'a'}) + + with tm.assert_raises_regex(TypeError, + '"value" parameter must be a scalar or ' + 'dict, but you passed a "list"'): + s.fillna(['a', 'b']) + + with tm.assert_raises_regex(TypeError, + '"value" parameter must be a scalar or ' + 'dict, but you passed a "tuple"'): + s.fillna(('a', 'b')) + def test_fillna_nat(self): series = Series([0, 1, 2, iNaT], dtype='M8[ns]') diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 833e8647924f5..8672483bc15f5 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4496,6 +4496,7 @@ def test_numpy_reshape(self): tm.assert_raises_regex(ValueError, msg, np.reshape, cat, cat.shape, order='F') +<<<<<<< HEAD def test_na_actions(self): cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) @@ -4608,6 +4609,8 @@ def test_fillna_series_categorical_errormsg(self): 'dict, but you passed a "list"'): s.fillna(['a', 'b']) +======= +>>>>>>> move tests and add whatsnew def test_astype_to_other(self): s = self.cat['value_group'] From 2f4be6de4910920bf5e17f60763de26d25b35600 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Wed, 15 Nov 2017 23:26:33 +0000 Subject: [PATCH 05/11] fix test_categorical.py --- pandas/tests/test_categorical.py | 115 ------------------------------- 1 file changed, 115 deletions(-) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 8672483bc15f5..b570672124976 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -4496,121 +4496,6 @@ def test_numpy_reshape(self): tm.assert_raises_regex(ValueError, msg, np.reshape, cat, cat.shape, order='F') -<<<<<<< HEAD - def test_na_actions(self): - - cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) - vals = ["a", "b", np.nan, "d"] - df = DataFrame({"cats": cat, "vals": vals}) - cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3]) - vals2 = ["a", "b", "b", "d"] - df_exp_fill = DataFrame({"cats": cat2, "vals": vals2}) - cat3 = Categorical([1, 2, 3], categories=[1, 2, 3]) - vals3 = ["a", "b", np.nan] - df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3}) - cat4 = Categorical([1, 2], categories=[1, 2, 3]) - vals4 = ["a", "b"] - df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4}) - - # fillna - res = df.fillna(value={"cats": 3, "vals": "b"}) - tm.assert_frame_equal(res, df_exp_fill) - - def f(): - df.fillna(value={"cats": 4, "vals": "c"}) - - pytest.raises(ValueError, f) - - res = df.fillna(method='pad') - tm.assert_frame_equal(res, df_exp_fill) - - res = df.dropna(subset=["cats"]) - tm.assert_frame_equal(res, df_exp_drop_cats) - - res = df.dropna() - tm.assert_frame_equal(res, df_exp_drop_all) - - # make sure that fillna takes missing values into account - c = Categorical([np.nan, "b", np.nan], categories=["a", "b"]) - df = DataFrame({"cats": c, "vals": [1, 2, 3]}) - - cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) - df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) - - res = df.fillna("a") - tm.assert_frame_equal(res, df_exp) - - # GH 14021 - # np.nan should always be a is a valid filler - cat = Categorical([np.nan, 2, np.nan]) - val = Categorical([np.nan, np.nan, np.nan]) - df = DataFrame({"cats": cat, "vals": val}) - res = df.fillna(df.median()) - v_exp = [np.nan, np.nan, np.nan] - df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, - dtype='category') - tm.assert_frame_equal(res, df_exp) - - result = df.cats.fillna(np.nan) - tm.assert_series_equal(result, df.cats) - result = df.vals.fillna(np.nan) - tm.assert_series_equal(result, df.vals) - - idx = DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45', - '2011-01-01 09:00', NaT, NaT]) - df = DataFrame({'a': Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=NaT), df) - - idx = PeriodIndex( - ['2011-01', '2011-01', '2011-01', NaT, NaT], freq='M') - df = DataFrame({'a': Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=NaT), df) - - idx = TimedeltaIndex(['1 days', '2 days', '1 days', NaT, NaT]) - df = DataFrame({'a': Categorical(idx)}) - tm.assert_frame_equal(df.fillna(value=NaT), df) - - @pytest.mark.parametrize('fill_value, expected_output', [ - ('a', ['a', 'a', 'b', 'a', 'a']), - ({1: 'a', 3: 'b', 4: 'b'}, ['a', 'a', 'b', 'b', 'b']), - ({1: 'a'}, ['a', 'a', 'b', np.nan, np.nan]), - ({1: 'a', 3: 'b'}, ['a', 'a', 'b', 'b', np.nan]), - (pd.Series('a'), ['a', np.nan, 'b', np.nan, np.nan]), - (pd.Series('a', index=[1]), ['a', 'a', 'b', np.nan, np.nan]), - (pd.Series({1: 'a', 3: 'b'}), ['a', 'a', 'b', 'b', np.nan]), - (pd.Series(['a', 'b'], index=[3, 4]), ['a', np.nan, 'b', 'a', 'b']) - ]) - def test_fillna_series_categorical(self, fill_value, expected_output): - # GH 17033 - # Test fillna for a Categorical series - data = ['a', np.nan, 'b', np.nan, np.nan] - s = pd.Series(pd.Categorical(data, categories=['a', 'b'])) - exp = pd.Series(pd.Categorical(expected_output, categories=['a', 'b'])) - tm.assert_series_equal(s.fillna(fill_value), exp) - - def test_fillna_series_categorical_errormsg(self): - data = ['a', np.nan, 'b', np.nan, np.nan] - s = pd.Series(pd.Categorical(data, categories=['a', 'b'])) - - with tm.assert_raises_regex(ValueError, - "fill value must be in categories"): - s.fillna('d') - - with tm.assert_raises_regex(ValueError, - "fill value must be in categories"): - s.fillna(pd.Series('d')) - - with tm.assert_raises_regex(ValueError, - "fill value must be in categories"): - s.fillna({1: 'd', 3: 'a'}) - - with tm.assert_raises_regex(TypeError, - '"value" parameter must be a scalar or ' - 'dict, but you passed a "list"'): - s.fillna(['a', 'b']) - -======= ->>>>>>> move tests and add whatsnew def test_astype_to_other(self): s = self.cat['value_group'] From a69e6964ff0e615437934989251f05cc470ca1df Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Fri, 17 Nov 2017 22:07:10 +0000 Subject: [PATCH 06/11] cleanup existing tests in frame/test_missing.py --- pandas/tests/frame/test_missing.py | 37 +++++++++++++++--------------- 1 file changed, 19 insertions(+), 18 deletions(-) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 6a3c412d57382..2a852bf957f7d 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -270,33 +270,33 @@ def test_fillna(self): pd.Timestamp('2012-11-11 00:00:00+01:00')]}) assert_frame_equal(df.fillna(method='bfill'), exp) - def test_na_actions(self): + def test_na_actions_categorical(self): - cat = pd.Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) + cat = Categorical([1, 2, 3, np.nan], categories=[1, 2, 3]) vals = ["a", "b", np.nan, "d"] - df = pd.DataFrame({"cats": cat, "vals": vals}) - cat2 = pd.Categorical([1, 2, 3, 3], categories=[1, 2, 3]) + df = DataFrame({"cats": cat, "vals": vals}) + cat2 = Categorical([1, 2, 3, 3], categories=[1, 2, 3]) vals2 = ["a", "b", "b", "d"] - df_exp_fill = pd.DataFrame({"cats": cat2, "vals": vals2}) - cat3 = pd.Categorical([1, 2, 3], categories=[1, 2, 3]) + df_exp_fill = DataFrame({"cats": cat2, "vals": vals2}) + cat3 = Categorical([1, 2, 3], categories=[1, 2, 3]) vals3 = ["a", "b", np.nan] - df_exp_drop_cats = pd.DataFrame({"cats": cat3, "vals": vals3}) - cat4 = pd.Categorical([1, 2], categories=[1, 2, 3]) + df_exp_drop_cats = DataFrame({"cats": cat3, "vals": vals3}) + cat4 = Categorical([1, 2], categories=[1, 2, 3]) vals4 = ["a", "b"] - df_exp_drop_all = pd.DataFrame({"cats": cat4, "vals": vals4}) + df_exp_drop_all = DataFrame({"cats": cat4, "vals": vals4}) # fillna res = df.fillna(value={"cats": 3, "vals": "b"}) tm.assert_frame_equal(res, df_exp_fill) - def f(): + with tm.assert_raises_regex(ValueError, "fill value must be " + "in categories"): df.fillna(value={"cats": 4, "vals": "c"}) - pytest.raises(ValueError, f) - res = df.fillna(method='pad') tm.assert_frame_equal(res, df_exp_fill) + # dropna res = df.dropna(subset=["cats"]) tm.assert_frame_equal(res, df_exp_drop_cats) @@ -308,19 +308,20 @@ def f(): df = pd.DataFrame({"cats": c, "vals": [1, 2, 3]}) cat_exp = Categorical(["a", "b", "a"], categories=["a", "b"]) - df_exp = pd.DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) + df_exp = DataFrame({"cats": cat_exp, "vals": [1, 2, 3]}) res = df.fillna("a") tm.assert_frame_equal(res, df_exp) + def test_fillna_categorical_nan(self): # GH 14021 - # np.nan should always be a is a valid filler + # np.nan should always be a valid filler cat = Categorical([np.nan, 2, np.nan]) val = Categorical([np.nan, np.nan, np.nan]) df = DataFrame({"cats": cat, "vals": val}) res = df.fillna(df.median()) v_exp = [np.nan, np.nan, np.nan] - df_exp = pd.DataFrame({"cats": [2, 2, 2], "vals": v_exp}, + df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, dtype='category') tm.assert_frame_equal(res, df_exp) @@ -331,17 +332,17 @@ def f(): idx = pd.DatetimeIndex(['2011-01-01 09:00', '2016-01-01 23:45', '2011-01-01 09:00', pd.NaT, pd.NaT]) - df = DataFrame({'a': pd.Categorical(idx)}) + df = DataFrame({'a': Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=pd.NaT), df) idx = pd.PeriodIndex(['2011-01', '2011-01', '2011-01', pd.NaT, pd.NaT], freq='M') - df = DataFrame({'a': pd.Categorical(idx)}) + df = DataFrame({'a': Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=pd.NaT), df) idx = pd.TimedeltaIndex(['1 days', '2 days', '1 days', pd.NaT, pd.NaT]) - df = pd.DataFrame({'a': pd.Categorical(idx)}) + df = DataFrame({'a': Categorical(idx)}) tm.assert_frame_equal(df.fillna(value=pd.NaT), df) def test_fillna_downcast(self): From 572d2468c81a889886d60757e5c66d90d0311dbb Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Fri, 17 Nov 2017 22:12:15 +0000 Subject: [PATCH 07/11] adding comments and fix docstring --- pandas/core/categorical.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index 113df4fe9682d..eac5bbe788266 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1623,8 +1623,12 @@ def fillna(self, value=None, method=None, limit=None): Method to use for filling holes in reindexed Series pad / ffill: propagate last valid observation forward to next valid backfill / bfill: use NEXT valid observation to fill gap - value : scalar - Value to use to fill holes (e.g. 0) + value : scalar, dict, Series + If a scalar value is passed it is used to fill all missing values. + Alternatively, a Series or dict can be used to fill in different + values for each index. The value should not be a list. The + value(s) passed should either be in the categories or should be + NaN. limit : int, default None (Not implemented yet for Categorical!) If method is specified, this is the maximum number of consecutive @@ -1665,6 +1669,8 @@ def fillna(self, value=None, method=None, limit=None): else: + # If value is a dict or a Series (a dict value has already + # been converted to a Series) if isinstance(value, ABCSeries): if not value[~value.isin(self.categories)].isna().all(): raise ValueError("fill value must be in categories") @@ -1673,7 +1679,7 @@ def fillna(self, value=None, method=None, limit=None): indexer = np.where(values_codes != -1) values[indexer] = values_codes[values_codes != -1] - # Scalar value + # If value is not a dict or Series it should be a scalar else: if not isna(value) and value not in self.categories: raise ValueError("fill value must be in categories") From 2dd2d4b090fe646d56382e3601ccc1c05d6b5352 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Fri, 17 Nov 2017 22:13:34 +0000 Subject: [PATCH 08/11] lint issue --- pandas/tests/frame/test_missing.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 2a852bf957f7d..0273df8fa8067 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -322,7 +322,7 @@ def test_fillna_categorical_nan(self): res = df.fillna(df.median()) v_exp = [np.nan, np.nan, np.nan] df_exp = DataFrame({"cats": [2, 2, 2], "vals": v_exp}, - dtype='category') + dtype='category') tm.assert_frame_equal(res, df_exp) result = df.cats.fillna(np.nan) From 8f8f31637c231ec4eeb56e34e861e595b336922d Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sun, 19 Nov 2017 18:54:42 +0000 Subject: [PATCH 09/11] add is_scalar check and improve error msg --- pandas/core/categorical.py | 7 ++++++- pandas/core/generic.py | 5 +++-- pandas/tests/frame/test_missing.py | 2 +- pandas/tests/series/test_missing.py | 5 +++++ 4 files changed, 15 insertions(+), 4 deletions(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index eac5bbe788266..03bf09352862b 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -1680,7 +1680,7 @@ def fillna(self, value=None, method=None, limit=None): values[indexer] = values_codes[values_codes != -1] # If value is not a dict or Series it should be a scalar - else: + elif is_scalar(value): if not isna(value) and value not in self.categories: raise ValueError("fill value must be in categories") @@ -1692,6 +1692,11 @@ def fillna(self, value=None, method=None, limit=None): else: values[mask] = self.categories.get_loc(value) + else: + raise TypeError('"value" parameter must be a scalar, dict ' + 'or Series, but you passed a ' + '"{0}"'.format(type(value).__name__)) + return self._constructor(values, categories=self.categories, ordered=self.ordered, fastpath=True) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index d93fe52d5ca9c..e497679e2f3d9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -4304,8 +4304,9 @@ def fillna(self, value=None, method=None, axis=None, inplace=False, elif not is_list_like(value): pass else: - raise ValueError("invalid fill value with a %s" % - type(value)) + raise TypeError('"value" parameter must be a scalar, dict ' + 'or Series, but you passed a ' + '"{0}"'.format(type(value).__name__)) new_data = self._data.fillna(value=value, limit=limit, inplace=inplace, diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 0273df8fa8067..d566c92e7738e 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -564,7 +564,7 @@ def test_fillna_invalid_value(self): # tuple pytest.raises(TypeError, self.frame.fillna, (1, 2)) # frame with series - pytest.raises(ValueError, self.frame.iloc[:, 0].fillna, self.frame) + pytest.raises(TypeError, self.frame.iloc[:, 0].fillna, self.frame) def test_fillna_col_reordering(self): cols = ["COL." + str(i) for i in range(5, 0, -1)] diff --git a/pandas/tests/series/test_missing.py b/pandas/tests/series/test_missing.py index de6229b342948..2dbce45317639 100644 --- a/pandas/tests/series/test_missing.py +++ b/pandas/tests/series/test_missing.py @@ -408,6 +408,11 @@ def test_fillna_categorical_raise(self): 'dict, but you passed a "tuple"'): s.fillna(('a', 'b')) + with tm.assert_raises_regex(TypeError, + '"value" parameter must be a scalar, dict ' + 'or Series, but you passed a "DataFrame"'): + s.fillna(DataFrame({1: ['a'], 3: ['b']})) + def test_fillna_nat(self): series = Series([0, 1, 2, iNaT], dtype='M8[ns]') From 6ffec6cd6ae4bbd5154319ac1efcdf8545114c15 Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sun, 19 Nov 2017 23:16:16 +0000 Subject: [PATCH 10/11] whatsnew update --- doc/source/whatsnew/v0.22.0.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index 2e13d9d721dbf..ad7d32848bac0 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -28,14 +28,14 @@ Other Enhancements - :class:`pandas.io.formats.style.Styler` now has method ``hide_index()`` to determine whether the index will be rendered in ouptut (:issue:`14194`) - :class:`pandas.io.formats.style.Styler` now has method ``hide_columns()`` to determine whether columns will be hidden in output (:issue:`14194`) - Improved wording of ``ValueError`` raised in :func:`to_datetime` when ``unit=`` is passed with a non-convertible value (:issue:`14350`) -- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` (:issue:`17033`) +- :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`) .. _whatsnew_0220.api_breaking: Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- +- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``KeyError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) - - From c484f499604603e55d3f48556939c65d50ad2a7c Mon Sep 17 00:00:00 2001 From: Paul Reidy Date: Sun, 19 Nov 2017 23:18:53 +0000 Subject: [PATCH 11/11] whatsnew typo --- doc/source/whatsnew/v0.22.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.22.0.txt b/doc/source/whatsnew/v0.22.0.txt index ad7d32848bac0..d5a136c5ab0c7 100644 --- a/doc/source/whatsnew/v0.22.0.txt +++ b/doc/source/whatsnew/v0.22.0.txt @@ -35,7 +35,7 @@ Other Enhancements Backwards incompatible API changes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``KeyError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) +- :func:`Series.fillna` now raises a ``TypeError`` instead of a ``ValueError`` when passed a list, tuple or DataFrame as a ``value`` (:issue:`18293`) - -