From 38f36578c9e29f8ffc29a655923375c73a75bf02 Mon Sep 17 00:00:00 2001 From: proost Date: Tue, 11 Aug 2020 01:34:36 +0900 Subject: [PATCH 1/7] ENH:column-wise DataFrame.fillna and duplicated DataFrame.fillna with Series and Dict (#30922) --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/generic.py | 29 +++++---- pandas/tests/frame/test_missing.py | 95 ++++++++++++++++++++++++++++-- 3 files changed, 109 insertions(+), 17 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 94bb265c32e4c..311f240d5f08b 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -51,7 +51,7 @@ For example: Other enhancements ^^^^^^^^^^^^^^^^^^ - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) -- +- :meth:`DataFrame.fillna` can fill NA values column-wise with a dictionary or :class:`Series` (:issue:`4514`) - diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 520023050d49d..70d714e813085 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6134,20 +6134,25 @@ def fillna( ) elif isinstance(value, (dict, ABCSeries)): + temp_data = self if inplace else self.copy() + if axis == 1: - raise NotImplementedError( - "Currently only can fill " - "with dict/Series column " - "by column" - ) + for i, item in enumerate(temp_data.items()): + label, content = item + temp_data.iloc[:, i] = content.fillna( + value, limit=limit, inplace=False, downcast=downcast + ) + else: + for i, item in enumerate(temp_data.items()): + label, content = item + if label not in value: + continue + temp_data.iloc[:, i] = content.fillna( + value[label], limit=limit, inplace=False, downcast=downcast + ) - result = self if inplace else self.copy() - for k, v in value.items(): - if k not in result: - continue - obj = result[k] - obj.fillna(v, limit=limit, inplace=True, downcast=downcast) - return result if not inplace else None + temp_data = temp_data.infer_objects() + new_data = temp_data._mgr elif not is_list_like(value): new_data = self._mgr.fillna( diff --git a/pandas/tests/frame/test_missing.py b/pandas/tests/frame/test_missing.py index 9bf5d24085697..045f00a17933f 100644 --- a/pandas/tests/frame/test_missing.py +++ b/pandas/tests/frame/test_missing.py @@ -618,10 +618,6 @@ def test_fillna_dict_series(self): expected = df.fillna(df.max().to_dict()) tm.assert_frame_equal(result, expected) - # disable this for now - with pytest.raises(NotImplementedError, match="column by column"): - df.fillna(df.max(1), axis=1) - def test_fillna_dataframe(self): # GH 8377 df = DataFrame( @@ -710,3 +706,94 @@ def test_fill_corner(self, float_frame, float_string_frame): # TODO(wesm): unused? result = empty_float.fillna(value=0) # noqa + + @pytest.mark.parametrize( + "expected,fill_value", + [ + ( + DataFrame( + [[100, 100], [200, 4], [5, 6]], columns=list("AB"), dtype="float64" + ), + Series([100, 200, 300]), + ), + ( + DataFrame( + [[100, 100], [np.nan, 4], [5, 6]], + columns=list("AB"), + dtype="float64", + ), + {0: 100, 2: 300, 3: 400}, + ), + ], + ) + def test_fillna_column_wise(self, expected, fill_value): + # GH 4514 + df = DataFrame([[np.nan, np.nan], [np.nan, 4], [5, 6]], columns=list("AB")) + result = df.fillna(fill_value, axis=1) + tm.assert_frame_equal(expected, result) + + def test_fillna_column_wise_downcast(self): + # GH 4514 + df = DataFrame([[np.nan, 2], [3, np.nan], [np.nan, np.nan]], columns=list("AB")) + s = Series([100, 200, 300]) + + expected = DataFrame( + [[100, 2], [3, 200], [300, 300]], columns=list("AB"), dtype="int64" + ) + result = df.fillna(s, axis=1, downcast="infer") + tm.assert_frame_equal(expected, result) + + @pytest.mark.parametrize( + "fill_value", [Series([100, 200, 300]), {0: 100, 2: 300, 3: 400}] + ) + def test_fillna_column_wise_inplace(self, fill_value): + # GH 4514 + df = DataFrame([[np.nan, np.nan], [np.nan, 4], [5, 6]], columns=list("AB")) + expected = df.fillna(fill_value, axis=1, inplace=False) + df.fillna(fill_value, axis=1, inplace=True) + tm.assert_frame_equal(expected, df) + + @pytest.mark.parametrize( + "fill_value", + [Series([100, 200, 300], index=[0, 1, 2]), {0: 100, 1: 200, 2: 300}], + ) + def test_fillna_column_wise_duplicated_with_series_dict(self, fill_value): + # GH 4514 + df = DataFrame( + [[np.nan, np.nan, 3], [np.nan, 5, np.nan], [7, np.nan, np.nan]], + columns=list("ABB"), + index=[0, 0, 1], + ) + expected = DataFrame( + [[100, 100, 3], [100, 5, 100], [7, 200, 200]], + columns=list("ABB"), + index=[0, 0, 1], + dtype="float64", + ) + + result = df.fillna(fill_value, axis=1) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize( + "fill_value", + [ + Series([100, 200, 300], index=["A", "B", "C"]), + {"A": 100, "B": 200, "C": 300}, + ], + ) + def test_fillna_duplicated_with_series_dict(self, fill_value): + # GH 4514 + df = DataFrame( + [[np.nan, np.nan, 3], [np.nan, 5, np.nan], [7, np.nan, np.nan]], + columns=list("ABB"), + index=[0, 0, 1], + ) + expected = DataFrame( + [[100, 200, 3], [100, 5, 200], [7, 200, 200]], + columns=list("ABB"), + index=[0, 0, 1], + dtype="float64", + ) + + result = df.fillna(fill_value) + tm.assert_frame_equal(result, expected) From bf953c7e9e3559c9ee30e1d65e6951754b63c8d9 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Mon, 7 Dec 2020 23:11:15 -0500 Subject: [PATCH 2/7] fix merge error --- doc/source/whatsnew/v1.2.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d8a47472dc299..06d1dc5ffc1db 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -317,7 +317,6 @@ of columns could result in a larger Series result. See (:issue:`37799`). df = pd.DataFrame({"A": ["foo", "bar"], "B": [True, False]}, dtype=object) df["C"] = pd.Series([True, True]) ->>>>>>> 862cd05df4452592a99dd1a4fa10ce8cfb3766f7 *Previous behavior*: From 4df35015db5d73dbb5ffebf5d915d7cad75db9c3 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Mon, 7 Dec 2020 23:17:05 -0500 Subject: [PATCH 3/7] simplify code block --- pandas/core/generic.py | 21 +++++++-------------- 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 39222bf85cf59..bdbd9282180e6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6400,20 +6400,13 @@ def fillna( elif isinstance(value, (dict, ABCSeries)): temp_data = self if inplace else self.copy() - if axis == 1: - for i, item in enumerate(temp_data.items()): - label, content = item - temp_data.iloc[:, i] = content.fillna( - value, limit=limit, inplace=False, downcast=downcast - ) - else: - for i, item in enumerate(temp_data.items()): - label, content = item - if label not in value: - continue - temp_data.iloc[:, i] = content.fillna( - value[label], limit=limit, inplace=False, downcast=downcast - ) + for i, (label, content) in enumerate(temp_data.items()): + if axis == 0 and label not in value: + continue + fill_val = value[label] if axis == 0 else value + temp_data.iloc[:, i] = content.fillna( + fill_val, limit=limit, inplace=False, downcast=downcast + ) temp_data = temp_data.infer_objects() new_data = temp_data._mgr From 16cceb034932a736daad09e599f8f10b54e55dd2 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Mon, 7 Dec 2020 23:22:48 -0500 Subject: [PATCH 4/7] simplify more --- pandas/core/generic.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index bdbd9282180e6..03b5f13f9ac60 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6398,18 +6398,20 @@ def fillna( ) elif isinstance(value, (dict, ABCSeries)): - temp_data = self if inplace else self.copy() + tmp = self if inplace else self.copy() - for i, (label, content) in enumerate(temp_data.items()): + for i, (label, content) in enumerate(tmp.items()): if axis == 0 and label not in value: continue - fill_val = value[label] if axis == 0 else value - temp_data.iloc[:, i] = content.fillna( - fill_val, limit=limit, inplace=False, downcast=downcast + tmp.iloc[:, i] = content.fillna( + value[label] if axis == 0 else value, + limit=limit, + inplace=False, + downcast=downcast, ) - temp_data = temp_data.infer_objects() - new_data = temp_data._mgr + tmp = tmp.infer_objects() + new_data = tmp._mgr elif not is_list_like(value): new_data = self._mgr.fillna( From d061f6fd7d38de3d3f075e1954ce21fab2534c87 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Mon, 7 Dec 2020 23:33:24 -0500 Subject: [PATCH 5/7] fix test --- pandas/tests/frame/methods/test_fillna.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/frame/methods/test_fillna.py b/pandas/tests/frame/methods/test_fillna.py index d305e4940be0c..2089dfa58e837 100644 --- a/pandas/tests/frame/methods/test_fillna.py +++ b/pandas/tests/frame/methods/test_fillna.py @@ -612,6 +612,7 @@ def test_fillna_duplicated_with_series_dict(self, fill_value): result = df.fillna(fill_value) tm.assert_frame_equal(result, expected) + def test_fillna_nonconsolidated_frame(): # https://github.com/pandas-dev/pandas/issues/36495 df = DataFrame( From 55cee45a0a191eab40cb62028adcca0afed7f6a7 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Mon, 14 Dec 2020 12:47:47 -0500 Subject: [PATCH 6/7] move whatsnew to 1.3 --- doc/source/whatsnew/v1.3.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 05d3b1c797375..70ebe81e124fe 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -21,6 +21,7 @@ Other enhancements - Added :meth:`MultiIndex.dtypes` (:issue:`37062`) - Improve error message when ``usecols`` and ``names`` do not match for :func:`read_csv` and ``engine="c"`` (:issue:`29042`) +- :meth:`DataFrame.fillna` can fill NA values column-wise with a dictionary or :class:`Series` (:issue:`4514`) .. --------------------------------------------------------------------------- From dc66a0b8ece7c80206c0eae9488701ab21208702 Mon Sep 17 00:00:00 2001 From: Andrew Wieteska Date: Mon, 14 Dec 2020 12:48:03 -0500 Subject: [PATCH 7/7] move whatsnew to 1.3 --- doc/source/whatsnew/v1.2.0.rst | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index d2a3aaf3495be..e2521cedb64cc 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -275,7 +275,6 @@ Other enhancements - Added :meth:`~DataFrame.set_flags` for setting table-wide flags on a Series or DataFrame (:issue:`28394`) - :meth:`DataFrame.applymap` now supports ``na_action`` (:issue:`23803`) - :class:`Index` with object dtype supports division and multiplication (:issue:`34160`) -- :meth:`DataFrame.fillna` can fill NA values column-wise with a dictionary or :class:`Series` (:issue:`4514`) - :meth:`io.sql.get_schema` now supports a ``schema`` keyword argument that will add a schema into the create table statement (:issue:`28486`) - :meth:`DataFrame.explode` and :meth:`Series.explode` now support exploding of sets (:issue:`35614`) - :meth:`DataFrame.hist` now supports time series (datetime) data (:issue:`32590`)