From 3db34b71b067f52136039c1401e17716f532ea85 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Thu, 16 Jul 2020 00:17:47 +0200 Subject: [PATCH 1/5] 35299 - added perc threshold argument --- pandas/core/frame.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0268d19e00b97..589603d46c76e 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4868,7 +4868,9 @@ def notna(self) -> "DataFrame": def notnull(self) -> "DataFrame": return ~self.isna() - def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): + def dropna( + self, axis=0, how="any", thresh=None, perc=None, subset=None, inplace=False + ): """ Remove missing values. @@ -4898,6 +4900,8 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): thresh : int, optional Require that many non-NA values. + perc : float, optional + If a column exceeds this subset : array-like, optional Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include. 
@@ -4992,6 +4996,11 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): if thresh is not None: mask = count >= thresh + elif perc is not None: + if axis == 0: + mask = agg_obj.isna().mean(axis=agg_axis) <= perc + else: + mask = agg_obj.isna().mean() <= perc elif how == "any": mask = count == len(agg_obj._get_axis(agg_axis)) elif how == "all": @@ -5002,7 +5011,13 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False): else: raise TypeError("must specify how or thresh") - result = self.loc(axis=axis)[mask] + if perc is not None: + if axis == 0: + result = self.loc[mask, :] + else: + result = self.loc[:, mask] + else: + result = self.loc(axis=axis)[mask] if inplace: self._update_inplace(result) From 384d459f573a43eb2e528f390bbd58d971a2a6ac Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Thu, 16 Jul 2020 00:18:14 +0200 Subject: [PATCH 2/5] 35299 - test for perc in dropna --- pandas/tests/extension/base/missing.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py index a5969ef961bab..fb83fbdfe99db 100644 --- a/pandas/tests/extension/base/missing.py +++ b/pandas/tests/extension/base/missing.py @@ -134,3 +134,27 @@ def test_use_inf_as_na_no_effect(self, data_missing): with pd.option_context("mode.use_inf_as_na", True): result = ser.isna() self.assert_series_equal(result, expected) + + +def test_dropna_perc(): + # GH 35299 + df = pd.DataFrame( + { + "col": ["A", "A", "B", "B"], + "A": [80, np.nan, np.nan, np.nan], + "B": [80, np.nan, 76, 67], + } + ) + + # axis = 1 + expected = pd.DataFrame({"col": ["A", "A", "B", "B"], "B": [80, np.nan, 76, 67]}) + result = df.dropna(perc=0.5, axis=1) + tm.assert_frame_equal(result, expected) + + # axis = 0 + expected = pd.DataFrame( + {"col": ["A", "B", "B"], "A": [80.0, np.nan, np.nan], "B": [80.0, 76.0, 67.0]} + ) + expected.index = [0, 2, 3] + result = df.dropna(perc=0.5) + 
tm.assert_frame_equal(result, expected) From 1085375f75c1413753852bacc1faa955e38fc038 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Thu, 16 Jul 2020 00:22:12 +0200 Subject: [PATCH 3/5] 35299 - added docstring --- pandas/core/frame.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 589603d46c76e..b3ed6db9228ce 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4901,7 +4901,8 @@ def dropna( thresh : int, optional Require that many non-NA values. perc : float, optional - If a column exceeds this + If a column or row exceeds this percentage threshold NA-values + it will be dropped subset : array-like, optional Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include. From c2f0386a21bdd2731e155e3b4437f8536706f3f1 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Thu, 16 Jul 2020 00:25:17 +0200 Subject: [PATCH 4/5] 35299 - removed redundant code --- pandas/core/frame.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b3ed6db9228ce..2f06ad34797ee 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -5012,13 +5012,7 @@ def dropna( else: raise TypeError("must specify how or thresh") - if perc is not None: - if axis == 0: - result = self.loc[mask, :] - else: - result = self.loc[:, mask] - else: - result = self.loc(axis=axis)[mask] + result = self.loc(axis=axis)[mask] if inplace: self._update_inplace(result) From 388635454c163113a104fc63aad846efca06a495 Mon Sep 17 00:00:00 2001 From: Erfan Nariman Date: Thu, 16 Jul 2020 00:27:05 +0200 Subject: [PATCH 5/5] 35299 - fixed docstring --- pandas/core/frame.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 2f06ad34797ee..3b42d4d9e00e4 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -4901,8 +4901,8 @@ def dropna( thresh 
: int, optional Require that many non-NA values. perc : float, optional - If a column or row exceeds this percentage threshold NA-values - it will be dropped + If the fraction of NA values in a row or column exceeds this + threshold (e.g. 0.5 for 50%), that row or column is dropped. subset : array-like, optional Labels along other axis to consider, e.g. if you are dropping rows these would be a list of columns to include.