diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index f52341ed782d8..45921f18b27bd 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -4874,7 +4874,9 @@ def notna(self) -> "DataFrame":
     def notnull(self) -> "DataFrame":
         return ~self.isna()
 
-    def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
+    def dropna(
+        self, axis=0, how="any", thresh=None, perc=None, subset=None, inplace=False
+    ):
         """
         Remove missing values.
 
@@ -4904,6 +4906,10 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
 
         thresh : int, optional
             Require that many non-NA values.
+        perc : float, optional
+            Maximum fraction of NA values, between 0 and 1, that a row or
+            column may contain and still be kept; rows or columns with a
+            larger fraction are dropped. Ignored if `thresh` is given.
         subset : array-like, optional
             Labels along other axis to consider, e.g. if you are dropping rows
             these would be a list of columns to include.
@@ -4998,6 +5004,11 @@ def dropna(self, axis=0, how="any", thresh=None, subset=None, inplace=False):
 
         if thresh is not None:
             mask = count >= thresh
+        elif perc is not None:
+            if not 0 <= perc <= 1:
+                raise ValueError(f"perc must be between 0 and 1, got {perc}")
+            # agg_axis already points along the labels being aggregated
+            mask = agg_obj.isna().mean(axis=agg_axis) <= perc
         elif how == "any":
             mask = count == len(agg_obj._get_axis(agg_axis))
         elif how == "all":
diff --git a/pandas/tests/extension/base/missing.py b/pandas/tests/extension/base/missing.py
index a5969ef961bab..fb83fbdfe99db 100644
--- a/pandas/tests/extension/base/missing.py
+++ b/pandas/tests/extension/base/missing.py
@@ -134,3 +134,27 @@ def test_use_inf_as_na_no_effect(self, data_missing):
         with pd.option_context("mode.use_inf_as_na", True):
             result = ser.isna()
             self.assert_series_equal(result, expected)
+
+
+def test_dropna_perc():
+    # GH 35299
+    df = pd.DataFrame(
+        {
+            "col": ["A", "A", "B", "B"],
+            "A": [80, np.nan, np.nan, np.nan],
+            "B": [80, np.nan, 76, 67],
+        }
+    )
+
+    # axis = 1
+    expected = pd.DataFrame({"col": ["A", "A", "B", "B"], "B": [80, np.nan, 76, 67]})
+    result = df.dropna(perc=0.5, axis=1)
+    tm.assert_frame_equal(result, expected)
+
+    # axis = 0
+    expected = pd.DataFrame(
+        {"col": ["A", "B", "B"], "A": [80.0, np.nan, np.nan], "B": [80.0, 76.0, 67.0]}
+    )
+    expected.index = [0, 2, 3]
+    result = df.dropna(perc=0.5)
+    tm.assert_frame_equal(result, expected)