From aa6d86776167ec4651caffc49f3b2820e1d6de70 Mon Sep 17 00:00:00 2001 From: Yuanhao Geng <41546976+GYHHAHA@users.noreply.github.com> Date: Sun, 10 Apr 2022 11:59:37 -0500 Subject: [PATCH] Backport PR #46656: BUG: df.nsmallest get wrong results when NaN in the sorting column --- doc/source/whatsnew/v1.4.3.rst | 1 + pandas/core/algorithms.py | 6 +++++- pandas/tests/frame/methods/test_nlargest.py | 21 ++++++++++++++++++++ pandas/tests/series/methods/test_nlargest.py | 12 +++++++++++ 4 files changed, 39 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.4.3.rst b/doc/source/whatsnew/v1.4.3.rst index 8572c136c28a9..0c326e15d90ed 100644 --- a/doc/source/whatsnew/v1.4.3.rst +++ b/doc/source/whatsnew/v1.4.3.rst @@ -14,6 +14,7 @@ including other versions of pandas. Fixed regressions ~~~~~~~~~~~~~~~~~ +- Fixed regression in :meth:`DataFrame.nsmallest` that led to wrong results when the sorting column contained ``np.nan`` (:issue:`46589`) - Fixed regression in :func:`read_fwf` raising ``ValueError`` when ``widths`` was specified with ``usecols`` (:issue:`46580`) - diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 36eabe93dbd7e..32e3e19688a63 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1216,7 +1216,6 @@ def compute(self, method: str) -> Series: arr = arr[::-1] nbase = n - findex = len(self.obj) narr = len(arr) n = min(n, narr) @@ -1229,6 +1228,11 @@ def compute(self, method: str) -> Series: if self.keep != "all": inds = inds[:n] findex = nbase + else: + if len(inds) < nbase and len(nan_index) + len(inds) >= nbase: + findex = len(nan_index) + len(inds) + else: + findex = len(inds) if self.keep == "last": # reverse indices diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 1b2db80d782ce..a317dae562ae0 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -216,3 +216,24 @@ def test_nlargest_nan(self): result = 
df.nlargest(5, 0) expected = df.sort_values(0, ascending=False).head(5) tm.assert_frame_equal(result, expected) + + def test_nsmallest_nan_after_n_element(self): + # GH#46589 + df = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5, None, 7], + "b": [7, 6, 5, 4, 3, 2, 1], + "c": [1, 1, 2, 2, 3, 3, 3], + }, + index=range(7), + ) + result = df.nsmallest(5, columns=["a", "b"]) + expected = pd.DataFrame( + { + "a": [1, 2, 3, 4, 5], + "b": [7, 6, 5, 4, 3], + "c": [1, 1, 2, 2, 3], + }, + index=range(5), + ).astype({"a": "float"}) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index ee96ab08ad66c..4f07257038bc9 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -231,3 +231,15 @@ def test_nlargest_nullable(self, any_numeric_ea_dtype): .astype(dtype) ) tm.assert_series_equal(result, expected) + + def test_nsmallest_nan_when_keep_is_all(self): + # GH#46589 + s = Series([1, 2, 3, 3, 3, None]) + result = s.nsmallest(3, keep="all") + expected = Series([1.0, 2.0, 3.0, 3.0, 3.0]) + tm.assert_series_equal(result, expected) + + s = Series([1, 2, None, None, None]) + result = s.nsmallest(3, keep="all") + expected = Series([1, 2, None, None, None]) + tm.assert_series_equal(result, expected)