diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index f0af60f80edd5..b1752a506c486 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -288,7 +288,7 @@ Indexing - Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`) - Bug in :meth:`DataFrame.query` did not handle the degree sign in a backticked column name, such as \`Temp(°C)\`, used in an expression to query a dataframe (:issue:`42826`) - Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`) -- +- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`) Missing ^^^^^^^ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4f9dd61b8e0da..0e986aa6afb5d 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1247,6 +1247,8 @@ class SelectNSeries(SelectN): def compute(self, method: str) -> Series: + from pandas.core.reshape.concat import concat + n = self.n dtype = self.obj.dtype if not self.is_valid_dtype_n_method(dtype): @@ -1256,6 +1258,7 @@ def compute(self, method: str) -> Series: return self.obj[[]] dropped = self.obj.dropna() + nan_index = self.obj.drop(dropped.index) if is_extension_array_dtype(dropped.dtype): # GH#41816 bc we have dropped NAs above, MaskedArrays can use the @@ -1272,7 +1275,7 @@ def compute(self, method: str) -> Series: # slow method if n >= len(self.obj): ascending = method == "nsmallest" - return dropped.sort_values(ascending=ascending).head(n) + return self.obj.sort_values(ascending=ascending).head(n) # fast method new_dtype = dropped.dtype @@ -1290,6 +1293,8 @@ def compute(self, method: str) -> Series: if self.keep == "last": arr = arr[::-1] + nbase = n + findex = len(self.obj) narr = len(arr) n = min(n, narr) @@ -1301,12 +1306,13 @@ def compute(self, method: str) -> Series: if self.keep != "all": inds = inds[:n] + findex = nbase if self.keep == "last": # reverse indices inds = narr - 1 - inds - return dropped.iloc[inds] + return concat([dropped.iloc[inds], nan_index]).iloc[:findex] class SelectNFrame(SelectN): diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py index 4ce474230b686..1b2db80d782ce 100644 --- a/pandas/tests/frame/methods/test_nlargest.py +++ b/pandas/tests/frame/methods/test_nlargest.py @@ -209,3 +209,10 @@ def test_nlargest_multiindex_column_lookup(self): result = df.nlargest(3, ("x", "b")) expected = df.iloc[[3, 2, 1]] tm.assert_frame_equal(result, expected) + + def test_nlargest_nan(self): + # GH#43060 + df = pd.DataFrame([np.nan, np.nan, 0, 1, 2, 3]) + result = df.nlargest(5, 0) + expected = df.sort_values(0, ascending=False).head(5) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py index 25529e65118c8..0330bc0203185 100644 --- a/pandas/tests/groupby/test_apply.py +++ b/pandas/tests/groupby/test_apply.py @@ -1150,3 +1150,15 @@ def test_doctest_example2(): {"B": [1.0, 0.0], "C": [2.0, 0.0]}, index=Index(["a", "b"], name="A") ) tm.assert_frame_equal(result, expected) + + +@pytest.mark.parametrize("dropna", [True, False]) +def test_apply_na(dropna): + # GH#28984 + df = DataFrame( + {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]} + ) + dfgrp = df.groupby("grp", dropna=dropna) + result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z")) + expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1)) + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py index 0efb0663a0327..8b0779c44bf4c 100644 --- a/pandas/tests/series/methods/test_nlargest.py +++ b/pandas/tests/series/methods/test_nlargest.py @@ -127,8 +127,12 @@ def test_nsmallest_nlargest(self, s_main_dtypes_split): def test_nlargest_misc(self): ser = Series([3.0, np.nan, 1, 2, 5]) - tm.assert_series_equal(ser.nlargest(), ser.iloc[[4, 0, 3, 2]]) - tm.assert_series_equal(ser.nsmallest(), ser.iloc[[2, 3, 0, 4]]) + result = ser.nlargest() + expected = ser.iloc[[4, 0, 3, 2, 1]] + tm.assert_series_equal(result, expected) + result = ser.nsmallest() + expected = ser.iloc[[2, 3, 0, 4, 1]] + tm.assert_series_equal(result, expected) msg = 'keep must be either "first", "last"' with pytest.raises(ValueError, match=msg):