BUG: nlargest/nsmallest can now consider nan values like sort_values(ascending=True).head(n) (#43060)

usersblock · web-flow · commit 16d2f59589cf · 2021-09-07T19:17:38.000-04:00
diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst
@@ -361,7 +361,7 @@ Indexing
 - Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`)
 - Bug in :meth:`DataFrame.query` did not handle the degree sign in a backticked column name, such as \`Temp(°C)\`, used in an expression to query a dataframe (:issue:`42826`)
 - Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`)
--
+- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where the sorted result did not include entries whose values are ``np.nan`` (:issue:`28984`)
 
 
 Missing
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1252,6 +1252,8 @@ class SelectNSeries(SelectN):
 
     def compute(self, method: str) -> Series:
 
+        from pandas.core.reshape.concat import concat
+
         n = self.n
         dtype = self.obj.dtype
         if not self.is_valid_dtype_n_method(dtype):
@@ -1261,6 +1263,7 @@ def compute(self, method: str) -> Series:
             return self.obj[[]]
 
         dropped = self.obj.dropna()
+        nan_index = self.obj.drop(dropped.index)
 
         if is_extension_array_dtype(dropped.dtype):
             # GH#41816 bc we have dropped NAs above, MaskedArrays can use the
@@ -1277,7 +1280,7 @@ def compute(self, method: str) -> Series:
         # slow method
         if n >= len(self.obj):
             ascending = method == "nsmallest"
-            return dropped.sort_values(ascending=ascending).head(n)
+            return self.obj.sort_values(ascending=ascending).head(n)
 
         # fast method
         new_dtype = dropped.dtype
@@ -1295,6 +1298,8 @@ def compute(self, method: str) -> Series:
         if self.keep == "last":
             arr = arr[::-1]
 
+        nbase = n
+        findex = len(self.obj)
         narr = len(arr)
         n = min(n, narr)
 
@@ -1306,12 +1311,13 @@ def compute(self, method: str) -> Series:
 
         if self.keep != "all":
             inds = inds[:n]
+            findex = nbase
 
         if self.keep == "last":
             # reverse indices
             inds = narr - 1 - inds
 
-        return dropped.iloc[inds]
+        return concat([dropped.iloc[inds], nan_index]).iloc[:findex]
 
 
 class SelectNFrame(SelectN):
diff --git a/pandas/tests/frame/methods/test_nlargest.py b/pandas/tests/frame/methods/test_nlargest.py
@@ -209,3 +209,10 @@ def test_nlargest_multiindex_column_lookup(self):
         result = df.nlargest(3, ("x", "b"))
         expected = df.iloc[[3, 2, 1]]
         tm.assert_frame_equal(result, expected)
+
+    def test_nlargest_nan(self):
+        # GH#43060
+        df = pd.DataFrame([np.nan, np.nan, 0, 1, 2, 3])
+        result = df.nlargest(5, 0)
+        expected = df.sort_values(0, ascending=False).head(5)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/groupby/test_apply.py b/pandas/tests/groupby/test_apply.py
@@ -1145,3 +1145,15 @@ def test_doctest_example2():
         {"B": [1.0, 0.0], "C": [2.0, 0.0]}, index=Index(["a", "b"], name="A")
     )
     tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("dropna", [True, False])
+def test_apply_na(dropna):
+    # GH#28984
+    df = DataFrame(
+        {"grp": [1, 1, 2, 2], "y": [1, 0, 2, 5], "z": [1, 2, np.nan, np.nan]}
+    )
+    dfgrp = df.groupby("grp", dropna=dropna)
+    result = dfgrp.apply(lambda grp_df: grp_df.nlargest(1, "z"))
+    expected = dfgrp.apply(lambda x: x.sort_values("z", ascending=False).head(1))
+    tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/methods/test_nlargest.py b/pandas/tests/series/methods/test_nlargest.py
@@ -127,8 +127,12 @@ def test_nsmallest_nlargest(self, s_main_dtypes_split):
     def test_nlargest_misc(self):
 
         ser = Series([3.0, np.nan, 1, 2, 5])
-        tm.assert_series_equal(ser.nlargest(), ser.iloc[[4, 0, 3, 2]])
-        tm.assert_series_equal(ser.nsmallest(), ser.iloc[[2, 3, 0, 4]])
+        result = ser.nlargest()
+        expected = ser.iloc[[4, 0, 3, 2, 1]]
+        tm.assert_series_equal(result, expected)
+        result = ser.nsmallest()
+        expected = ser.iloc[[2, 3, 0, 4, 1]]
+        tm.assert_series_equal(result, expected)
 
         msg = 'keep must be either "first", "last"'
         with pytest.raises(ValueError, match=msg):