BUG: nlargest/nsmallest gave wrong result (#22752)

troels · troels · commit 25a4ef35d3de · 2018-09-23T01:47:57.000+02:00
When asking for the n largest/smallest rows in a dataframe
nlargest/nsmallest sometimes failed to differentiate
the correct result based on the latter columns.
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -804,6 +804,7 @@ Other
 - :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`)
 - Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`)
 - :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`)
+- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`)
 - :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly.
 - Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`)
 -
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -1221,34 +1221,35 @@ def get_indexer(current_indexer, other_indexer):
             # and break
             # Otherwise we must save the index of the non duplicated values
             # and set the next cur_frame to cur_frame filtered on all
-            # duplcicated values (#GH15297)
+            # duplicated values (#GH15297)
             series = cur_frame[column]
-            values = getattr(series, method)(cur_n, keep=self.keep)
             is_last_column = len(columns) - 1 == i
-            if is_last_column or values.nunique() == series.isin(values).sum():
+            values = getattr(series, method)(
+                cur_n,
+                keep=self.keep if is_last_column else 'all')
 
-                # Last column in columns or values are unique in
-                # series => values
-                # is all that matters
+            if is_last_column or len(values) <= cur_n:
                 indexer = get_indexer(indexer, values.index)
                 break
 
-            duplicated_filter = series.duplicated(keep=False)
-            duplicated = values[duplicated_filter]
-            non_duplicated = values[~duplicated_filter]
-            indexer = get_indexer(indexer, non_duplicated.index)
+            last_value = values == values[values.index[-1]]
+            safe_values = values[~last_value]
+            unsafe_values = values[last_value]
 
-            # Must set cur frame to include all duplicated values
-            # to consider for the next column, we also can reduce
-            # cur_n by the current length of the indexer
-            cur_frame = cur_frame[series.isin(duplicated)]
+            indexer = get_indexer(indexer, safe_values.index)
+            cur_frame = cur_frame.loc[unsafe_values.index]
             cur_n = n - len(indexer)
 
         frame = frame.take(indexer)
 
         # Restore the index on frame
         frame.index = original_index.take(indexer)
-        return frame
+        ascending = method == 'nsmallest'
+
+        return frame.sort_values(
+            columns,
+            ascending=ascending,
+            kind='mergesort')
 
 
 # ------- ## ---- #
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -2095,6 +2095,28 @@ def test_n_all_dtypes(self, df_main_dtypes):
         df.nsmallest(2, list(set(df) - {'category_string', 'string'}))
         df.nlargest(2, list(set(df) - {'category_string', 'string'}))
 
+    @pytest.mark.parametrize('method,expected', [
+        ('nlargest',
+         pd.DataFrame({'a': [2, 2, 2, 1], 'b': [3, 2, 1, 3]},
+                      index=[2, 1, 0, 3])),
+        ('nsmallest',
+         pd.DataFrame({'a': [2, 1, 1, 1], 'b': [1, 3, 2, 1]},
+                      index=[0, 3, 4, 5]))])
+    def test_duplicates_on_starter_columns(self, method, expected):
+        # regression test for #22752
+
+        df = pd.DataFrame({
+            'a': [2, 2, 2, 1, 1, 1],
+            'b': [1, 2, 3, 3, 2, 1]
+        })
+
+        result = getattr(df, method)(
+            4, columns=['a', 'b']
+        ).sort_values(
+            ['a', 'b'], ascending=False
+        )
+        tm.assert_frame_equal(result, expected)
+
     def test_n_identical_values(self):
         # GH15297
         df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]})

Original file line number	Diff line number	Diff line change
`@@ -804,6 +804,7 @@ Other`
`804`	`804`	- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`)
`805`	`805`	- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`)
`806`	`806`	- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`)
	`807`	+- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`)
`807`	`808`	- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly.
`808`	`809`	- Logical operations ``&, \|, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`)
`809`	`810`	`-`