Fix nsmallest/nlargest With Identical Values

Roger Thomas · Roger Thomas · commit 7f8cd04439e3 · 2017-04-03T14:54:32.000+01:00
diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt
@@ -1055,6 +1055,7 @@ Reshaping
 - Bug in ``pd.pivot_table()`` where no error was raised when values argument was not in the columns (:issue:`14938`)
 - Bug in ``pd.concat()`` in which concatting with an empty dataframe with ``join='inner'`` was being improperly handled (:issue:`15328`)
 - Bug with ``sort=True`` in ``DataFrame.join`` and ``pd.merge`` when joining on indexes (:issue:`15582`)
+- Bug in ``DataFrame.nsmallest`` and ``DataFrame.nlargest`` where identical values resulted in duplicated rows (:issue:`15297`)
 
 Numeric
 ^^^^^^^
diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py
@@ -944,14 +944,64 @@ def select_n_frame(frame, columns, n, method, keep):
     -------
     nordered : DataFrame
     """
-    from pandas.core.series import Series
+    from pandas import Int64Index
     if not is_list_like(columns):
         columns = [columns]
     columns = list(columns)
-    ser = getattr(frame[columns[0]], method)(n, keep=keep)
-    if isinstance(ser, Series):
-        ser = ser.to_frame()
-    return ser.merge(frame, on=columns[0], left_index=True)[frame.columns]
+    for column in columns:
+        dtype = frame[column].dtype
+        if not issubclass(dtype.type, (np.integer, np.floating, np.datetime64,
+                                       np.timedelta64)):
+            msg = (
+                "{column!r} has dtype: {dtype}, cannot use method {method!r} "
+                "with this dtype"
+            ).format(column=column, dtype=dtype, method=method)
+            raise TypeError(msg)
+
+    # Below we save and reset the index in case index contains duplicates
+    original_index = frame.index
+    cur_frame = frame = frame.reset_index(drop=True)
+    cur_n = n
+    indexer = Int64Index([])
+    for i, column in enumerate(columns):
+
+        # For each column we apply method to cur_frame[column]. If it is the
+        # last column in columns, or if the values returned are unique in
+        # frame[column] we save this index and break
+        # Otherwise we must save the index of the non duplicated values
+        # and set the next cur_frame to cur_frame filtered on all duplicated
+        # values (#GH15297)
+        series = cur_frame[column]
+        values = getattr(series, method)(cur_n, keep=keep)
+        is_last_column = len(columns) - 1 == i
+        if is_last_column or len(values.unique()) == sum(series.isin(values)):
+
+            # Last column in columns or values are unique in series => values
+            # is all that matters
+            if method == 'nsmallest':
+                indexer = indexer.append(values.index)
+            else:
+                indexer = values.index.append(indexer)
+            break
+        duplicated_filter = series.duplicated(keep=False)
+        non_duplicated = values[~duplicated_filter]
+        duplicated = values[duplicated_filter]
+        if method == 'nsmallest':
+            indexer = indexer.append(non_duplicated.index)
+        else:
+            indexer = non_duplicated.index.append(indexer)
+
+        # Must set cur frame to include all duplicated values to consider for
+        # the next column, we also can reduce cur_n by the current length of
+        # the indexer
+        cur_frame = cur_frame[series.isin(duplicated)]
+        cur_n = n - len(indexer)
+
+    frame = frame.take(indexer)
+
+    # Restore the index on frame
+    frame.index = original_index.take(indexer)
+    return frame
 
 
 def _finalize_nsmallest(arr, kth_val, n, keep, narr):
diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py
@@ -7,11 +7,12 @@
 import sys
 import pytest
 
+from string import ascii_lowercase
 from numpy import nan
 from numpy.random import randn
 import numpy as np
 
-from pandas.compat import lrange
+from pandas.compat import lrange, product
 from pandas import (compat, isnull, notnull, DataFrame, Series,
                     MultiIndex, date_range, Timestamp)
 import pandas as pd
@@ -1120,73 +1121,6 @@ def __nonzero__(self):
                 self.assertTrue(r1.all())
 
     # ----------------------------------------------------------------------
-    # Top / bottom
-
-    def test_nlargest(self):
-        # GH10393
-        from string import ascii_lowercase
-        df = pd.DataFrame({'a': np.random.permutation(10),
-                           'b': list(ascii_lowercase[:10])})
-        result = df.nlargest(5, 'a')
-        expected = df.sort_values('a', ascending=False).head(5)
-        tm.assert_frame_equal(result, expected)
-
-    def test_nlargest_multiple_columns(self):
-        from string import ascii_lowercase
-        df = pd.DataFrame({'a': np.random.permutation(10),
-                           'b': list(ascii_lowercase[:10]),
-                           'c': np.random.permutation(10).astype('float64')})
-        result = df.nlargest(5, ['a', 'b'])
-        expected = df.sort_values(['a', 'b'], ascending=False).head(5)
-        tm.assert_frame_equal(result, expected)
-
-    def test_nsmallest(self):
-        from string import ascii_lowercase
-        df = pd.DataFrame({'a': np.random.permutation(10),
-                           'b': list(ascii_lowercase[:10])})
-        result = df.nsmallest(5, 'a')
-        expected = df.sort_values('a').head(5)
-        tm.assert_frame_equal(result, expected)
-
-    def test_nsmallest_multiple_columns(self):
-        from string import ascii_lowercase
-        df = pd.DataFrame({'a': np.random.permutation(10),
-                           'b': list(ascii_lowercase[:10]),
-                           'c': np.random.permutation(10).astype('float64')})
-        result = df.nsmallest(5, ['a', 'c'])
-        expected = df.sort_values(['a', 'c']).head(5)
-        tm.assert_frame_equal(result, expected)
-
-    def test_nsmallest_nlargest_duplicate_index(self):
-        # GH 13412
-        df = pd.DataFrame({'a': [1, 2, 3, 4],
-                           'b': [4, 3, 2, 1],
-                           'c': [0, 1, 2, 3]},
-                          index=[0, 0, 1, 1])
-        result = df.nsmallest(4, 'a')
-        expected = df.sort_values('a').head(4)
-        tm.assert_frame_equal(result, expected)
-
-        result = df.nlargest(4, 'a')
-        expected = df.sort_values('a', ascending=False).head(4)
-        tm.assert_frame_equal(result, expected)
-
-        result = df.nsmallest(4, ['a', 'c'])
-        expected = df.sort_values(['a', 'c']).head(4)
-        tm.assert_frame_equal(result, expected)
-
-        result = df.nsmallest(4, ['c', 'a'])
-        expected = df.sort_values(['c', 'a']).head(4)
-        tm.assert_frame_equal(result, expected)
-
-        result = df.nlargest(4, ['a', 'c'])
-        expected = df.sort_values(['a', 'c'], ascending=False).head(4)
-        tm.assert_frame_equal(result, expected)
-
-        result = df.nlargest(4, ['c', 'a'])
-        expected = df.sort_values(['c', 'a'], ascending=False).head(4)
-        tm.assert_frame_equal(result, expected)
-    # ----------------------------------------------------------------------
     # Isin
 
     def test_isin(self):
@@ -1965,3 +1899,111 @@ def test_dot(self):
 
         with tm.assertRaisesRegexp(ValueError, 'aligned'):
             df.dot(df2)
+
+
+@pytest.fixture
+def df_duplicates():
+    return pd.DataFrame({'a': [1, 2, 3, 4, 4],
+                         'b': [1, 1, 1, 1, 1],
+                         'c': [0, 1, 2, 5, 4]},
+                        index=[0, 0, 1, 1, 1])
+
+
+@pytest.fixture
+def df_strings():
+    return pd.DataFrame({'a': np.random.permutation(10),
+                         'b': list(ascii_lowercase[:10]),
+                         'c': np.random.permutation(10).astype('float64')})
+
+
+class TestNLargestNSmallest(object):
+
+    # ----------------------------------------------------------------------
+    # Top / bottom
+    @pytest.mark.parametrize(
+        'n, order',
+        product(range(1, 11),
+                [['a'],
+                 ['c'],
+                 ['a', 'b'],
+                 ['a', 'c'],
+                 ['b', 'a'],
+                 ['b', 'c'],
+                 ['a', 'b', 'c'],
+                 ['c', 'a', 'b'],
+                 ['c', 'b', 'a'],
+                 ['b', 'c', 'a'],
+                 ['b', 'a', 'c'],
+
+                 # dups!
+                 ['b', 'c', 'c'],
+
+                 ]))
+    def test_n(self, df_strings, n, order):
+        # GH10393: object column 'b' must raise TypeError for both methods
+        df = df_strings
+
+        error_msg = (
+            "'b' has dtype: object, cannot use method {method!r} "
+            "with this dtype"
+        )
+        if 'b' in order:
+            with pytest.raises(TypeError) as exception:
+                df.nsmallest(n, order)
+            assert error_msg.format(method='nsmallest') in str(exception.value)
+        else:
+            result = df.nsmallest(n, order)
+            expected = df.sort_values(order).head(n)
+            tm.assert_frame_equal(result, expected)
+
+        if 'b' in order:
+            with pytest.raises(TypeError) as exception:
+                df.nlargest(n, order)
+            assert error_msg.format(method='nlargest') in str(exception.value)
+        else:
+            result = df.nlargest(n, order)
+            expected = df.sort_values(order, ascending=False).head(n)
+            tm.assert_frame_equal(result, expected)
+
+    def test_n_error(self, df_strings):
+        # b alone raises a TypeError
+        df = df_strings
+        with pytest.raises(TypeError):
+            df.nsmallest(1, 'b')
+        with pytest.raises(TypeError):
+            df.nlargest(1, 'b')
+
+    def test_n_identical_values(self):
+        # GH15297
+        df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]})
+
+        result = df.nlargest(3, 'a')
+        expected = pd.DataFrame(
+            {'a': [1] * 3, 'b': [1, 2, 3]}, index=[0, 1, 2]
+        )
+        tm.assert_frame_equal(result, expected)
+
+        result = df.nsmallest(3, 'a')
+        expected = pd.DataFrame({'a': [1] * 3, 'b': [1, 2, 3]})
+        tm.assert_frame_equal(result, expected)
+
+    @pytest.mark.parametrize(
+        'n, order',
+        product([1, 2, 3, 4, 5],
+                [['a', 'b', 'c'],
+                 ['c', 'b', 'a'],
+                 ['a'],
+                 ['b'],
+                 ['a', 'b'],
+                 ['c', 'b']]))
+    def test_n_duplicate_index(self, df_duplicates, n, order):
+        # GH 13412
+
+        df = df_duplicates
+        result = df.nsmallest(n, order)
+        expected = df.sort_values(order).head(n)
+        tm.assert_frame_equal(result, expected)
+
+        result = df.nlargest(n, order)
+        expected = df.sort_values(order, ascending=False).head(n)
+        tm.assert_frame_equal(result, expected)
diff --git a/pandas/tests/series/test_analytics.py b/pandas/tests/series/test_analytics.py