From 63218fde8630f8ea474ef83191f010817f6e327d Mon Sep 17 00:00:00 2001 From: Troels Nielsen Date: Wed, 19 Sep 2018 00:44:34 +0200 Subject: [PATCH] BUG: nlargest/nsmallest gave wrong result (#22752) When asking for the n largest/smallest rows in a dataframe nlargest/nsmallest sometimes failed to differentiate the correct result based on the latter columns. --- asv_bench/benchmarks/frame_methods.py | 13 ++++-- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/core/algorithms.py | 57 +++++++++++++++++---------- pandas/tests/frame/test_analytics.py | 18 +++++++++ 4 files changed, 65 insertions(+), 24 deletions(-) diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index 1819cfa2725db..f911d506b1f4f 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -505,14 +505,21 @@ class NSort(object): param_names = ['keep'] def setup(self, keep): - self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC')) + self.df = DataFrame(np.random.randn(100000, 3), + columns=list('ABC')) - def time_nlargest(self, keep): + def time_nlargest_one_column(self, keep): self.df.nlargest(100, 'A', keep=keep) - def time_nsmallest(self, keep): + def time_nlargest_two_columns(self, keep): + self.df.nlargest(100, ['A', 'B'], keep=keep) + + def time_nsmallest_one_column(self, keep): self.df.nsmallest(100, 'A', keep=keep) + def time_nsmallest_two_columns(self, keep): + self.df.nsmallest(100, ['A', 'B'], keep=keep) + class Describe(object): diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index ed1bf0a4f8394..2b44d1fc21f51 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -814,6 +814,7 @@ Other - :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`) - Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`) - :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`) +- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) - :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly. - Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`) - diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index d39e9e08e2947..e91cc8ec1e996 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -1214,41 +1214,56 @@ def get_indexer(current_indexer, other_indexer): indexer = Int64Index([]) for i, column in enumerate(columns): - # For each column we apply method to cur_frame[column]. - # If it is the last column in columns, or if the values - # returned are unique in frame[column] we save this index - # and break - # Otherwise we must save the index of the non duplicated values - # and set the next cur_frame to cur_frame filtered on all - # duplcicated values (#GH15297) + # If it's the last column or if we have the number of + # results desired we are done. + # Otherwise there are duplicates of the largest/smallest + # value and we need to look at the rest of the columns + # to determine which of the rows with the largest/smallest + # value in the column to keep. series = cur_frame[column] - values = getattr(series, method)(cur_n, keep=self.keep) is_last_column = len(columns) - 1 == i - if is_last_column or values.nunique() == series.isin(values).sum(): + values = getattr(series, method)( + cur_n, + keep=self.keep if is_last_column else 'all') - # Last column in columns or values are unique in - # series => values - # is all that matters + if is_last_column or len(values) <= cur_n: indexer = get_indexer(indexer, values.index) break - duplicated_filter = series.duplicated(keep=False) - duplicated = values[duplicated_filter] - non_duplicated = values[~duplicated_filter] - indexer = get_indexer(indexer, non_duplicated.index) + # Now find all values which are equal to + # the (nsmallest: largest)/(nlarrgest: smallest) + # from our series. + border_value = values == values[values.index[-1]] + + # Some of these values are among the top-n + # some aren't. + unsafe_values = values[border_value] + + # These values are definitely among the top-n + safe_values = values[~border_value] + indexer = get_indexer(indexer, safe_values.index) - # Must set cur frame to include all duplicated values - # to consider for the next column, we also can reduce - # cur_n by the current length of the indexer - cur_frame = cur_frame[series.isin(duplicated)] + # Go on and separate the unsafe_values on the remaining + # columns. + cur_frame = cur_frame.loc[unsafe_values.index] cur_n = n - len(indexer) frame = frame.take(indexer) # Restore the index on frame frame.index = original_index.take(indexer) - return frame + + # If there is only one column, the frame is already sorted. + if len(columns) == 1: + return frame + + ascending = method == 'nsmallest' + + return frame.sort_values( + columns, + ascending=ascending, + kind='mergesort') # ------- ## ---- # diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 52a52a1fd8752..baebf414969be 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -2095,6 +2095,24 @@ def test_n_all_dtypes(self, df_main_dtypes): df.nsmallest(2, list(set(df) - {'category_string', 'string'})) df.nlargest(2, list(set(df) - {'category_string', 'string'})) + @pytest.mark.parametrize('method,expected', [ + ('nlargest', + pd.DataFrame({'a': [2, 2, 2, 1], 'b': [3, 2, 1, 3]}, + index=[2, 1, 0, 3])), + ('nsmallest', + pd.DataFrame({'a': [1, 1, 1, 2], 'b': [1, 2, 3, 1]}, + index=[5, 4, 3, 0]))]) + def test_duplicates_on_starter_columns(self, method, expected): + # regression test for #22752 + + df = pd.DataFrame({ + 'a': [2, 2, 2, 1, 1, 1], + 'b': [1, 2, 3, 3, 2, 1] + }) + + result = getattr(df, method)(4, columns=['a', 'b']) + tm.assert_frame_equal(result, expected) + def test_n_identical_values(self): # GH15297 df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]})