Skip to content

BUG: nlargest/nsmallest gave wrong result (#22752) #22754

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Sep 25, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -505,14 +505,21 @@ class NSort(object):
param_names = ['keep']

def setup(self, keep):
self.df = DataFrame(np.random.randn(1000, 3), columns=list('ABC'))
self.df = DataFrame(np.random.randn(100000, 3),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

do the benchmarks show any difference? (e.g. from the prior impl)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it's significantly faster now. Especially, if I create large dataframes with many columns and many duplicates.

If sorting only on one column the speed should be exactly the same.

columns=list('ABC'))

def time_nlargest(self, keep):
def time_nlargest_one_column(self, keep):
self.df.nlargest(100, 'A', keep=keep)

def time_nsmallest(self, keep):
def time_nlargest_two_columns(self, keep):
self.df.nlargest(100, ['A', 'B'], keep=keep)

def time_nsmallest_one_column(self, keep):
self.df.nsmallest(100, 'A', keep=keep)

def time_nsmallest_two_columns(self, keep):
self.df.nsmallest(100, ['A', 'B'], keep=keep)


class Describe(object):

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -814,6 +814,7 @@ Other
- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`)
- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`)
- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`)
- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`)
- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly.
- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`)
-
Expand Down
57 changes: 36 additions & 21 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -1214,41 +1214,56 @@ def get_indexer(current_indexer, other_indexer):
indexer = Int64Index([])

for i, column in enumerate(columns):

# For each column we apply method to cur_frame[column].
# If it is the last column in columns, or if the values
# returned are unique in frame[column] we save this index
# and break
# Otherwise we must save the index of the non duplicated values
# and set the next cur_frame to cur_frame filtered on all
# duplcicated values (#GH15297)
# If it's the last column or if we have the number of
# results desired we are done.
# Otherwise there are duplicates of the largest/smallest
# value and we need to look at the rest of the columns
# to determine which of the rows with the largest/smallest
# value in the column to keep.
series = cur_frame[column]
values = getattr(series, method)(cur_n, keep=self.keep)
is_last_column = len(columns) - 1 == i
if is_last_column or values.nunique() == series.isin(values).sum():
values = getattr(series, method)(
cur_n,
keep=self.keep if is_last_column else 'all')

# Last column in columns or values are unique in
# series => values
# is all that matters
if is_last_column or len(values) <= cur_n:
indexer = get_indexer(indexer, values.index)
break

duplicated_filter = series.duplicated(keep=False)
duplicated = values[duplicated_filter]
non_duplicated = values[~duplicated_filter]
indexer = get_indexer(indexer, non_duplicated.index)
# Now find all values which are equal to
# the (nsmallest: largest)/(nlarrgest: smallest)
# from our series.
border_value = values == values[values.index[-1]]

# Some of these values are among the top-n
# some aren't.
unsafe_values = values[border_value]

# These values are definitely among the top-n
safe_values = values[~border_value]
indexer = get_indexer(indexer, safe_values.index)

# Must set cur frame to include all duplicated values
# to consider for the next column, we also can reduce
# cur_n by the current length of the indexer
cur_frame = cur_frame[series.isin(duplicated)]
# Go on and separate the unsafe_values on the remaining
# columns.
cur_frame = cur_frame.loc[unsafe_values.index]
cur_n = n - len(indexer)

frame = frame.take(indexer)

# Restore the index on frame
frame.index = original_index.take(indexer)
return frame

# If there is only one column, the frame is already sorted.
if len(columns) == 1:
return frame

ascending = method == 'nsmallest'

return frame.sort_values(
columns,
ascending=ascending,
kind='mergesort')


# ------- ## ---- #
Expand Down
18 changes: 18 additions & 0 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -2095,6 +2095,24 @@ def test_n_all_dtypes(self, df_main_dtypes):
df.nsmallest(2, list(set(df) - {'category_string', 'string'}))
df.nlargest(2, list(set(df) - {'category_string', 'string'}))

@pytest.mark.parametrize('method,expected', [
('nlargest',
pd.DataFrame({'a': [2, 2, 2, 1], 'b': [3, 2, 1, 3]},
index=[2, 1, 0, 3])),
('nsmallest',
pd.DataFrame({'a': [1, 1, 1, 2], 'b': [1, 2, 3, 1]},
index=[5, 4, 3, 0]))])
def test_duplicates_on_starter_columns(self, method, expected):
# regression test for #22752

df = pd.DataFrame({
'a': [2, 2, 2, 1, 1, 1],
'b': [1, 2, 3, 3, 2, 1]
})

result = getattr(df, method)(4, columns=['a', 'b'])
tm.assert_frame_equal(result, expected)

def test_n_identical_values(self):
# GH15297
df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]})
Expand Down