Skip to content

Commit 25a4ef3

Browse files
committed
BUG: nlargest/nsmallest gave wrong result (#22752)
When asking for the n largest/smallest rows in a dataframe nlargest/nsmallest sometimes failed to differentiate the correct result based on the latter columns.
1 parent 1a12c41 commit 25a4ef3

File tree

3 files changed

+39
-15
lines changed

3 files changed

+39
-15
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -804,6 +804,7 @@ Other
804804
- :meth:`~pandas.io.formats.style.Styler.background_gradient` now takes a ``text_color_threshold`` parameter to automatically lighten the text color based on the luminance of the background color. This improves readability with dark background colors without the need to limit the background colormap range. (:issue:`21258`)
805805
- Require at least 0.28.2 version of ``cython`` to support read-only memoryviews (:issue:`21688`)
806806
- :meth:`~pandas.io.formats.style.Styler.background_gradient` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` (:issue:`15204`)
807+
- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`)
807808
- :meth:`~pandas.io.formats.style.Styler.bar` now also supports tablewise application (in addition to rowwise and columnwise) with ``axis=None`` and setting clipping range with ``vmin`` and ``vmax`` (:issue:`21548` and :issue:`21526`). ``NaN`` values are also handled properly.
808809
- Logical operations ``&, |, ^`` between :class:`Series` and :class:`Index` will no longer raise ``ValueError`` (:issue:`22092`)
809810
-

pandas/core/algorithms.py

+16-15
Original file line numberDiff line numberDiff line change
@@ -1221,34 +1221,35 @@ def get_indexer(current_indexer, other_indexer):
12211221
# and break
12221222
# Otherwise we must save the index of the non duplicated values
12231223
# and set the next cur_frame to cur_frame filtered on all
1224-
# duplcicated values (#GH15297)
1224+
# duplicated values (#GH15297)
12251225
series = cur_frame[column]
1226-
values = getattr(series, method)(cur_n, keep=self.keep)
12271226
is_last_column = len(columns) - 1 == i
1228-
if is_last_column or values.nunique() == series.isin(values).sum():
1227+
values = getattr(series, method)(
1228+
cur_n,
1229+
keep=self.keep if is_last_column else 'all')
12291230

1230-
# Last column in columns or values are unique in
1231-
# series => values
1232-
# is all that matters
1231+
if is_last_column or len(values) <= cur_n:
12331232
indexer = get_indexer(indexer, values.index)
12341233
break
12351234

1236-
duplicated_filter = series.duplicated(keep=False)
1237-
duplicated = values[duplicated_filter]
1238-
non_duplicated = values[~duplicated_filter]
1239-
indexer = get_indexer(indexer, non_duplicated.index)
1235+
last_value = values == values[values.index[-1]]
1236+
safe_values = values[~last_value]
1237+
unsafe_values = values[last_value]
12401238

1241-
# Must set cur frame to include all duplicated values
1242-
# to consider for the next column, we also can reduce
1243-
# cur_n by the current length of the indexer
1244-
cur_frame = cur_frame[series.isin(duplicated)]
1239+
indexer = get_indexer(indexer, safe_values.index)
1240+
cur_frame = cur_frame.loc[unsafe_values.index]
12451241
cur_n = n - len(indexer)
12461242

12471243
frame = frame.take(indexer)
12481244

12491245
# Restore the index on frame
12501246
frame.index = original_index.take(indexer)
1251-
return frame
1247+
ascending = method == 'nsmallest'
1248+
1249+
return frame.sort_values(
1250+
columns,
1251+
ascending=ascending,
1252+
kind='mergesort')
12521253

12531254

12541255
# ------- ## ---- #

pandas/tests/frame/test_analytics.py

+22
Original file line numberDiff line numberDiff line change
@@ -2095,6 +2095,28 @@ def test_n_all_dtypes(self, df_main_dtypes):
20952095
df.nsmallest(2, list(set(df) - {'category_string', 'string'}))
20962096
df.nlargest(2, list(set(df) - {'category_string', 'string'}))
20972097

2098+
@pytest.mark.parametrize('method,expected', [
2099+
('nlargest',
2100+
pd.DataFrame({'a': [2, 2, 2, 1], 'b': [3, 2, 1, 3]},
2101+
index=[2, 1, 0, 3])),
2102+
('nsmallest',
2103+
pd.DataFrame({'a': [2, 1, 1, 1], 'b': [1, 3, 2, 1]},
2104+
index=[0, 3, 4, 5]))])
2105+
def test_duplicates_on_starter_columns(self, method, expected):
2106+
# regression test for #22752
2107+
2108+
df = pd.DataFrame({
2109+
'a': [2, 2, 2, 1, 1, 1],
2110+
'b': [1, 2, 3, 3, 2, 1]
2111+
})
2112+
2113+
result = getattr(df, method)(
2114+
4, columns=['a', 'b']
2115+
).sort_values(
2116+
['a', 'b'], ascending=False
2117+
)
2118+
tm.assert_frame_equal(result, expected)
2119+
20982120
def test_n_identical_values(self):
20992121
# GH15297
21002122
df = pd.DataFrame({'a': [1] * 5, 'b': [1, 2, 3, 4, 5]})

0 commit comments

Comments
 (0)