Skip to content

BUG: _nsorted incorrect with duplicated values in index (#13412) #14707

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -1012,3 +1012,14 @@ def setup(self):

def time_frame_quantile_axis1(self):
self.df.quantile([0.1, 0.5], axis=1)


class frame_nlargest(object):
goal_time = 0.2

def setup(self):
self.df = DataFrame(np.random.randn(1000, 3),
columns=list('ABC'))

def time_frame_nlargest(self):
self.df.nlargest(100, 'A')
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.19.2.txt
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ Bug Fixes
- Bug in resampling a ``DatetimeIndex`` in local TZ, covering a DST change, which would raise ``AmbiguousTimeError`` (:issue:`14682`)



- Bug ``HDFStore`` writing a ``MultiIndex`` when using ``data_columns=True`` (:issue:`14435`)
- Bug in ``Series.groupby.nunique()`` raising an ``IndexError`` for an empty ``Series`` (:issue:`12553`)
- Bug in ``DataFrame.nlargest`` and ``DataFrame.nsmallest`` when the index had duplicate values (:issue:`13412`)



Expand Down
30 changes: 28 additions & 2 deletions pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -684,11 +684,12 @@ def select_n_slow(dropped, n, keep, method):
_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}


def select_n(series, n, keep, method):
"""Implement n largest/smallest.
def select_n_series(series, n, keep, method):
"""Implement n largest/smallest for pandas Series

Parameters
----------
series : pandas.Series object
n : int
keep : {'first', 'last'}, default 'first'
method : str, {'nlargest', 'nsmallest'}
Expand Down Expand Up @@ -717,6 +718,31 @@ def select_n(series, n, keep, method):
return dropped.iloc[inds]


def select_n_frame(frame, columns, n, method, keep):
"""Implement n largest/smallest for pandas DataFrame

Parameters
----------
series : pandas.DataFrame object
columns : list or str
n : int
keep : {'first', 'last'}, default 'first'
method : str, {'nlargest', 'nsmallest'}

Returns
-------
nordered : DataFrame
"""
from pandas.core.series import Series
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add doc-strings to select_n_* methods?. otherwise lgtm. ping on green.

if not is_list_like(columns):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

thanks for moving. If you are interested in next PR! would want to clean up this code w.r.t. largest/smallest a bit more anyhow; IOW, make it cleaner / nicer. (and I think we have a perf issue outstanding I think with Series).

columns = [columns]
columns = list(columns)
ser = getattr(frame[columns[0]], method)(n, keep=keep)
if isinstance(ser, Series):
ser = ser.to_frame()
return ser.merge(frame, on=columns[0], left_index=True)[frame.columns]


def _finalize_nsmallest(arr, kth_val, n, keep, narr):
ns, = np.nonzero(arr <= kth_val)
inds = ns[arr[ns].argsort(kind='mergesort')][:n]
Expand Down
13 changes: 2 additions & 11 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3390,15 +3390,6 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False,
return self.sort_index(level=level, axis=axis, ascending=ascending,
inplace=inplace, sort_remaining=sort_remaining)

def _nsorted(self, columns, n, method, keep):
if not is_list_like(columns):
columns = [columns]
columns = list(columns)
ser = getattr(self[columns[0]], method)(n, keep=keep)
ascending = dict(nlargest=False, nsmallest=True)[method]
return self.loc[ser.index].sort_values(columns, ascending=ascending,
kind='mergesort')

def nlargest(self, n, columns, keep='first'):
"""Get the rows of a DataFrame sorted by the `n` largest
values of `columns`.
Expand Down Expand Up @@ -3431,7 +3422,7 @@ def nlargest(self, n, columns, keep='first'):
1 10 b 2
2 8 d NaN
"""
return self._nsorted(columns, n, 'nlargest', keep)
return algos.select_n_frame(self, columns, n, 'nlargest', keep)

def nsmallest(self, n, columns, keep='first'):
"""Get the rows of a DataFrame sorted by the `n` smallest
Expand Down Expand Up @@ -3465,7 +3456,7 @@ def nsmallest(self, n, columns, keep='first'):
0 1 a 1
2 8 d NaN
"""
return self._nsorted(columns, n, 'nsmallest', keep)
return algos.select_n_frame(self, columns, n, 'nsmallest', keep)

def swaplevel(self, i=-2, j=-1, axis=0):
"""
Expand Down
4 changes: 2 additions & 2 deletions pandas/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1937,7 +1937,7 @@ def nlargest(self, n=5, keep='first'):
>>> s = pd.Series(np.random.randn(1e6))
>>> s.nlargest(10) # only sorts up to the N requested
"""
return algos.select_n(self, n=n, keep=keep, method='nlargest')
return algos.select_n_series(self, n=n, keep=keep, method='nlargest')

@deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
False: 'first'})
Expand Down Expand Up @@ -1975,7 +1975,7 @@ def nsmallest(self, n=5, keep='first'):
>>> s = pd.Series(np.random.randn(1e6))
>>> s.nsmallest(10) # only sorts up to the N requested
"""
return algos.select_n(self, n=n, keep=keep, method='nsmallest')
return algos.select_n_series(self, n=n, keep=keep, method='nsmallest')

def sortlevel(self, level=0, ascending=True, sort_remaining=True):
"""
Expand Down
29 changes: 29 additions & 0 deletions pandas/tests/frame/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1323,6 +1323,35 @@ def test_nsmallest_multiple_columns(self):
expected = df.sort_values(['a', 'c']).head(5)
tm.assert_frame_equal(result, expected)

def test_nsmallest_nlargest_duplicate_index(self):
# GH 13412
df = pd.DataFrame({'a': [1, 2, 3, 4],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add the issue number as a comment

'b': [4, 3, 2, 1],
'c': [0, 1, 2, 3]},
index=[0, 0, 1, 1])
result = df.nsmallest(4, 'a')
expected = df.sort_values('a').head(4)
tm.assert_frame_equal(result, expected)

result = df.nlargest(4, 'a')
expected = df.sort_values('a', ascending=False).head(4)
tm.assert_frame_equal(result, expected)

result = df.nsmallest(4, ['a', 'c'])
expected = df.sort_values(['a', 'c']).head(4)
tm.assert_frame_equal(result, expected)

result = df.nsmallest(4, ['c', 'a'])
expected = df.sort_values(['c', 'a']).head(4)
tm.assert_frame_equal(result, expected)

result = df.nlargest(4, ['a', 'c'])
expected = df.sort_values(['a', 'c'], ascending=False).head(4)
tm.assert_frame_equal(result, expected)

result = df.nlargest(4, ['c', 'a'])
expected = df.sort_values(['c', 'a'], ascending=False).head(4)
tm.assert_frame_equal(result, expected)
# ----------------------------------------------------------------------
# Isin

Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/series/test_analytics.py
Original file line number Diff line number Diff line change
Expand Up @@ -1532,6 +1532,15 @@ def test_nsmallest_nlargest(self):
with tm.assertRaisesRegexp(ValueError, msg):
s.nlargest(keep='invalid')

# GH 13412
s = Series([1, 4, 3, 2], index=[0, 0, 1, 1])
result = s.nlargest(3)
expected = s.sort_values(ascending=False).head(3)
assert_series_equal(result, expected)
result = s.nsmallest(3)
expected = s.sort_values().head(3)
assert_series_equal(result, expected)

def test_sortlevel(self):
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
s = Series([1, 2], mi)
Expand Down