Skip to content

Commit 6e514da

Browse files
mroeschke authored and jreback committed
BUG: _nsorted incorrect with duplicated values in index
closes #13412 closes #14707
1 parent 4378f82 commit 6e514da

File tree

7 files changed

+82
-15
lines changed

7 files changed

+82
-15
lines changed

asv_bench/benchmarks/frame_methods.py

+11
Original file line numberDiff line numberDiff line change
@@ -1012,3 +1012,14 @@ def setup(self):
10121012

10131013
def time_frame_quantile_axis1(self):
10141014
self.df.quantile([0.1, 0.5], axis=1)
1015+
1016+
1017+
class frame_nlargest(object):
    """Benchmark ``DataFrame.nlargest`` on a 1000 x 3 frame of random floats."""

    goal_time = 0.2

    def setup(self):
        # 1000 rows of standard-normal draws under the column labels A, B, C.
        values = np.random.randn(1000, 3)
        self.df = DataFrame(values, columns=['A', 'B', 'C'])

    def time_frame_nlargest(self):
        # Timed statement: the 100 rows with the largest values in column A.
        self.df.nlargest(100, 'A')

doc/source/whatsnew/v0.19.2.txt

+1
Original file line numberDiff line numberDiff line change
@@ -61,6 +61,7 @@ Bug Fixes
6161
- Bug in ``HDFStore`` when writing a ``MultiIndex`` when using ``data_columns=True`` (:issue:`14435`)
6262
- Bug in ``HDFStore.append()`` when writing a ``Series`` and passing a ``min_itemsize`` argument containing a value for the ``index`` (:issue:`11412`)
6363
- Bug in ``Series.groupby.nunique()`` raising an ``IndexError`` for an empty ``Series`` (:issue:`12553`)
64+
- Bug in ``DataFrame.nlargest`` and ``DataFrame.nsmallest`` when the index had duplicate values (:issue:`13412`)
6465

6566

6667

pandas/core/algorithms.py

+28-2
Original file line numberDiff line numberDiff line change
@@ -684,11 +684,12 @@ def select_n_slow(dropped, n, keep, method):
684684
_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}
685685

686686

687-
def select_n(series, n, keep, method):
688-
"""Implement n largest/smallest.
687+
def select_n_series(series, n, keep, method):
688+
"""Implement n largest/smallest for pandas Series
689689
690690
Parameters
691691
----------
692+
series : pandas.Series object
692693
n : int
693694
keep : {'first', 'last'}, default 'first'
694695
method : str, {'nlargest', 'nsmallest'}
@@ -717,6 +718,31 @@ def select_n(series, n, keep, method):
717718
return dropped.iloc[inds]
718719

719720

721+
def select_n_frame(frame, columns, n, method, keep):
    """Implement n largest/smallest for pandas DataFrame

    Rows are chosen by applying the Series method named by `method` to
    the first entry of `columns`. The chosen rows are returned with every
    column of `frame`, in the order that Series method yields them.

    Parameters
    ----------
    frame : pandas.DataFrame object
    columns : list or str
        Only the first column takes part in the selection.
    n : int
    method : str, {'nlargest', 'nsmallest'}
    keep : {'first', 'last'}

    Returns
    -------
    nordered : DataFrame
    """
    if not is_list_like(columns):
        columns = [columns]
    columns = list(columns)
    # Rank on a fresh 0..len-1 index rather than the frame's own labels.
    # Fetching the winners by position yields exactly one row per selected
    # entry, so neither duplicated index labels nor repeated column values
    # can multiply rows (a label- or value-based join would).
    ranked = frame[columns[0]].reset_index(drop=True)
    chosen = getattr(ranked, method)(n, keep=keep)
    return frame.iloc[chosen.index.values]
744+
745+
720746
def _finalize_nsmallest(arr, kth_val, n, keep, narr):
721747
ns, = np.nonzero(arr <= kth_val)
722748
inds = ns[arr[ns].argsort(kind='mergesort')][:n]

pandas/core/frame.py

+2-11
Original file line numberDiff line numberDiff line change
@@ -3337,15 +3337,6 @@ def sortlevel(self, level=0, axis=0, ascending=True, inplace=False,
33373337
return self.sort_index(level=level, axis=axis, ascending=ascending,
33383338
inplace=inplace, sort_remaining=sort_remaining)
33393339

3340-
def _nsorted(self, columns, n, method, keep):
3341-
if not is_list_like(columns):
3342-
columns = [columns]
3343-
columns = list(columns)
3344-
ser = getattr(self[columns[0]], method)(n, keep=keep)
3345-
ascending = dict(nlargest=False, nsmallest=True)[method]
3346-
return self.loc[ser.index].sort_values(columns, ascending=ascending,
3347-
kind='mergesort')
3348-
33493340
def nlargest(self, n, columns, keep='first'):
33503341
"""Get the rows of a DataFrame sorted by the `n` largest
33513342
values of `columns`.
@@ -3378,7 +3369,7 @@ def nlargest(self, n, columns, keep='first'):
33783369
1 10 b 2
33793370
2 8 d NaN
33803371
"""
3381-
return self._nsorted(columns, n, 'nlargest', keep)
3372+
return algos.select_n_frame(self, columns, n, 'nlargest', keep)
33823373

33833374
def nsmallest(self, n, columns, keep='first'):
33843375
"""Get the rows of a DataFrame sorted by the `n` smallest
@@ -3412,7 +3403,7 @@ def nsmallest(self, n, columns, keep='first'):
34123403
0 1 a 1
34133404
2 8 d NaN
34143405
"""
3415-
return self._nsorted(columns, n, 'nsmallest', keep)
3406+
return algos.select_n_frame(self, columns, n, 'nsmallest', keep)
34163407

34173408
def swaplevel(self, i=-2, j=-1, axis=0):
34183409
"""

pandas/core/series.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -1940,7 +1940,7 @@ def nlargest(self, n=5, keep='first'):
19401940
>>> s = pd.Series(np.random.randn(1e6))
19411941
>>> s.nlargest(10) # only sorts up to the N requested
19421942
"""
1943-
return algos.select_n(self, n=n, keep=keep, method='nlargest')
1943+
return algos.select_n_series(self, n=n, keep=keep, method='nlargest')
19441944

19451945
@deprecate_kwarg('take_last', 'keep', mapping={True: 'last',
19461946
False: 'first'})
@@ -1978,7 +1978,7 @@ def nsmallest(self, n=5, keep='first'):
19781978
>>> s = pd.Series(np.random.randn(1e6))
19791979
>>> s.nsmallest(10) # only sorts up to the N requested
19801980
"""
1981-
return algos.select_n(self, n=n, keep=keep, method='nsmallest')
1981+
return algos.select_n_series(self, n=n, keep=keep, method='nsmallest')
19821982

19831983
def sortlevel(self, level=0, ascending=True, sort_remaining=True):
19841984
"""

pandas/tests/frame/test_analytics.py

+29
Original file line numberDiff line numberDiff line change
@@ -1323,6 +1323,35 @@ def test_nsmallest_multiple_columns(self):
13231323
expected = df.sort_values(['a', 'c']).head(5)
13241324
tm.assert_frame_equal(result, expected)
13251325

1326+
def test_nsmallest_nlargest_duplicate_index(self):
1327+
# GH 13412
1328+
df = pd.DataFrame({'a': [1, 2, 3, 4],
1329+
'b': [4, 3, 2, 1],
1330+
'c': [0, 1, 2, 3]},
1331+
index=[0, 0, 1, 1])
1332+
result = df.nsmallest(4, 'a')
1333+
expected = df.sort_values('a').head(4)
1334+
tm.assert_frame_equal(result, expected)
1335+
1336+
result = df.nlargest(4, 'a')
1337+
expected = df.sort_values('a', ascending=False).head(4)
1338+
tm.assert_frame_equal(result, expected)
1339+
1340+
result = df.nsmallest(4, ['a', 'c'])
1341+
expected = df.sort_values(['a', 'c']).head(4)
1342+
tm.assert_frame_equal(result, expected)
1343+
1344+
result = df.nsmallest(4, ['c', 'a'])
1345+
expected = df.sort_values(['c', 'a']).head(4)
1346+
tm.assert_frame_equal(result, expected)
1347+
1348+
result = df.nlargest(4, ['a', 'c'])
1349+
expected = df.sort_values(['a', 'c'], ascending=False).head(4)
1350+
tm.assert_frame_equal(result, expected)
1351+
1352+
result = df.nlargest(4, ['c', 'a'])
1353+
expected = df.sort_values(['c', 'a'], ascending=False).head(4)
1354+
tm.assert_frame_equal(result, expected)
13261355
# ----------------------------------------------------------------------
13271356
# Isin
13281357

pandas/tests/series/test_analytics.py

+9
Original file line numberDiff line numberDiff line change
@@ -1532,6 +1532,15 @@ def test_nsmallest_nlargest(self):
15321532
with tm.assertRaisesRegexp(ValueError, msg):
15331533
s.nlargest(keep='invalid')
15341534

1535+
# GH 13412
1536+
s = Series([1, 4, 3, 2], index=[0, 0, 1, 1])
1537+
result = s.nlargest(3)
1538+
expected = s.sort_values(ascending=False).head(3)
1539+
assert_series_equal(result, expected)
1540+
result = s.nsmallest(3)
1541+
expected = s.sort_values().head(3)
1542+
assert_series_equal(result, expected)
1543+
15351544
def test_sortlevel(self):
15361545
mi = MultiIndex.from_tuples([[1, 1, 3], [1, 1, 1]], names=list('ABC'))
15371546
s = Series([1, 2], mi)

0 commit comments

Comments
 (0)