Skip to content

Add nlargest/nsmallest for DataFrame #10393

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Aug 4, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -904,6 +904,8 @@ Reshaping, sorting, transposing
DataFrame.sort
DataFrame.sort_index
DataFrame.sortlevel
DataFrame.nlargest
DataFrame.nsmallest
DataFrame.swaplevel
DataFrame.stack
DataFrame.unstack
Expand Down
14 changes: 14 additions & 0 deletions doc/source/basics.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1497,6 +1497,20 @@ faster than sorting the entire Series and calling ``head(n)`` on the result.
s.nsmallest(3)
s.nlargest(3)

.. versionadded:: 0.17.0

``DataFrame`` also has the ``nlargest`` and ``nsmallest`` methods, which take the column name (or list of column names) to order by as their second argument.

.. ipython:: python

df = DataFrame({'a': [-2, -1, 1, 10, 8, 11, -1],
'b': list('abdceff'),
'c': [1.0, 2.0, 4.0, 3.2, np.nan, 3.0, 4.0]})
df.nlargest(3, 'a')
df.nlargest(5, ['a', 'c'])
df.nsmallest(3, 'a')
df.nsmallest(5, ['a', 'c'])


.. _basics.multi-index_sorting:

Expand Down
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.17.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ Check the :ref:`API Changes <whatsnew_0170.api>` and :ref:`deprecations <whatsne
New features
~~~~~~~~~~~~

- ``DataFrame`` has gained the ``nlargest`` and ``nsmallest`` methods (:issue:`10393`)
- SQL io functions now accept a SQLAlchemy connectable. (:issue:`7877`)
- Enable writing complex values to HDF stores when using table format (:issue:`10447`)
- Enable reading gzip compressed files via URL, either by explicitly setting the compression parameter or by inferring from the presence of the HTTP Content-Encoding header in the response (:issue:`8685`)
Expand Down
73 changes: 73 additions & 0 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3127,6 +3127,79 @@ def sortlevel(self, level=0, axis=0, ascending=True,
else:
return self._constructor(new_data).__finalize__(self)

def _nsorted(self, columns, n, method, take_last):
    """Shared implementation behind ``nlargest`` and ``nsmallest``.

    Parameters
    ----------
    columns : list or str
        Column name or names to order by.  Only the first one is used
        to pick the `n` rows; all of them are used to order the result.
    n : int
        Number of rows to pick, forwarded to the Series method.
    method : str
        Name of the Series method used to pick the rows, either
        ``'nlargest'`` or ``'nsmallest'``.
    take_last : bool
        Forwarded to the Series method as its ``take_last`` keyword.

    Returns
    -------
    The picked rows, ordered by `columns` (descending for
    ``'nlargest'``, ascending for ``'nsmallest'``).
    """
    # Normalise a single column name into a one-element list.
    keys = list(columns) if com.is_list_like(columns) else [columns]

    # Row selection is driven by the first column alone.
    pick = getattr(self[keys[0]], method)
    picked = pick(n, take_last=take_last)

    ascending = {'nlargest': False, 'nsmallest': True}[method]

    # mergesort is a stable sort.
    subset = self.loc[picked.index]
    return subset.sort(keys, ascending=ascending, kind='mergesort')

def nlargest(self, n, columns, take_last=False):
    """Get the rows of a DataFrame sorted by the `n` largest
    values of `columns`.

    .. versionadded:: 0.17.0

    Parameters
    ----------
    n : int
        Number of items to retrieve
    columns : list or str
        Column name or names to order by
    take_last : bool, optional
        Where there are duplicate values, take the last duplicate

    Returns
    -------
    DataFrame

    Notes
    -----
    The `n` rows are chosen using the first entry of `columns` only;
    any remaining entries are used just to order the rows that were
    chosen.

    Examples
    --------
    >>> df = DataFrame({'a': [1, 10, 8, 11, -1],
    ...                 'b': list('abdce'),
    ...                 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
    >>> df.nlargest(3, 'a')
        a  b   c
    3  11  c   3
    1  10  b   2
    2   8  d NaN
    """
    # Selection and ordering both live in the shared ``_nsorted`` helper.
    return self._nsorted(columns, n, 'nlargest', take_last)

def nsmallest(self, n, columns, take_last=False):
    """Return the `n` rows holding the smallest values of `columns`,
    ordered from smallest to largest.

    .. versionadded:: 0.17.0

    Parameters
    ----------
    n : int
        How many rows to retrieve
    columns : list or str
        Name, or list of names, of the column(s) to order by
    take_last : bool, optional
        When values are duplicated, keep the last duplicate

    Returns
    -------
    DataFrame

    Examples
    --------
    >>> df = DataFrame({'a': [1, 10, 8, 11, -1],
    ...                 'b': list('abdce'),
    ...                 'c': [1.0, 2.0, np.nan, 3.0, 4.0]})
    >>> df.nsmallest(3, 'a')
       a  b   c
    4 -1  e   4
    0  1  a   1
    2  8  d NaN
    """
    # Selection and ordering both live in the shared ``_nsorted`` helper.
    return self._nsorted(columns, n, 'nsmallest', take_last)

def swaplevel(self, i, j, axis=0):
"""
Swap levels i and j in a MultiIndex on a particular axis
Expand Down
35 changes: 35 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -14609,6 +14609,41 @@ def test_dataframe_metadata(self):
self.assertEqual(df._metadata, unpickled._metadata)
self.assertEqual(df.testattr, unpickled.testattr)

def test_nlargest(self):
    # GH10393
    # nlargest on one column must match a full descending sort + head.
    from string import ascii_lowercase
    data = {'a': np.random.permutation(10),
            'b': list(ascii_lowercase[:10])}
    frame = pd.DataFrame(data)
    top_five = frame.nlargest(5, 'a')
    fully_sorted = frame.sort('a', ascending=False)
    tm.assert_frame_equal(top_five, fully_sorted.head(5))

def test_nlargest_multiple_columns(self):
    # nlargest with a list of columns must match a full descending
    # sort on the same columns followed by head.
    from string import ascii_lowercase
    data = {'a': np.random.permutation(10),
            'b': list(ascii_lowercase[:10]),
            'c': np.random.permutation(10).astype('float64')}
    frame = pd.DataFrame(data)
    order_by = ['a', 'b']
    top_five = frame.nlargest(5, order_by)
    fully_sorted = frame.sort(['a', 'b'], ascending=False)
    tm.assert_frame_equal(top_five, fully_sorted.head(5))

def test_nsmallest(self):
    # nsmallest on one column must match a full ascending sort + head.
    from string import ascii_lowercase
    data = {'a': np.random.permutation(10),
            'b': list(ascii_lowercase[:10])}
    frame = pd.DataFrame(data)
    bottom_five = frame.nsmallest(5, 'a')
    fully_sorted = frame.sort('a')
    tm.assert_frame_equal(bottom_five, fully_sorted.head(5))

def test_nsmallest_multiple_columns(self):
    # nsmallest with a list of columns must match a full ascending
    # sort on the same columns followed by head.
    from string import ascii_lowercase
    data = {'a': np.random.permutation(10),
            'b': list(ascii_lowercase[:10]),
            'c': np.random.permutation(10).astype('float64')}
    frame = pd.DataFrame(data)
    order_by = ['a', 'c']
    bottom_five = frame.nsmallest(5, order_by)
    fully_sorted = frame.sort(['a', 'c'])
    tm.assert_frame_equal(bottom_five, fully_sorted.head(5))

def test_to_panel_expanddim(self):
# GH 9762

Expand Down