Skip to content

Commit fcec82e

Browse files
committed
Merge pull request #7113 from cpcloud/hayd-kth_smallest
ENH: add nlargest nsmallest to Series
2 parents 1f34b47 + 6673705 commit fcec82e

File tree

8 files changed

+317
-37
lines changed

8 files changed

+317
-37
lines changed

doc/source/basics.rst

+15
Original file line numberDiff line numberDiff line change
@@ -1311,6 +1311,21 @@ Some other sorting notes / nuances:
13111311
compatibility with NumPy methods which expect the ``ndarray.sort``
13121312
behavior.
13131313

1314+
.. versionadded:: 0.14.0
1315+
1316+
``Series`` has the ``nsmallest`` and ``nlargest`` methods which return the
1317+
smallest or largest :math:`n` values. For a large ``Series`` this can be much
1318+
faster than sorting the entire Series and calling ``head(n)`` on the result.
1319+
1320+
.. ipython:: python
1321+
1322+
s = Series(np.random.permutation(10))
1323+
s
1324+
s.order()
1325+
s.nsmallest(3)
1326+
s.nlargest(3)
1327+
1328+
13141329
Sorting by a multi-index column
13151330
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
13161331

doc/source/v0.14.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -643,6 +643,7 @@ Enhancements
643643
values='Quantity', aggfunc=np.sum)
644644

645645
- str.wrap implemented (:issue:`6999`)
646+
- Add nsmallest and nlargest Series methods (:issue:`3960`)
646647

647648
- `PeriodIndex` fully supports partial string indexing like `DatetimeIndex` (:issue:`7043`)
648649

pandas/algos.pyx

+47-20
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,9 @@ from numpy cimport NPY_FLOAT16 as NPY_float16
2121
from numpy cimport NPY_FLOAT32 as NPY_float32
2222
from numpy cimport NPY_FLOAT64 as NPY_float64
2323

24+
from numpy cimport (int8_t, int16_t, int32_t, int64_t, uint8_t, uint16_t,
25+
uint32_t, uint64_t, float16_t, float32_t, float64_t)
26+
2427
int8 = np.dtype(np.int8)
2528
int16 = np.dtype(np.int16)
2629
int32 = np.dtype(np.int32)
@@ -736,16 +739,43 @@ def _check_minp(win, minp, N):
736739
# Physical description: 366 p.
737740
# Series: Prentice-Hall Series in Automatic Computation
738741

739-
def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
740-
cdef:
741-
Py_ssize_t i,j,l,m,n
742-
double_t x, t
743742

744-
n = len(a)
743+
# Fused type listing the signed-integer, unsigned-integer and floating-point
# element types accepted by the routines declared with ``numeric``.
# NOTE(review): float16_t is cimported at the top of the file but is not a
# member of this fused type -- confirm the omission is intentional.
ctypedef fused numeric:
    int8_t
    int16_t
    int32_t
    int64_t

    uint8_t
    uint16_t
    uint32_t
    uint64_t

    float32_t
    float64_t
756+
757+
758+
# Exchange, in place, the two values that ``a`` and ``b`` point to.
# Always returns 0; the ``except -1`` clause declares -1 as the error value.
cdef inline Py_ssize_t swap(numeric *a, numeric *b) except -1:
    cdef numeric t

    # cython doesn't allow pointer dereference so use array syntax
    t = a[0]
    a[0] = b[0]
    b[0] = t
    return 0
766+
767+
768+
@cython.boundscheck(False)
769+
@cython.wraparound(False)
770+
cpdef numeric kth_smallest(numeric[:] a, Py_ssize_t k):
771+
cdef:
772+
Py_ssize_t i, j, l, m, n = a.size
773+
numeric x
745774

746775
l = 0
747-
m = n-1
748-
while (l<m):
776+
m = n - 1
777+
778+
while l < m:
749779
x = a[k]
750780
i = l
751781
j = m
@@ -754,9 +784,7 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
754784
while a[i] < x: i += 1
755785
while x < a[j]: j -= 1
756786
if i <= j:
757-
t = a[i]
758-
a[i] = a[j]
759-
a[j] = t
787+
swap(&a[i], &a[j])
760788
i += 1; j -= 1
761789

762790
if i > j: break
@@ -765,6 +793,7 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
765793
if k < i: m = j
766794
return a[k]
767795

796+
768797
cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
769798
cdef:
770799
Py_ssize_t i,j,l,m
@@ -781,9 +810,7 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
781810
while a[i] < x: i += 1
782811
while x < a[j]: j -= 1
783812
if i <= j:
784-
t = a[i]
785-
a[i] = a[j]
786-
a[j] = t
813+
swap(&a[i], &a[j])
787814
i += 1; j -= 1
788815

789816
if i > j: break
@@ -793,22 +820,22 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
793820
return a[k]
794821

795822

796-
def median(ndarray arr):
823+
cpdef numeric median(numeric[:] arr):
797824
'''
798825
A faster median
799826
'''
800-
cdef int n = len(arr)
827+
cdef Py_ssize_t n = arr.size
801828

802-
if len(arr) == 0:
829+
if n == 0:
803830
return np.NaN
804831

805832
arr = arr.copy()
806833

807834
if n % 2:
808-
return kth_smallest(arr, n / 2)
835+
return kth_smallest(arr, n // 2)
809836
else:
810-
return (kth_smallest(arr, n / 2) +
811-
kth_smallest(arr, n / 2 - 1)) / 2
837+
return (kth_smallest(arr, n // 2) +
838+
kth_smallest(arr, n // 2 - 1)) / 2
812839

813840

814841
# -------------- Min, Max subsequence
@@ -2226,7 +2253,7 @@ cdef inline float64_t _median_linear(float64_t* a, int n):
22262253

22272254

22282255
if n % 2:
2229-
result = kth_smallest_c(a, n / 2, n)
2256+
result = kth_smallest_c( a, n / 2, n)
22302257
else:
22312258
result = (kth_smallest_c(a, n / 2, n) +
22322259
kth_smallest_c(a, n / 2 - 1, n)) / 2

pandas/core/algorithms.py

+85-3
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,7 @@
99
import pandas.core.common as com
1010
import pandas.algos as algos
1111
import pandas.hashtable as htable
12-
import pandas.compat as compat
13-
from pandas.compat import filter, string_types
14-
from pandas.util.decorators import deprecate_kwarg
12+
from pandas.compat import string_types
1513

1614
def match(to_match, values, na_sentinel=-1):
1715
"""
@@ -413,6 +411,90 @@ def group_position(*args):
413411
return result
414412

415413

414+
# Dtype names that nsmallest/nlargest reinterpret (via ``ndarray.view``) as
# int64 before selecting; any other dtype name is used unchanged.
_dtype_map = {'datetime64[ns]': 'int64', 'timedelta64[ns]': 'int64'}
415+
416+
417+
def _finalize_nsmallest(arr, kth_val, n, take_last, narr):
    """Turn a k-th smallest value into the positions of the n smallest items.

    Parameters
    ----------
    arr : ndarray
        One-dimensional values being selected from.
    kth_val : scalar
        Threshold; only positions whose value is ``<= kth_val`` are kept.
    n : int
        Maximum number of positions to return.
    take_last : bool
        If True, positions are mapped back as ``narr - 1 - position``
        (the caller passed a reversed array).
    narr : int
        Length used for that mapping.

    Returns
    -------
    ndarray of positions, ordered by ascending value (ties keep their
    original relative order because the sort is stable).
    """
    (candidates,) = np.nonzero(arr <= kth_val)

    # mergesort is stable, so equal values stay in positional order
    stable_rank = arr[candidates].argsort(kind='mergesort')
    chosen = candidates[stable_rank][:n]

    if not take_last:
        return chosen

    # reverse indices
    return narr - 1 - chosen
425+
426+
427+
def nsmallest(arr, n, take_last=False):
    '''
    Find the indices of the n smallest values of a numpy array.

    Parameters
    ----------
    arr : ndarray
        One-dimensional array to select from.
    n : int
        Number of indices wanted; capped at ``len(arr)``.
    take_last : bool, default False
        If True the array is scanned in reverse, so among duplicates the
        last occurrence is preferred.

    Returns
    -------
    ndarray of integer positions into the original ``arr``. Empty when
    ``arr`` is empty or ``n <= 0``.

    Note: Fails silently with NaN.

    '''
    if take_last:
        arr = arr[::-1]

    narr = len(arr)
    n = min(n, narr)

    if n <= 0:
        # Nothing to select. Without this guard kth_smallest would be asked
        # for element -1, and it runs with bounds checking and negative-index
        # wraparound disabled, so that is an out-of-bounds read.
        return np.empty(0, dtype=np.intp)

    # datetime64/timedelta64 are compared through their int64 representation
    sdtype = str(arr.dtype)
    arr = arr.view(_dtype_map.get(sdtype, sdtype))

    # kth_smallest partially reorders its input, hence the copy
    kth_val = algos.kth_smallest(arr.copy(), n - 1)
    return _finalize_nsmallest(arr, kth_val, n, take_last, narr)
445+
446+
447+
def nlargest(arr, n, take_last=False):
    """
    Find the indices of the n largest values of a numpy array.

    Parameters
    ----------
    arr : ndarray
        One-dimensional array to select from.
    n : int
        Number of indices wanted.
    take_last : bool, default False
        Forwarded to ``nsmallest``.

    Returns
    -------
    Whatever ``nsmallest`` returns for the order-reversed keys, i.e. the
    positions of the n largest values.

    Note: Fails silently with NaN.
    """
    sdtype = str(arr.dtype)
    arr = arr.view(_dtype_map.get(sdtype, sdtype))

    if arr.dtype.kind in 'iu':
        # Bitwise NOT reverses integer ordering exactly (~x == -x - 1 for
        # signed, max - x for unsigned). Plain negation wraps for unsigned
        # dtypes (0 stays 0 and would be reported as the largest) and
        # overflows at the signed minimum.
        reversed_keys = ~arr
    else:
        reversed_keys = -arr
    return nsmallest(reversed_keys, n, take_last=take_last)
456+
457+
458+
def select_n_slow(dropped, n, take_last, method):
    """Full-sort fallback: order ``dropped`` and keep the first ``n`` rows.

    The input is walked back-to-front when ``take_last`` is truthy or when
    ``method`` is ``'nlargest'``; the sort is ascending only for
    ``'nsmallest'``.
    """
    # a step of None is the plain ``[:]`` slice, -1 reverses
    step = -1 if (take_last or method == 'nlargest') else None
    candidates = dropped[::step]

    smallest_first = method == 'nsmallest'
    ordered = candidates.order(ascending=smallest_first)
    return ordered.head(n)
463+
464+
465+
# Dispatch table: select_n looks up the selection routine by method name.
_select_methods = {'nsmallest': nsmallest, 'nlargest': nlargest}
466+
467+
468+
def select_n(series, n, take_last, method):
    """Implement n largest/smallest.

    Parameters
    ----------
    series : Series
        Values to select from; missing values are dropped first.
    n : int
        Number of rows wanted. ``n <= 0`` gives an empty selection.
    take_last : bool
        Passed through to the selection routine.
    method : str, {'nlargest', 'nsmallest'}

    Returns
    -------
    nordered : Series

    Raises
    ------
    TypeError
        If the dtype is not integer, floating, datetime64 or timedelta64.
    """
    supported = (np.integer, np.floating, np.datetime64, np.timedelta64)
    dtype = series.dtype
    if not issubclass(dtype.type, supported):
        raise TypeError("Cannot use method %r with dtype %s" % (method, dtype))

    if n <= 0:
        return series[[]]

    dropped = series.dropna()

    if n < len(series):
        # partial selection: positions come from the dispatch table routine
        positions = _select_methods[method](dropped.values, n, take_last)
        return dropped.iloc[positions]

    # every row was requested, so a full sort is used instead
    return select_n_slow(dropped, n, take_last, method)
496+
497+
416498
_rank1d_functions = {
417499
'float64': algos.rank_1d_float64,
418500
'int64': algos.rank_1d_int64,

pandas/core/series.py

+74-13
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,6 @@
66
# pylint: disable=E1101,E1103
77
# pylint: disable=W0703,W0622,W0613,W0201
88

9-
import operator
109
import types
1110
import warnings
1211

@@ -15,38 +14,34 @@
1514
import numpy.ma as ma
1615

1716
from pandas.core.common import (isnull, notnull, _is_bool_indexer,
18-
_default_index, _maybe_promote, _maybe_upcast,
19-
_asarray_tuplesafe, is_integer_dtype,
20-
_NS_DTYPE, _TD_DTYPE,
21-
_infer_dtype_from_scalar, is_list_like,
22-
_values_from_object,
17+
_default_index, _maybe_upcast,
18+
_asarray_tuplesafe, _infer_dtype_from_scalar,
19+
is_list_like, _values_from_object,
2320
_possibly_cast_to_datetime, _possibly_castable,
24-
_possibly_convert_platform,
25-
_try_sort,
21+
_possibly_convert_platform, _try_sort,
2622
ABCSparseArray, _maybe_match_name,
2723
_ensure_object, SettingWithCopyError)
2824
from pandas.core.index import (Index, MultiIndex, InvalidIndexError,
2925
_ensure_index)
30-
from pandas.core.indexing import (
31-
_check_bool_indexer,
32-
_is_index_slice, _maybe_convert_indices)
26+
from pandas.core.indexing import _check_bool_indexer, _maybe_convert_indices
3327
from pandas.core import generic, base
3428
from pandas.core.internals import SingleBlockManager
3529
from pandas.core.categorical import Categorical
3630
from pandas.tseries.index import DatetimeIndex
3731
from pandas.tseries.period import PeriodIndex, Period
3832
from pandas import compat
3933
from pandas.util.terminal import get_terminal_size
40-
from pandas.compat import zip, lzip, u, OrderedDict
34+
from pandas.compat import zip, u, OrderedDict
4135

4236
import pandas.core.array as pa
4337
import pandas.core.ops as ops
38+
from pandas.core.algorithms import select_n
4439

4540
import pandas.core.common as com
4641
import pandas.core.datetools as datetools
4742
import pandas.core.format as fmt
4843
import pandas.core.nanops as nanops
49-
from pandas.util.decorators import Appender, Substitution, cache_readonly
44+
from pandas.util.decorators import Appender, cache_readonly
5045

5146
import pandas.lib as lib
5247
import pandas.tslib as tslib
@@ -1728,6 +1723,72 @@ def _try_kind_sort(arr):
17281723
else:
17291724
return result.__finalize__(self)
17301725

1726+
def nlargest(self, n=5, take_last=False):
    """Return the largest `n` elements.

    Parameters
    ----------
    n : int, default 5
        Return this many descending sorted values
    take_last : bool, default False
        Where there are duplicate values, take the last duplicate

    Returns
    -------
    top_n : Series
        The n largest values in the Series, in sorted order

    Notes
    -----
    Faster than ``.order(ascending=False).head(n)`` for small `n` relative
    to the size of the ``Series`` object.

    See Also
    --------
    Series.nsmallest

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series(np.random.randn(10 ** 6))
    >>> s.nlargest(10)  # only sorts up to the N requested
    """
    # all of the work is delegated to pandas.core.algorithms.select_n
    return select_n(self, n=n, take_last=take_last, method='nlargest')
1758+
1759+
def nsmallest(self, n=5, take_last=False):
    """Return the smallest `n` elements.

    Parameters
    ----------
    n : int, default 5
        Return this many ascending sorted values
    take_last : bool, default False
        Where there are duplicate values, take the last duplicate

    Returns
    -------
    bottom_n : Series
        The n smallest values in the Series, in sorted order

    Notes
    -----
    Faster than ``.order().head(n)`` for small `n` relative to
    the size of the ``Series`` object.

    See Also
    --------
    Series.nlargest

    Examples
    --------
    >>> import pandas as pd
    >>> import numpy as np
    >>> s = pd.Series(np.random.randn(10 ** 6))
    >>> s.nsmallest(10)  # only sorts up to the N requested
    """
    # all of the work is delegated to pandas.core.algorithms.select_n
    return select_n(self, n=n, take_last=take_last, method='nsmallest')
1791+
17311792
def sortlevel(self, level=0, ascending=True, sort_remaining=True):
17321793
"""
17331794
Sort Series with MultiIndex by chosen level. Data will be

0 commit comments

Comments
 (0)