diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt index b48f555f9691a..557abfc48a023 100644 --- a/doc/source/v0.13.1.txt +++ b/doc/source/v0.13.1.txt @@ -128,6 +128,7 @@ API changes import pandas.core.common as com com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan])) np.array_equal(np.array([0, np.nan]), np.array([0, np.nan])) +- Add nsmallest and nlargest Series methods (:issue:`3960`) - ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a ``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 27e25c3954dad..71d7b41647564 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -752,6 +752,8 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k): if k < i: m = j return a[k] +kth_smallest_float64 = kth_smallest + cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): cdef: Py_ssize_t i,j,l,m @@ -779,6 +781,35 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): if k < i: m = j return a[k] +def kth_smallest_int64(ndarray[int64_t] a, Py_ssize_t k): + cdef: + Py_ssize_t i,j,l,m,n + int64_t x, t + + n = len(a) + + l = 0 + m = n-1 + while (l j: break + + if j < k: l = i + if k < i: m = j + return a[k] + def median(ndarray arr): ''' diff --git a/pandas/core/series.py b/pandas/core/series.py index 70b73c56772aa..73a5ebce01d01 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -35,7 +35,9 @@ from pandas.core.categorical import Categorical from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex, Period +from pandas.tseries.tools import to_datetime from pandas import compat +from pandas import algos as _algos from pandas.util.terminal import get_terminal_size from pandas.compat import zip, lzip, u, OrderedDict @@ -1740,7 +1742,17 @@ def _try_kind_sort(arr): good = -bad idx = pa.arange(len(self)) - argsorted = _try_kind_sort(arr[good]) + def _try_kind_sort(arr, 
def nlargest(self, n=5, take_last=False):
    """
    Return the n largest values of the Series, largest first.

    Missing values are dropped before selecting.  May be faster than
    ``.order(ascending=False).head(n)``.

    Parameters
    ----------
    n : int, default 5
        Number of rows to return.  ``n <= 0`` gives an empty Series.
    take_last : bool, default False
        Forwarded to ``pandas.tools.util.nlargest``.  It is ignored on the
        sort-based fallback path.

    Returns
    -------
    Series
        Object data that converts cleanly to float is returned as floats,
        because the conversion happens before selection.
    """
    from pandas.tools.util import nlargest

    # TODO remove need for dropna ?
    dropped = self.dropna()

    if dropped.dtype == object:
        try:
            dropped = dropped.astype(float)
        except (NotImplementedError, TypeError, ValueError):
            # Non-numeric strings raise ValueError from astype(float);
            # fall back to a plain sort.  max() keeps a negative n from
            # turning head() into "all but the last |n| rows".
            return dropped.order(ascending=False).head(max(n, 0))

    try:
        inds = nlargest(dropped.values, n, take_last)
    except (NotImplementedError, TypeError):
        # Same fallback nsmallest uses when the fast path cannot handle
        # the dtype.
        return dropped.order(ascending=False).head(max(n, 0))

    if len(inds) == 0:
        # TODO remove this special case
        return dropped[[]]
    return dropped.iloc[inds]


def nsmallest(self, n=5, take_last=False):
    """
    Return the n smallest values of the Series, smallest first.

    Missing values are dropped before selecting.  May be faster than
    ``.order().head(n)``.

    Parameters
    ----------
    n : int, default 5
        Number of rows to return.  ``n <= 0`` gives an empty Series.
    take_last : bool, default False
        Forwarded to ``pandas.tools.util.nsmallest``.  It is ignored on the
        sort-based fallback path.

    Returns
    -------
    Series
    """
    from pandas.tools.util import nsmallest

    # TODO remove need for dropna ?
    dropped = self.dropna()

    try:
        inds = nsmallest(dropped.values, n, take_last)
    except (NotImplementedError, TypeError):
        # The fast path rejects this dtype; sort instead.  max() keeps a
        # negative n from turning head() into "all but the last |n| rows".
        return dropped.order().head(max(n, 0))

    if len(inds) == 0:
        # TODO remove this special case
        return dropped[[]]
    return dropped.iloc[inds]
def test_nsmallest_nlargest(self):
    """Exercise Series.nsmallest / Series.nlargest over several dtypes."""
    # Every case holds the same five values (3, 2, 1, 2, 5) so the expected
    # positions are shared: int64, float64, object holding numbers, object
    # holding a numeric string, and datetime64.  No timedelta64 case is
    # included here.
    cases = [
        Series([3, 2, 1, 2, 5]),
        Series([3., 2., 1., 2., 5.]),
        Series([3., 2, 1, 2, 5], dtype='object'),
        Series([3., 2, 1, 2, '5'], dtype='object'),
        Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005'])),
    ]

    for ser in cases:
        # The value at positions 1 and 3 is duplicated; take_last decides
        # which of the two is reported.
        assert_series_equal(ser.nsmallest(2), ser.iloc[[2, 1]])
        assert_series_equal(ser.nsmallest(2, take_last=True),
                            ser.iloc[[2, 3]])

        assert_series_equal(ser.nlargest(3), ser.iloc[[4, 0, 1]])
        assert_series_equal(ser.nlargest(3, take_last=True),
                            ser.iloc[[4, 0, 3]])

        # Non-positive n yields an empty result.
        nothing = ser.iloc[0:0]
        for count in (0, -1):
            assert_series_equal(ser.nsmallest(count), nothing)
            assert_series_equal(ser.nlargest(count), nothing)

        # n at or beyond the length returns everything, fully ordered.
        descending = ser.iloc[[4, 0, 1, 3, 2]]
        for count in (len(ser), len(ser) + 1):
            assert_series_equal(ser.nsmallest(count), ser.order())
            assert_series_equal(ser.nlargest(count), descending)

    # Default n with a NaN present: the NaN row is left out of the result.
    with_nan = Series([3., np.nan, 1, 2, 5])
    assert_series_equal(with_nan.nlargest(), with_nan.iloc[[4, 0, 3, 2]])
    assert_series_equal(with_nan.nsmallest(), with_nan.iloc[[2, 3, 0, 4]])
def nsmallest(arr, n=5, take_last=False):
    """
    Find the positions of the n smallest values of a numpy array.

    Parameters
    ----------
    arr : ndarray
        int64, float64, datetime64/timedelta64 or object array.
    n : int, default 5
        Number of positions wanted; values above ``len(arr)`` are clamped.
    take_last : bool, default False
        When True the array is scanned in reverse, so among equal values
        the later positions are reported first.  Ignored for object dtype,
        which is simply stably sorted.

    Returns
    -------
    ndarray of integer positions into ``arr``, ordered by ascending value.
    Empty (integer dtype) when ``n <= 0`` or ``arr`` is empty.

    Raises
    ------
    NotImplementedError
        For dtypes other than those listed above.

    Notes
    -----
    Fails silently with NaN.
    """
    if n <= 0 or len(arr) == 0:
        # Integer dtype so the result can be used directly as an indexer;
        # np.array([]) would be float64.
        return np.array([], dtype=np.intp)
    n = min(n, len(arr))

    if arr.dtype == object:
        # just sort and take n
        return arr.argsort(kind='mergesort')[:n]

    if com.needs_i8_conversion(arr):
        dtype = 'i8'
        kth_s = algos.kth_smallest_int64
    elif arr.dtype == np.int64:
        dtype = 'int64'
        kth_s = algos.kth_smallest_int64
    elif arr.dtype == np.float64:
        dtype = 'float64'
        kth_s = algos.kth_smallest_float64
    else:
        # Format the message *before* constructing the exception; applying
        # ``%`` to the exception instance raises TypeError instead.
        raise NotImplementedError("Not implemented for %s dtype, "
                                  "perhaps convert to int64 or float64, "
                                  "or use .order().head(n)" % arr.dtype)

    values = arr.view(dtype)
    if take_last:
        values = values[::-1]

    # The helper receives a copy so the caller's data is never touched.
    kth_val = kth_s(values.copy(), n - 1)

    # Candidates are everything <= the k-th value; a stable sort of just
    # those keeps ties in scan order.
    candidates = np.nonzero(values <= kth_val)[0]
    inds = candidates[values[candidates].argsort(kind='mergesort')][:n]

    if take_last:
        # map positions in the reversed view back onto the original array
        return len(values) - 1 - inds
    return inds


def nlargest(arr, n=5, take_last=False):
    """
    Find the positions of the n largest values of a numpy array.

    Implemented by negating the values and delegating to ``nsmallest``.

    Parameters
    ----------
    arr : ndarray
        Numeric or datetime-like array; object arrays must convert to float.
    n : int, default 5
        Number of positions wanted; values above ``len(arr)`` are clamped.
    take_last : bool, default False
        Forwarded to ``nsmallest``.

    Returns
    -------
    ndarray of integer positions into ``arr``, ordered by descending value.
    Empty (integer dtype) when ``n <= 0`` or ``arr`` is empty.

    Raises
    ------
    TypeError
        If an object array cannot be converted to float.

    Notes
    -----
    Fails silently with NaN.
    """
    if n <= 0 or len(arr) == 0:
        return np.array([], dtype=np.intp)

    if arr.dtype == object:
        try:
            arr = arr.astype(float)
        except (TypeError, ValueError):
            # Only conversion failures are translated; a bare ``except``
            # would also swallow KeyboardInterrupt and friends.
            raise TypeError("An object array must convert to float.")

    if com.needs_i8_conversion(arr):
        arr = -arr.view('i8')
    else:
        arr = -arr

    return nsmallest(arr, n, take_last=take_last)
from vbench.api import Benchmark
from datetime import datetime

# Shared preamble for every benchmark in this module.
common_setup = """from pandas_vb_common import *
"""

# s1: random floats; s2: small random ints (values 1-9).
setup = common_setup + """
s1 = Series(np.random.randn(10000))
s2 = Series(np.random.randint(1, 10, 10000))
"""

series_nlargest1 = Benchmark('s1.nlargest(3, take_last=True);'
                             's1.nlargest(3, take_last=False)',
                             setup,
                             start_date=datetime(2014, 1, 25))
series_nlargest2 = Benchmark('s2.nlargest(3, take_last=True);'
                             's2.nlargest(3, take_last=False)',
                             setup,
                             start_date=datetime(2014, 1, 25))

# Named ...1 (not ...2) so the s2 benchmark below does not overwrite it.
series_nsmallest1 = Benchmark('s1.nsmallest(3, take_last=True);'
                              's1.nsmallest(3, take_last=False)',
                              setup,
                              start_date=datetime(2014, 1, 25))

series_nsmallest2 = Benchmark('s2.nsmallest(3, take_last=True);'
                              's2.nsmallest(3, take_last=False)',
                              setup,
                              start_date=datetime(2014, 1, 25))