From 62b5b0cb31fb710f3e7c79335b6ff8def628036e Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Sun, 17 Nov 2013 02:08:44 -0800 Subject: [PATCH 1/2] ENH nlargest and nsmallest Series methods --- doc/source/v0.13.1.txt | 1 + pandas/algos.pyx | 31 ++++++++++++++ pandas/core/series.py | 62 +++++++++++++++++++++++++++- pandas/tests/test_series.py | 37 +++++++++++++++++ pandas/tools/util.py | 80 ++++++++++++++++++++++++++++++++++++- vb_suite/series_methods.py | 29 ++++++++++++++ 6 files changed, 238 insertions(+), 2 deletions(-) create mode 100644 vb_suite/series_methods.py diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt index b48f555f9691a..557abfc48a023 100644 --- a/doc/source/v0.13.1.txt +++ b/doc/source/v0.13.1.txt @@ -128,6 +128,7 @@ API changes import pandas.core.common as com com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan])) np.array_equal(np.array([0, np.nan]), np.array([0, np.nan])) +- Add nsmallest and nlargest Series methods (:issue:`3960`) - ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a ``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is diff --git a/pandas/algos.pyx b/pandas/algos.pyx index 27e25c3954dad..71d7b41647564 100644 --- a/pandas/algos.pyx +++ b/pandas/algos.pyx @@ -752,6 +752,8 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k): if k < i: m = j return a[k] +kth_smallest_float64 = kth_smallest + cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): cdef: Py_ssize_t i,j,l,m @@ -779,6 +781,35 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n): if k < i: m = j return a[k] +def kth_smallest_int64(ndarray[int64_t] a, Py_ssize_t k): + cdef: + Py_ssize_t i,j,l,m,n + int64_t x, t + + n = len(a) + + l = 0 + m = n-1 + while (l<m): + x = a[k] + i = l + j = m + + while 1: + while a[i] < x: i += 1 + while x < a[j]: j -= 1 + if i <= j: + t = a[i] + a[i] = a[j] + a[j] = t + i += 1; j -= 1 + + if i > j: break + + if j < k: l = i + if k < i: m = j + return a[k] + def median(ndarray arr): ''' diff --git a/pandas/core/series.py b/pandas/core/series.py index 70b73c56772aa..0e697ada11119 100644 --- 
a/pandas/core/series.py +++ b/pandas/core/series.py @@ -35,7 +35,9 @@ from pandas.core.categorical import Categorical from pandas.tseries.index import DatetimeIndex from pandas.tseries.period import PeriodIndex, Period +from pandas.tseries.tools import to_datetime from pandas import compat +from pandas import algos as _algos from pandas.util.terminal import get_terminal_size from pandas.compat import zip, lzip, u, OrderedDict @@ -1740,7 +1742,17 @@ def _try_kind_sort(arr): good = -bad idx = pa.arange(len(self)) - argsorted = _try_kind_sort(arr[good]) + def _try_kind_sort(arr, kind='mergesort'): + # easier to ask forgiveness than permission + try: + # if kind==mergesort, it can fail for object dtype + return arr.argsort(kind=kind) + except TypeError: + # stable sort not available for object dtype + # uses the argsort default quicksort + return arr.argsort(kind='quicksort') + + argsorted = _try_kind_sort(arr[good], kind=kind) if not ascending: argsorted = argsorted[::-1] @@ -1758,6 +1770,54 @@ def _try_kind_sort(arr): return self._constructor(arr[sortedIdx], index=self.index[sortedIdx])\ .__finalize__(self) + def nlargest(self, n=5, take_last=False): + ''' + Returns the largest n rows: + + May be faster than .order(ascending=False).head(n). + + ''' + # TODO remove need for dropna ? + dropped = self.dropna() + + from pandas.tools.util import nlargest + + if dropped.dtype == object: + try: + dropped = dropped.astype(float) + except: + return dropped.order(ascending=False).head(n) + + inds = nlargest(dropped.values, n, take_last) + if len(inds) == 0: + # TODO remove this special case + return dropped[[]] + return dropped.iloc[inds] + + def nsmallest(self, n=5, take_last=False): + ''' + Returns the smallest n rows. + + May be faster than .order().head(n). + + ''' + # TODO remove need for dropna ? 
+ dropped = self.dropna() + + from pandas.tools.util import nsmallest + + if dropped.dtype == object: + try: + dropped = dropped.astype(float) + except: + return dropped.order().head(n) + + inds = nsmallest(dropped.values, n, take_last) + if len(inds) == 0: + # TODO remove this special case + return dropped[[]] + return dropped.iloc[inds] + def sortlevel(self, level=0, ascending=True): """ Sort Series with MultiIndex by chosen level. Data will be diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 5b088598dfcec..89504ce599602 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -3956,6 +3956,43 @@ def test_order(self): ordered = ts.order(ascending=False, na_position='first') assert_almost_equal(expected, ordered.valid().values) + def test_nsmallest_nlargest(self): + # float, int, datetime64 (use i8), timedelta64 (same), + # object that are numbers, object that are strings + + s_list = [Series([3, 2, 1, 2, 5]), + Series([3., 2., 1., 2., 5.]), + Series([3., 2, 1, 2, 5], dtype='object'), + Series([3., 2, 1, 2, '5'], dtype='object'), + Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005']))] + + for s in s_list: + if s.dtype == object: + s2 = s.astype(float) + else: + s2 = s + + assert_series_equal(s.nsmallest(2), s2.iloc[[2, 1]]) + assert_series_equal(s.nsmallest(2, take_last=True), s2.iloc[[2, 3]]) + + assert_series_equal(s.nlargest(3), s2.iloc[[4, 0, 1]]) + assert_series_equal(s.nlargest(3, take_last=True), s2.iloc[[4, 0, 3]]) + + empty = s2.iloc[0:0] + assert_series_equal(s.nsmallest(0), empty) + assert_series_equal(s.nsmallest(-1), empty) + assert_series_equal(s.nlargest(0), empty) + assert_series_equal(s.nlargest(-1), empty) + + assert_series_equal(s.nsmallest(len(s)), s2.order()) + assert_series_equal(s.nsmallest(len(s) + 1), s2.order()) + assert_series_equal(s.nlargest(len(s)), s2.iloc[[4, 0, 1, 3, 2]]) + assert_series_equal(s.nlargest(len(s) + 1), s2.iloc[[4, 0, 1, 3, 2]]) + + s = Series([3., np.nan, 
1, 2, 5]) + assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) + assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]]) + def test_rank(self): from pandas.compat.scipy import rankdata diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 6dbefc4b70930..8a8a2f89b2dd5 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -1,6 +1,9 @@ from pandas.compat import reduce from pandas.core.index import Index import numpy as np +from pandas import algos +import pandas.core.common as com + def match(needles, haystack): haystack = Index(haystack) @@ -17,7 +20,7 @@ def cartesian_product(X): -------- >>> cartesian_product([list('ABC'), [1, 2]]) [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'), - array([1, 2, 1, 2, 1, 2])] + array([1, 2, 1, 2, 1, 2])] ''' @@ -43,3 +46,78 @@ def compose(*funcs): """Compose 2 or more callables""" assert len(funcs) > 1, 'At least 2 callables must be passed to compose' return reduce(_compose2, funcs) + + +def nsmallest(arr, n=5, take_last=False): + ''' + Find the indices of the n smallest values of a numpy array. + + Note: Fails silently with NaN. 
+ + ''' + if n <= 0: + return np.array([]) # empty + elif n >= len(arr): + n = len(arr) + + if arr.dtype == object: + try: + arr = arr.astype(float) + except: + raise TypeError("An object array must convert to float.") + + if com.needs_i8_conversion(arr): + dtype = 'i8' + kth_s = algos.kth_smallest_int64 + elif arr.dtype in ['int64']: + dtype = 'int64' + kth_s = algos.kth_smallest_int64 + elif arr.dtype in ['float64']: + dtype = 'float64' + kth_s = algos.kth_smallest_float64 + else: + raise NotImplementedError("Not implemented for %s dtype, " + "perhaps convert to int64 or float64, " + "or use .order().head(n)") % arr.dtype + + if take_last: + arr = arr.view(dtype)[::-1] + else: + arr = arr.view(dtype) + + kth_val = kth_s(arr.copy(), n - 1) + + ns = np.nonzero(arr <= kth_val)[0] + inds = ns[arr[ns].argsort(kind='mergesort')][:n] + + if take_last: + # reverse indices + return len(arr) - 1 - inds + else: + return inds + + +def nlargest(arr, n=5, take_last=False): + ''' + Find the indices of the n largest values of a numpy array. + + Note: Fails silently with NaN. 
+ + ''' + if n <= 0: + return np.array([]) # empty + elif n >= len(arr): + n = len(arr) + + if arr.dtype == object: + try: + arr = arr.astype(float) + except: + raise TypeError("An object array must convert to float.") + + if com.needs_i8_conversion(arr): + arr = -arr.view('i8') + else: + arr = -arr + + return nsmallest(arr, n, take_last=take_last) diff --git a/vb_suite/series_methods.py b/vb_suite/series_methods.py new file mode 100644 index 0000000000000..1659340cfe050 --- /dev/null +++ b/vb_suite/series_methods.py @@ -0,0 +1,29 @@ +from vbench.api import Benchmark +from datetime import datetime + +common_setup = """from pandas_vb_common import * +""" + +setup = common_setup + """ +s1 = Series(np.random.randn(10000)) +s2 = Series(np.random.randint(1, 10, 10000)) +""" + +series_nlargest1 = Benchmark('s1.nlargest(3, take_last=True);' + 's1.nlargest(3, take_last=False)', + setup, + start_date=datetime(2014, 1, 25)) +series_nlargest2 = Benchmark('s2.nlargest(3, take_last=True);' + 's2.nlargest(3, take_last=False)', + setup, + start_date=datetime(2014, 1, 25)) + +series_nsmallest2 = Benchmark('s1.nsmallest(3, take_last=True);' + 's1.nsmallest(3, take_last=False)', + setup, + start_date=datetime(2014, 1, 25)) + +series_nsmallest2 = Benchmark('s2.nsmallest(3, take_last=True);' + 's2.nsmallest(3, take_last=False)', + setup, + start_date=datetime(2014, 1, 25)) From 685ac641b77b64107dea85d82bf0133ad8f88211 Mon Sep 17 00:00:00 2001 From: Andy Hayden Date: Fri, 11 Apr 2014 13:37:18 -0700 Subject: [PATCH 2/2] wip --- pandas/core/series.py | 13 +++++-------- pandas/tests/test_series.py | 22 +++++++++------------- pandas/tools/util.py | 6 ++---- 3 files changed, 16 insertions(+), 25 deletions(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index 0e697ada11119..73a5ebce01d01 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1785,7 +1785,7 @@ def nlargest(self, n=5, take_last=False): if dropped.dtype == object: try: dropped = dropped.astype(float) - 
except: + except (NotImplementedError, TypeError): return dropped.order(ascending=False).head(n) inds = nlargest(dropped.values, n, take_last) @@ -1805,14 +1805,11 @@ def nsmallest(self, n=5, take_last=False): dropped = self.dropna() from pandas.tools.util import nsmallest + try: + inds = nsmallest(dropped.values, n, take_last) + except NotImplementedError: + return dropped.order().head(n) - if dropped.dtype == object: - try: - dropped = dropped.astype(float) - except: - return dropped.order().head(n) - - inds = nsmallest(dropped.values, n, take_last) if len(inds) == 0: # TODO remove this special case return dropped[[]] diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py index 89504ce599602..27a6281510e59 100644 --- a/pandas/tests/test_series.py +++ b/pandas/tests/test_series.py @@ -3967,27 +3967,23 @@ def test_nsmallest_nlargest(self): Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005']))] for s in s_list: - if s.dtype == object: - s2 = s.astype(float) - else: - s2 = s - assert_series_equal(s.nsmallest(2), s2.iloc[[2, 1]]) - assert_series_equal(s.nsmallest(2, take_last=True), s2.iloc[[2, 3]]) + assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]]) + assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]]) - assert_series_equal(s.nlargest(3), s2.iloc[[4, 0, 1]]) - assert_series_equal(s.nlargest(3, take_last=True), s2.iloc[[4, 0, 3]]) + assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]]) + assert_series_equal(s.nlargest(3, take_last=True), s.iloc[[4, 0, 3]]) - empty = s2.iloc[0:0] + empty = s.iloc[0:0] assert_series_equal(s.nsmallest(0), empty) assert_series_equal(s.nsmallest(-1), empty) assert_series_equal(s.nlargest(0), empty) assert_series_equal(s.nlargest(-1), empty) - assert_series_equal(s.nsmallest(len(s)), s2.order()) - assert_series_equal(s.nsmallest(len(s) + 1), s2.order()) - assert_series_equal(s.nlargest(len(s)), s2.iloc[[4, 0, 1, 3, 2]]) - assert_series_equal(s.nlargest(len(s) + 1), s2.iloc[[4, 0, 1, 3, 2]]) + 
assert_series_equal(s.nsmallest(len(s)), s.order()) + assert_series_equal(s.nsmallest(len(s) + 1), s.order()) + assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]]) + assert_series_equal(s.nlargest(len(s) + 1), s.iloc[[4, 0, 1, 3, 2]]) s = Series([3., np.nan, 1, 2, 5]) assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]]) diff --git a/pandas/tools/util.py b/pandas/tools/util.py index 8a8a2f89b2dd5..60bbd68a8ac08 100644 --- a/pandas/tools/util.py +++ b/pandas/tools/util.py @@ -61,10 +61,8 @@ def nsmallest(arr, n=5, take_last=False): n = len(arr) if arr.dtype == object: - try: - arr = arr.astype(float) - except: - raise TypeError("An object array must convert to float.") + # just sort and take n + return arr.argsort(kind='mergesort')[:n] if com.needs_i8_conversion(arr): dtype = 'i8'