From 62b5b0cb31fb710f3e7c79335b6ff8def628036e Mon Sep 17 00:00:00 2001
From: Andy Hayden <andyhayden1@gmail.com>
Date: Sun, 17 Nov 2013 02:08:44 -0800
Subject: [PATCH 1/2] ENH nlargest and nsmallest Series methods

---
 doc/source/v0.13.1.txt      |  1 +
 pandas/algos.pyx            | 31 ++++++++++++++
 pandas/core/series.py       | 62 +++++++++++++++++++++++++++-
 pandas/tests/test_series.py | 37 +++++++++++++++++
 pandas/tools/util.py        | 80 ++++++++++++++++++++++++++++++++++++-
 vb_suite/series_methods.py  | 29 ++++++++++++++
 6 files changed, 238 insertions(+), 2 deletions(-)
 create mode 100644 vb_suite/series_methods.py

diff --git a/doc/source/v0.13.1.txt b/doc/source/v0.13.1.txt
index b48f555f9691a..557abfc48a023 100644
--- a/doc/source/v0.13.1.txt
+++ b/doc/source/v0.13.1.txt
@@ -128,6 +128,7 @@ API changes
       import pandas.core.common as com
       com.array_equivalent(np.array([0, np.nan]), np.array([0, np.nan]))
       np.array_equal(np.array([0, np.nan]), np.array([0, np.nan]))
+- Add ``nsmallest`` and ``nlargest`` Series methods (:issue:`3960`)
 
 - ``DataFrame.apply`` will use the ``reduce`` argument to determine whether a
   ``Series`` or a ``DataFrame`` should be returned when the ``DataFrame`` is
diff --git a/pandas/algos.pyx b/pandas/algos.pyx
index 27e25c3954dad..71d7b41647564 100644
--- a/pandas/algos.pyx
+++ b/pandas/algos.pyx
@@ -752,6 +752,8 @@ def kth_smallest(ndarray[double_t] a, Py_ssize_t k):
         if k < i: m = j
     return a[k]
 
+kth_smallest_float64 = kth_smallest
+
 cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
     cdef:
         Py_ssize_t i,j,l,m
@@ -779,6 +781,35 @@ cdef inline kth_smallest_c(float64_t* a, Py_ssize_t k, Py_ssize_t n):
         if k < i: m = j
     return a[k]
 
+def kth_smallest_int64(ndarray[int64_t] a, Py_ssize_t k):
+    cdef:
+        Py_ssize_t i,j,l,m,n
+        int64_t x, t
+
+    n = len(a)
+
+    l = 0
+    m = n-1
+    while (l<m):
+        x = a[k]
+        i = l
+        j = m
+
+        while 1:
+            while a[i] < x: i += 1
+            while x < a[j]: j -= 1
+            if i <= j:
+                t = a[i]
+                a[i] = a[j]
+                a[j] = t
+                i += 1; j -= 1
+
+            if i > j: break
+
+        if j < k: l = i
+        if k < i: m = j
+    return a[k]
+
 
 def median(ndarray arr):
     '''
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 70b73c56772aa..0e697ada11119 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -35,7 +35,9 @@
 from pandas.core.categorical import Categorical
 from pandas.tseries.index import DatetimeIndex
 from pandas.tseries.period import PeriodIndex, Period
+from pandas.tseries.tools import to_datetime
 from pandas import compat
+from pandas import algos as _algos
 from pandas.util.terminal import get_terminal_size
 from pandas.compat import zip, lzip, u, OrderedDict
 
@@ -1740,7 +1742,17 @@ def _try_kind_sort(arr):
         good = -bad
         idx = pa.arange(len(self))
 
-        argsorted = _try_kind_sort(arr[good])
+        def _try_kind_sort(arr, kind='mergesort'):
+            # easier to ask forgiveness than permission
+            try:
+                # if kind==mergesort, it can fail for object dtype
+                return arr.argsort(kind=kind)
+            except TypeError:
+                # stable sort not available for object dtype
+                # uses the argsort default quicksort
+                return arr.argsort(kind='quicksort')
+
+        argsorted = _try_kind_sort(arr[good], kind=kind)
 
         if not ascending:
             argsorted = argsorted[::-1]
@@ -1758,6 +1770,54 @@ def _try_kind_sort(arr):
         return self._constructor(arr[sortedIdx], index=self.index[sortedIdx])\
                    .__finalize__(self)
 
+    def nlargest(self, n=5, take_last=False):
+        '''
+        Returns the largest n rows:
+
+        May be faster than .order(ascending=False).head(n).
+
+        '''
+        # TODO remove need for dropna ?
+        dropped = self.dropna()
+
+        from pandas.tools.util import nlargest
+
+        if dropped.dtype == object:
+            try:
+                dropped = dropped.astype(float)
+            except:
+                return dropped.order(ascending=False).head(n)
+
+        inds = nlargest(dropped.values, n, take_last)
+        if len(inds) == 0:
+            # TODO remove this special case
+            return dropped[[]]
+        return dropped.iloc[inds]
+
+    def nsmallest(self, n=5, take_last=False):
+        '''
+        Returns the smallest n rows.
+
+        May be faster than .order().head(n).
+
+        '''
+        # TODO remove need for dropna ?
+        dropped = self.dropna()
+
+        from pandas.tools.util import nsmallest
+
+        if dropped.dtype == object:
+            try:
+                dropped = dropped.astype(float)
+            except:
+                return dropped.order().head(n)
+
+        inds = nsmallest(dropped.values, n, take_last)
+        if len(inds) == 0:
+            # TODO remove this special case
+            return dropped[[]]
+        return dropped.iloc[inds]
+
     def sortlevel(self, level=0, ascending=True):
         """
         Sort Series with MultiIndex by chosen level. Data will be
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index 5b088598dfcec..89504ce599602 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -3956,6 +3956,43 @@ def test_order(self):
         ordered = ts.order(ascending=False, na_position='first')
         assert_almost_equal(expected, ordered.valid().values)
 
+    def test_nsmallest_nlargest(self):
+        # float, int, datetime64 (use i8), timedelta64 (same),
+        # object that are numbers, object that are strings
+
+        s_list = [Series([3, 2, 1, 2, 5]),
+                  Series([3., 2., 1., 2., 5.]),
+                  Series([3., 2, 1, 2, 5], dtype='object'),
+                  Series([3., 2, 1, 2, '5'], dtype='object'),
+                  Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005']))]
+
+        for s in s_list:
+            if s.dtype == object:
+                s2 = s.astype(float)
+            else:
+                s2 = s
+
+            assert_series_equal(s.nsmallest(2), s2.iloc[[2, 1]])
+            assert_series_equal(s.nsmallest(2, take_last=True), s2.iloc[[2, 3]])
+
+            assert_series_equal(s.nlargest(3), s2.iloc[[4, 0, 1]])
+            assert_series_equal(s.nlargest(3, take_last=True), s2.iloc[[4, 0, 3]])
+
+            empty = s2.iloc[0:0]
+            assert_series_equal(s.nsmallest(0), empty)
+            assert_series_equal(s.nsmallest(-1), empty)
+            assert_series_equal(s.nlargest(0), empty)
+            assert_series_equal(s.nlargest(-1), empty)
+
+            assert_series_equal(s.nsmallest(len(s)), s2.order())
+            assert_series_equal(s.nsmallest(len(s) + 1), s2.order())
+            assert_series_equal(s.nlargest(len(s)), s2.iloc[[4, 0, 1, 3, 2]])
+            assert_series_equal(s.nlargest(len(s) + 1), s2.iloc[[4, 0, 1, 3, 2]])
+
+        s = Series([3., np.nan, 1, 2, 5])
+        assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]])
+        assert_series_equal(s.nsmallest(), s.iloc[[2, 3, 0, 4]])
+
     def test_rank(self):
         from pandas.compat.scipy import rankdata
 
diff --git a/pandas/tools/util.py b/pandas/tools/util.py
index 6dbefc4b70930..8a8a2f89b2dd5 100644
--- a/pandas/tools/util.py
+++ b/pandas/tools/util.py
@@ -1,6 +1,9 @@
 from pandas.compat import reduce
 from pandas.core.index import Index
 import numpy as np
+from pandas import algos
+import pandas.core.common as com
+
 
 def match(needles, haystack):
     haystack = Index(haystack)
@@ -17,7 +20,7 @@ def cartesian_product(X):
     --------
     >>> cartesian_product([list('ABC'), [1, 2]])
     [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'),
- 	array([1, 2, 1, 2, 1, 2])]
+    array([1, 2, 1, 2, 1, 2])]
 
     '''
 
@@ -43,3 +46,78 @@ def compose(*funcs):
     """Compose 2 or more callables"""
     assert len(funcs) > 1, 'At least 2 callables must be passed to compose'
     return reduce(_compose2, funcs)
+
+
+def nsmallest(arr, n=5, take_last=False):
+    '''
+    Find the indices of the n smallest values of a numpy array.
+
+    Note: Fails silently with NaN.
+
+    '''
+    if n <= 0:
+        return np.array([], dtype=int)  # empty *index* array, not float64
+    elif n >= len(arr):
+        n = len(arr)
+
+    if arr.dtype == object:
+        try:
+            arr = arr.astype(float)
+        except:
+            raise TypeError("An object array must convert to float.")
+
+    if com.needs_i8_conversion(arr):
+        dtype = 'i8'
+        kth_s = algos.kth_smallest_int64
+    elif arr.dtype in ['int64']:
+        dtype = 'int64'
+        kth_s = algos.kth_smallest_int64
+    elif arr.dtype in ['float64']:
+        dtype = 'float64'
+        kth_s = algos.kth_smallest_float64
+    else:
+        raise NotImplementedError("Not implemented for %s dtype, "
+                                  "perhaps convert to int64 or float64, "
+                                  "or use .order().head(n)" % arr.dtype)
+
+    if take_last:
+        arr = arr.view(dtype)[::-1]
+    else:
+        arr = arr.view(dtype)
+
+    kth_val = kth_s(arr.copy(), n - 1)
+
+    ns = np.nonzero(arr <= kth_val)[0]
+    inds = ns[arr[ns].argsort(kind='mergesort')][:n]
+
+    if take_last:
+        # reverse indices
+        return len(arr) - 1 - inds
+    else:
+        return inds
+
+
+def nlargest(arr, n=5, take_last=False):
+    '''
+    Find the indices of the n largest values of a numpy array.
+
+    Note: Fails silently with NaN.
+
+    '''
+    if n <= 0:
+        return np.array([], dtype=int)  # empty *index* array, not float64
+    elif n >= len(arr):
+        n = len(arr)
+
+    if arr.dtype == object:
+        try:
+            arr = arr.astype(float)
+        except:
+            raise TypeError("An object array must convert to float.")
+
+    if com.needs_i8_conversion(arr):
+        arr = -arr.view('i8')
+    else:
+        arr = -arr
+
+    return nsmallest(arr, n, take_last=take_last)
diff --git a/vb_suite/series_methods.py b/vb_suite/series_methods.py
new file mode 100644
index 0000000000000..1659340cfe050
--- /dev/null
+++ b/vb_suite/series_methods.py
@@ -0,0 +1,29 @@
+from vbench.api import Benchmark
+from datetime import datetime
+
+common_setup = """from pandas_vb_common import *
+"""
+
+setup = common_setup + """
+s1 = Series(np.random.randn(10000))
+s2 = Series(np.random.randint(1, 10, 10000))
+"""
+
+series_nlargest1 = Benchmark('s1.nlargest(3, take_last=True);'
+                             's1.nlargest(3, take_last=False)',
+                             setup,
+                             start_date=datetime(2014, 1, 25))
+series_nlargest2 = Benchmark('s2.nlargest(3, take_last=True);'
+                             's2.nlargest(3, take_last=False)',
+                             setup,
+                             start_date=datetime(2014, 1, 25))
+
+series_nsmallest1 = Benchmark('s1.nsmallest(3, take_last=True);'
+                              's1.nsmallest(3, take_last=False)',
+                              setup,
+                              start_date=datetime(2014, 1, 25))
+
+series_nsmallest2 = Benchmark('s2.nsmallest(3, take_last=True);'
+                              's2.nsmallest(3, take_last=False)',
+                              setup,
+                              start_date=datetime(2014, 1, 25))

From 685ac641b77b64107dea85d82bf0133ad8f88211 Mon Sep 17 00:00:00 2001
From: Andy Hayden <andyhayden1@gmail.com>
Date: Fri, 11 Apr 2014 13:37:18 -0700
Subject: [PATCH 2/2] WIP: sort object arrays directly in nsmallest, narrow excepts

---
 pandas/core/series.py       | 13 +++++--------
 pandas/tests/test_series.py | 22 +++++++++-------------
 pandas/tools/util.py        |  6 ++----
 3 files changed, 16 insertions(+), 25 deletions(-)

diff --git a/pandas/core/series.py b/pandas/core/series.py
index 0e697ada11119..73a5ebce01d01 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1785,7 +1785,7 @@ def nlargest(self, n=5, take_last=False):
         if dropped.dtype == object:
             try:
                 dropped = dropped.astype(float)
-            except:
+            except (NotImplementedError, TypeError, ValueError):
                 return dropped.order(ascending=False).head(n)
 
         inds = nlargest(dropped.values, n, take_last)
@@ -1805,14 +1805,11 @@ def nsmallest(self, n=5, take_last=False):
         dropped = self.dropna()
 
         from pandas.tools.util import nsmallest
+        try:
+            inds = nsmallest(dropped.values, n, take_last)
+        except NotImplementedError:
+            return dropped.order().head(n)
 
-        if dropped.dtype == object:
-            try:
-                dropped = dropped.astype(float)
-            except:
-                return dropped.order().head(n)
-
-        inds = nsmallest(dropped.values, n, take_last)
         if len(inds) == 0:
             # TODO remove this special case
             return dropped[[]]
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py
index 89504ce599602..27a6281510e59 100644
--- a/pandas/tests/test_series.py
+++ b/pandas/tests/test_series.py
@@ -3967,27 +3967,23 @@ def test_nsmallest_nlargest(self):
                   Series(pd.to_datetime(['2003', '2002', '2001', '2002', '2005']))]
 
         for s in s_list:
-            if s.dtype == object:
-                s2 = s.astype(float)
-            else:
-                s2 = s
 
-            assert_series_equal(s.nsmallest(2), s2.iloc[[2, 1]])
-            assert_series_equal(s.nsmallest(2, take_last=True), s2.iloc[[2, 3]])
+            assert_series_equal(s.nsmallest(2), s.iloc[[2, 1]])
+            assert_series_equal(s.nsmallest(2, take_last=True), s.iloc[[2, 3]])
 
-            assert_series_equal(s.nlargest(3), s2.iloc[[4, 0, 1]])
-            assert_series_equal(s.nlargest(3, take_last=True), s2.iloc[[4, 0, 3]])
+            assert_series_equal(s.nlargest(3), s.iloc[[4, 0, 1]])
+            assert_series_equal(s.nlargest(3, take_last=True), s.iloc[[4, 0, 3]])
 
-            empty = s2.iloc[0:0]
+            empty = s.iloc[0:0]
             assert_series_equal(s.nsmallest(0), empty)
             assert_series_equal(s.nsmallest(-1), empty)
             assert_series_equal(s.nlargest(0), empty)
             assert_series_equal(s.nlargest(-1), empty)
 
-            assert_series_equal(s.nsmallest(len(s)), s2.order())
-            assert_series_equal(s.nsmallest(len(s) + 1), s2.order())
-            assert_series_equal(s.nlargest(len(s)), s2.iloc[[4, 0, 1, 3, 2]])
-            assert_series_equal(s.nlargest(len(s) + 1), s2.iloc[[4, 0, 1, 3, 2]])
+            assert_series_equal(s.nsmallest(len(s)), s.order())
+            assert_series_equal(s.nsmallest(len(s) + 1), s.order())
+            assert_series_equal(s.nlargest(len(s)), s.iloc[[4, 0, 1, 3, 2]])
+            assert_series_equal(s.nlargest(len(s) + 1), s.iloc[[4, 0, 1, 3, 2]])
 
         s = Series([3., np.nan, 1, 2, 5])
         assert_series_equal(s.nlargest(), s.iloc[[4, 0, 3, 2]])
diff --git a/pandas/tools/util.py b/pandas/tools/util.py
index 8a8a2f89b2dd5..60bbd68a8ac08 100644
--- a/pandas/tools/util.py
+++ b/pandas/tools/util.py
@@ -61,10 +61,8 @@ def nsmallest(arr, n=5, take_last=False):
         n = len(arr)
 
     if arr.dtype == object:
-        try:
-            arr = arr.astype(float)
-        except:
-            raise TypeError("An object array must convert to float.")
+        idx = np.arange(len(arr))[::-1 if take_last else 1]  # honor take_last
+        return idx[arr[idx].argsort(kind='mergesort')[:n]]
 
     if com.needs_i8_conversion(arr):
         dtype = 'i8'