Merge pull request #5231 from unutbu/nan-sort

jreback · jreback · commit 87e121282937 · 2014-03-27T18:23:50.000-04:00
ENH/FIX: Add na_position parameter to DataFrame.sort. Fixes GH3917
diff --git a/doc/source/basics.rst b/doc/source/basics.rst
@@ -1286,14 +1286,14 @@ The ``by`` argument can take a list of column names, e.g.:
 
 Series has the method ``order`` (analogous to `R's order function
 <http://stat.ethz.ch/R-manual/R-patched/library/base/html/order.html>`__) which
-sorts by value, with special treatment of NA values via the ``na_last``
+sorts by value, with special treatment of NA values via the ``na_position``
 argument:
 
 .. ipython:: python
 
    s[2] = np.nan
    s.order()
-   s.order(na_last=False)
+   s.order(na_position='first')
 
 Some other sorting notes / nuances:
 
diff --git a/doc/source/release.rst b/doc/source/release.rst
@@ -147,6 +147,8 @@ API Changes
 - Define and document the order of column vs index names in query/eval
     (:issue:`6676`)
 
+- ``DataFrame.sort`` now places NaNs at the beginning or end of the sort according to the ``na_position`` parameter. (:issue:`3917`)
+
 Deprecations
 ~~~~~~~~~~~~
 
diff --git a/pandas/core/common.py b/pandas/core/common.py
@@ -316,9 +316,9 @@ def array_equivalent(left, right):
     # NaNs occur only in object arrays, float or complex arrays.
     if issubclass(left.dtype.type, np.object_):
         return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
-    if not issubclass(left.dtype.type, (np.floating, np.complexfloating)):
-        return np.array_equal(left, right)
-    return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
+    if issubclass(left.dtype.type, (np.floating, np.complexfloating)):
+        return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
+    return np.array_equal(left, right)
 
 def _iterable_not_string(x):
     return (isinstance(x, collections.Iterable) and
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2522,7 +2522,7 @@ def _m8_to_i8(x):
     # Sorting
 
     def sort(self, columns=None, axis=0, ascending=True,
-             inplace=False):
+             inplace=False, kind='quicksort', na_position='last'):
         """
         Sort DataFrame either by labels (along either axis) or by the values in
         column(s)
@@ -2540,6 +2540,11 @@ def sort(self, columns=None, axis=0, ascending=True,
             Sort index/rows versus columns
         inplace : boolean, default False
             Sort the DataFrame without creating a new instance
+        kind : {'quicksort', 'mergesort', 'heapsort'}, optional
+            This option is only applied when sorting on a single column or label.
+        na_position : {'first', 'last'} (optional, default='last')
+            'first' puts NaNs at the beginning
+            'last' puts NaNs at the end
 
         Examples
         --------
@@ -2550,10 +2555,10 @@ def sort(self, columns=None, axis=0, ascending=True,
         sorted : DataFrame
         """
         return self.sort_index(by=columns, axis=axis, ascending=ascending,
-                               inplace=inplace)
+                               inplace=inplace, kind=kind, na_position=na_position)
 
     def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
-                   kind='quicksort'):
+                   kind='quicksort', na_position='last'):
         """
         Sort DataFrame either by labels (along either axis) or by the values in
         a column
@@ -2571,6 +2576,11 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
             orders
         inplace : boolean, default False
             Sort the DataFrame without creating a new instance
+        na_position : {'first', 'last'} (optional, default='last')
+            'first' puts NaNs at the beginning
+            'last' puts NaNs at the end
+        kind : {'quicksort', 'mergesort', 'heapsort'}, optional
+            This option is only applied when sorting on a single column or label.
 
         Examples
         --------
@@ -2580,8 +2590,8 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
         -------
         sorted : DataFrame
         """
-        from pandas.core.groupby import _lexsort_indexer
-
+        
+        from pandas.core.groupby import _lexsort_indexer, _nargsort
         axis = self._get_axis_number(axis)
         if axis not in [0, 1]:  # pragma: no cover
             raise AssertionError('Axis must be 0 or 1, got %s' % str(axis))
@@ -2597,23 +2607,19 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
             if com._is_sequence(ascending) and len(by) != len(ascending):
                 raise ValueError('Length of ascending (%d) != length of by'
                                  ' (%d)' % (len(ascending), len(by)))
-
             if len(by) > 1:
-                keys = []
-                for x in by:
-                    k = self[x].values
-                    if k.ndim == 2:
-                        raise ValueError('Cannot sort by duplicate column %s'
-                                         % str(x))
-                    keys.append(k)
-
                 def trans(v):
                     if com.needs_i8_conversion(v):
                         return v.view('i8')
                     return v
-
-                keys = [trans(self[x].values) for x in by]
-                indexer = _lexsort_indexer(keys, orders=ascending)
+                keys = []
+                for x in by:
+                    k = self[x].values
+                    if k.ndim == 2:
+                        raise ValueError('Cannot sort by duplicate column %s' % str(x))
+                    keys.append(trans(k))
+                indexer = _lexsort_indexer(keys, orders=ascending,
+                                           na_position=na_position)
                 indexer = com._ensure_platform_int(indexer)
             else:
                 by = by[0]
@@ -2630,20 +2636,17 @@ def trans(v):
                                      % str(by))
                 if isinstance(ascending, (tuple, list)):
                     ascending = ascending[0]
+                indexer = _nargsort(k, kind=kind, ascending=ascending,
+                                    na_position=na_position)
 
-                if not ascending:
-                    k = k[::-1]
-                indexer = k.argsort(kind=kind)
-                if not ascending:
-                    indexer = indexer.max() - indexer[::-1]
         elif isinstance(labels, MultiIndex):
-            indexer = _lexsort_indexer(labels.labels, orders=ascending)
+            indexer = _lexsort_indexer(labels.labels, orders=ascending,
+                                       na_position=na_position)
             indexer = com._ensure_platform_int(indexer)
         else:
-            indexer = labels.argsort(kind=kind)
-            if not ascending:
-                indexer = indexer[::-1]
-
+            indexer = _nargsort(labels, kind=kind, ascending=ascending,
+                                na_position=na_position)
+            
         if inplace:
             if axis == 1:
                 new_data = self._data.reindex_items(
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -3145,33 +3145,72 @@ def _indexer_from_factorized(labels, shape, compress=True):
     return indexer
 
 
-def _lexsort_indexer(keys, orders=None):
+def _lexsort_indexer(keys, orders=None, na_position='last'):
     labels = []
     shape = []
-
     if isinstance(orders, bool):
         orders = [orders] * len(keys)
     elif orders is None:
         orders = [True] * len(keys)
 
     for key, order in zip(keys, orders):
+        key = np.asanyarray(key)
         rizer = _hash.Factorizer(len(key))
 
         if not key.dtype == np.object_:
             key = key.astype('O')
 
+        # factorize maps nans to na_sentinel=-1
         ids = rizer.factorize(key, sort=True)
-
         n = len(rizer.uniques)
+        mask = (ids == -1)
+        if order: # ascending
+            if na_position == 'last':
+                ids = np.where(mask, n, ids)
+            elif na_position == 'first':
+                ids += 1
+            else:
+                raise ValueError('invalid na_position: {!r}'.format(na_position))
+        else: # not order means descending
+            if na_position == 'last':
+                ids = np.where(mask, n, n-ids-1)
+            elif na_position == 'first':
+                ids = np.where(mask, 0, n-ids)
+            else:
+                raise ValueError('invalid na_position: {!r}'.format(na_position))
+        if mask.any():
+            n += 1
         shape.append(n)
-        if not order:
-            mask = ids == -1
-            ids = np.where(mask, -1, n - ids)
-
         labels.append(ids)
-
     return _indexer_from_factorized(labels, shape)
 
+def _nargsort(items, kind='quicksort', ascending=True, na_position='last'):
+    """
+    This is intended to be a drop-in replacement for np.argsort that handles NaNs.
+    It adds ascending and na_position parameters.
+    GH #6399, #5231
+    """
+    items = np.asanyarray(items)
+    idx = np.arange(len(items))
+    mask = isnull(items)
+    non_nans = items[~mask]
+    non_nan_idx = idx[~mask]
+    nan_idx = np.nonzero(mask)[0]
+    if not ascending:
+        non_nans = non_nans[::-1]
+        non_nan_idx = non_nan_idx[::-1]
+    indexer = non_nan_idx[non_nans.argsort(kind=kind)]
+    if not ascending:
+        indexer = indexer[::-1]
+    # Finally, place the NaNs at the end or the beginning according to na_position
+    if na_position == 'last':
+        indexer = np.concatenate([indexer, nan_idx])
+    elif na_position == 'first':
+        indexer = np.concatenate([nan_idx, indexer])
+    else:
+        raise ValueError('invalid na_position: {!r}'.format(na_position))
+    return indexer
+
 
 class _KeyMapper(object):
 
diff --git a/pandas/core/index.py b/pandas/core/index.py
@@ -13,7 +13,7 @@
 from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin
 
 from pandas.util.decorators import cache_readonly, deprecate
-from pandas.core.common import isnull
+from pandas.core.common import isnull, array_equivalent
 import pandas.core.common as com
 from pandas.core.common import _values_from_object, is_float, is_integer, ABCSeries
 from pandas.core.config import get_option
@@ -800,7 +800,7 @@ def equals(self, other):
         if type(other) != Index:
             return other.equals(self)
 
-        return np.array_equal(self, other)
+        return array_equivalent(self, other)
 
     def identical(self, other):
         """Similar to equals, but check that other comparable attributes are
@@ -1872,7 +1872,7 @@ def equals(self, other):
         #     return False
 
         try:
-            return np.array_equal(self, other)
+            return array_equivalent(self, other)
         except TypeError:
             # e.g. fails in numpy 1.6 with DatetimeIndex #1681
             return False
@@ -3533,7 +3533,7 @@ def equals(self, other):
             return True
 
         if not isinstance(other, MultiIndex):
-            return np.array_equal(self.values, _ensure_index(other))
+            return array_equivalent(self.values, _ensure_index(other))
 
         if self.nlevels != other.nlevels:
             return False
@@ -3546,7 +3546,7 @@ def equals(self, other):
                                   allow_fill=False)
             ovalues = com.take_nd(other.levels[i].values, other.labels[i],
                                   allow_fill=False)
-            if not np.array_equal(svalues, ovalues):
+            if not array_equivalent(svalues, ovalues):
                 return False
 
         return True
diff --git a/pandas/core/series.py b/pandas/core/series.py
@@ -1743,24 +1743,32 @@ def rank(self, method='average', na_option='keep', ascending=True,
                      ascending=ascending, pct=pct)
         return self._constructor(ranks, index=self.index).__finalize__(self)
 
-    def order(self, na_last=True, ascending=True, kind='mergesort'):
+    def order(self, na_last=None, ascending=True, kind='mergesort', na_position='last'):
         """
         Sorts Series object, by value, maintaining index-value link
 
         Parameters
         ----------
-        na_last : boolean (optional, default=True)
+        na_last : boolean (optional, default=None) (DEPRECATED; use na_position)
             Put NaN's at beginning or end
         ascending : boolean, default True
             Sort ascending. Passing False sorts descending
         kind : {'mergesort', 'quicksort', 'heapsort'}, default 'mergesort'
             Choice of sorting algorithm. See np.sort for more
             information. 'mergesort' is the only stable algorithm
+        na_position : {'first', 'last'} (optional, default='last')
+            'first' puts NaNs at the beginning
+            'last' puts NaNs at the end
 
         Returns
         -------
         y : Series
         """
+        if na_last is not None:
+            warnings.warn(("na_last is deprecated. Please use na_position instead"),
+                          FutureWarning)
+            na_position = 'last' if na_last else 'first'
+            
         def _try_kind_sort(arr):
             # easier to ask forgiveness than permission
             try:
@@ -1784,15 +1792,16 @@ def _try_kind_sort(arr):
         if not ascending:
             argsorted = argsorted[::-1]
 
-        if na_last:
+        if na_position == 'last':
             n = good.sum()
             sortedIdx[:n] = idx[good][argsorted]
             sortedIdx[n:] = idx[bad]
-        else:
+        elif na_position == 'first':
             n = bad.sum()
             sortedIdx[n:] = idx[good][argsorted]
             sortedIdx[:n] = idx[bad]
-
+        else:
+            raise ValueError('invalid na_position: {!r}'.format(na_position))
         return self._constructor(arr[sortedIdx], index=self.index[sortedIdx])\
                    .__finalize__(self)
 
diff --git a/pandas/hashtable.pyx b/pandas/hashtable.pyx
@@ -835,20 +835,23 @@ cdef class Factorizer:
         return self.count
 
     def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1):
+        """
+        Factorize values with nans replaced by na_sentinel
+        >>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
+        array([ 0,  1, 20])
+        """
         labels = self.table.get_labels(values, self.uniques,
                                        self.count, na_sentinel)
-
+        mask = (labels == na_sentinel)
         # sort on
         if sort:
             if labels.dtype != np.int_:
                 labels = labels.astype(np.int_)
-
             sorter = self.uniques.to_array().argsort()
             reverse_indexer = np.empty(len(sorter), dtype=np.int_)
             reverse_indexer.put(sorter, np.arange(len(sorter)))
-
-            labels = reverse_indexer.take(labels)
-
+            labels = reverse_indexer.take(labels, mode='clip')
+            labels[mask] = na_sentinel
         self.count = len(self.uniques)
         return labels
 
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
diff --git a/pandas/tests/test_groupby.py b/pandas/tests/test_groupby.py
diff --git a/pandas/tests/test_hashtable.py b/pandas/tests/test_hashtable.py
diff --git a/pandas/tests/test_series.py b/pandas/tests/test_series.py