Skip to content

Commit 87e1212

Browse files
committed
Merge pull request #5231 from unutbu/nan-sort
ENH/FIX: Add na_position parameter to DataFrame.sort. Fixes GH3917
2 parents 110406c + 3230ed4 commit 87e1212

12 files changed

+352
-65
lines changed

doc/source/basics.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -1286,14 +1286,14 @@ The ``by`` argument can take a list of column names, e.g.:
12861286
12871287
Series has the method ``order`` (analogous to `R's order function
12881288
<http://stat.ethz.ch/R-manual/R-patched/library/base/html/order.html>`__) which
1289-
sorts by value, with special treatment of NA values via the ``na_last``
1289+
sorts by value, with special treatment of NA values via the ``na_position``
12901290
argument:
12911291

12921292
.. ipython:: python
12931293
12941294
s[2] = np.nan
12951295
s.order()
1296-
s.order(na_last=False)
1296+
s.order(na_position='first')
12971297
12981298
Some other sorting notes / nuances:
12991299

doc/source/release.rst

+2
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,8 @@ API Changes
147147
- Define and document the order of column vs index names in query/eval
148148
(:issue:`6676`)
149149

150+
- ``DataFrame.sort`` now places NaNs at the beginning or end of the sort according to the ``na_position`` parameter. (:issue:`3917`)
151+
150152
Deprecations
151153
~~~~~~~~~~~~
152154

pandas/core/common.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -316,9 +316,9 @@ def array_equivalent(left, right):
316316
# NaNs occur only in object arrays, float or complex arrays.
317317
if issubclass(left.dtype.type, np.object_):
318318
return ((left == right) | (pd.isnull(left) & pd.isnull(right))).all()
319-
if not issubclass(left.dtype.type, (np.floating, np.complexfloating)):
320-
return np.array_equal(left, right)
321-
return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
319+
if issubclass(left.dtype.type, (np.floating, np.complexfloating)):
320+
return ((left == right) | (np.isnan(left) & np.isnan(right))).all()
321+
return np.array_equal(left, right)
322322

323323
def _iterable_not_string(x):
324324
return (isinstance(x, collections.Iterable) and

pandas/core/frame.py

100644100755
+30-27
Original file line numberDiff line numberDiff line change
@@ -2522,7 +2522,7 @@ def _m8_to_i8(x):
25222522
# Sorting
25232523

25242524
def sort(self, columns=None, axis=0, ascending=True,
2525-
inplace=False):
2525+
inplace=False, kind='quicksort', na_position='last'):
25262526
"""
25272527
Sort DataFrame either by labels (along either axis) or by the values in
25282528
column(s)
@@ -2540,6 +2540,11 @@ def sort(self, columns=None, axis=0, ascending=True,
25402540
Sort index/rows versus columns
25412541
inplace : boolean, default False
25422542
Sort the DataFrame without creating a new instance
2543+
kind : {'quicksort', 'mergesort', 'heapsort'}, optional
2544+
This option is only applied when sorting on a single column or label.
2545+
na_position : {'first', 'last'} (optional, default='last')
2546+
'first' puts NaNs at the beginning
2547+
'last' puts NaNs at the end
25432548
25442549
Examples
25452550
--------
@@ -2550,10 +2555,10 @@ def sort(self, columns=None, axis=0, ascending=True,
25502555
sorted : DataFrame
25512556
"""
25522557
return self.sort_index(by=columns, axis=axis, ascending=ascending,
2553-
inplace=inplace)
2558+
inplace=inplace, kind=kind, na_position=na_position)
25542559

25552560
def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
2556-
kind='quicksort'):
2561+
kind='quicksort', na_position='last'):
25572562
"""
25582563
Sort DataFrame either by labels (along either axis) or by the values in
25592564
a column
@@ -2571,6 +2576,11 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
25712576
orders
25722577
inplace : boolean, default False
25732578
Sort the DataFrame without creating a new instance
2579+
na_position : {'first', 'last'} (optional, default='last')
2580+
'first' puts NaNs at the beginning
2581+
'last' puts NaNs at the end
2582+
kind : {'quicksort', 'mergesort', 'heapsort'}, optional
2583+
This option is only applied when sorting on a single column or label.
25742584
25752585
Examples
25762586
--------
@@ -2580,8 +2590,8 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
25802590
-------
25812591
sorted : DataFrame
25822592
"""
2583-
from pandas.core.groupby import _lexsort_indexer
2584-
2593+
2594+
from pandas.core.groupby import _lexsort_indexer, _nargsort
25852595
axis = self._get_axis_number(axis)
25862596
if axis not in [0, 1]: # pragma: no cover
25872597
raise AssertionError('Axis must be 0 or 1, got %s' % str(axis))
@@ -2597,23 +2607,19 @@ def sort_index(self, axis=0, by=None, ascending=True, inplace=False,
25972607
if com._is_sequence(ascending) and len(by) != len(ascending):
25982608
raise ValueError('Length of ascending (%d) != length of by'
25992609
' (%d)' % (len(ascending), len(by)))
2600-
26012610
if len(by) > 1:
2602-
keys = []
2603-
for x in by:
2604-
k = self[x].values
2605-
if k.ndim == 2:
2606-
raise ValueError('Cannot sort by duplicate column %s'
2607-
% str(x))
2608-
keys.append(k)
2609-
26102611
def trans(v):
26112612
if com.needs_i8_conversion(v):
26122613
return v.view('i8')
26132614
return v
2614-
2615-
keys = [trans(self[x].values) for x in by]
2616-
indexer = _lexsort_indexer(keys, orders=ascending)
2615+
keys = []
2616+
for x in by:
2617+
k = self[x].values
2618+
if k.ndim == 2:
2619+
raise ValueError('Cannot sort by duplicate column %s' % str(x))
2620+
keys.append(trans(k))
2621+
indexer = _lexsort_indexer(keys, orders=ascending,
2622+
na_position=na_position)
26172623
indexer = com._ensure_platform_int(indexer)
26182624
else:
26192625
by = by[0]
@@ -2630,20 +2636,17 @@ def trans(v):
26302636
% str(by))
26312637
if isinstance(ascending, (tuple, list)):
26322638
ascending = ascending[0]
2639+
indexer = _nargsort(k, kind=kind, ascending=ascending,
2640+
na_position=na_position)
26332641

2634-
if not ascending:
2635-
k = k[::-1]
2636-
indexer = k.argsort(kind=kind)
2637-
if not ascending:
2638-
indexer = indexer.max() - indexer[::-1]
26392642
elif isinstance(labels, MultiIndex):
2640-
indexer = _lexsort_indexer(labels.labels, orders=ascending)
2643+
indexer = _lexsort_indexer(labels.labels, orders=ascending,
2644+
na_position=na_position)
26412645
indexer = com._ensure_platform_int(indexer)
26422646
else:
2643-
indexer = labels.argsort(kind=kind)
2644-
if not ascending:
2645-
indexer = indexer[::-1]
2646-
2647+
indexer = _nargsort(labels, kind=kind, ascending=ascending,
2648+
na_position=na_position)
2649+
26472650
if inplace:
26482651
if axis == 1:
26492652
new_data = self._data.reindex_items(

pandas/core/groupby.py

+47-8
Original file line numberDiff line numberDiff line change
@@ -3145,33 +3145,72 @@ def _indexer_from_factorized(labels, shape, compress=True):
31453145
return indexer
31463146

31473147

3148-
def _lexsort_indexer(keys, orders=None):
3148+
def _lexsort_indexer(keys, orders=None, na_position='last'):
31493149
labels = []
31503150
shape = []
3151-
31523151
if isinstance(orders, bool):
31533152
orders = [orders] * len(keys)
31543153
elif orders is None:
31553154
orders = [True] * len(keys)
31563155

31573156
for key, order in zip(keys, orders):
3157+
key = np.asanyarray(key)
31583158
rizer = _hash.Factorizer(len(key))
31593159

31603160
if not key.dtype == np.object_:
31613161
key = key.astype('O')
31623162

3163+
# factorize maps nans to na_sentinel=-1
31633164
ids = rizer.factorize(key, sort=True)
3164-
31653165
n = len(rizer.uniques)
3166+
mask = (ids == -1)
3167+
if order: # ascending
3168+
if na_position == 'last':
3169+
ids = np.where(mask, n, ids)
3170+
elif na_position == 'first':
3171+
ids += 1
3172+
else:
3173+
raise ValueError('invalid na_position: {!r}'.format(na_position))
3174+
else: # not order means descending
3175+
if na_position == 'last':
3176+
ids = np.where(mask, n, n-ids-1)
3177+
elif na_position == 'first':
3178+
ids = np.where(mask, 0, n-ids)
3179+
else:
3180+
raise ValueError('invalid na_position: {!r}'.format(na_position))
3181+
if mask.any():
3182+
n += 1
31663183
shape.append(n)
3167-
if not order:
3168-
mask = ids == -1
3169-
ids = np.where(mask, -1, n - ids)
3170-
31713184
labels.append(ids)
3172-
31733185
return _indexer_from_factorized(labels, shape)
31743186

3187+
def _nargsort(items, kind='quicksort', ascending=True, na_position='last'):
3188+
"""
3189+
This is intended to be a drop-in replacement for np.argsort which handles NaNs
3190+
It adds ascending and na_position parameters.
3191+
GH #6399, #5231
3192+
"""
3193+
items = np.asanyarray(items)
3194+
idx = np.arange(len(items))
3195+
mask = isnull(items)
3196+
non_nans = items[~mask]
3197+
non_nan_idx = idx[~mask]
3198+
nan_idx = np.nonzero(mask)[0]
3199+
if not ascending:
3200+
non_nans = non_nans[::-1]
3201+
non_nan_idx = non_nan_idx[::-1]
3202+
indexer = non_nan_idx[non_nans.argsort(kind=kind)]
3203+
if not ascending:
3204+
indexer = indexer[::-1]
3205+
# Finally, place the NaNs at the end or the beginning according to na_position
3206+
if na_position == 'last':
3207+
indexer = np.concatenate([indexer, nan_idx])
3208+
elif na_position == 'first':
3209+
indexer = np.concatenate([nan_idx, indexer])
3210+
else:
3211+
raise ValueError('invalid na_position: {!r}'.format(na_position))
3212+
return indexer
3213+
31753214

31763215
class _KeyMapper(object):
31773216

pandas/core/index.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@
1313
from pandas.core.base import FrozenList, FrozenNDArray, IndexOpsMixin
1414

1515
from pandas.util.decorators import cache_readonly, deprecate
16-
from pandas.core.common import isnull
16+
from pandas.core.common import isnull, array_equivalent
1717
import pandas.core.common as com
1818
from pandas.core.common import _values_from_object, is_float, is_integer, ABCSeries
1919
from pandas.core.config import get_option
@@ -800,7 +800,7 @@ def equals(self, other):
800800
if type(other) != Index:
801801
return other.equals(self)
802802

803-
return np.array_equal(self, other)
803+
return array_equivalent(self, other)
804804

805805
def identical(self, other):
806806
"""Similar to equals, but check that other comparable attributes are
@@ -1872,7 +1872,7 @@ def equals(self, other):
18721872
# return False
18731873

18741874
try:
1875-
return np.array_equal(self, other)
1875+
return array_equivalent(self, other)
18761876
except TypeError:
18771877
# e.g. fails in numpy 1.6 with DatetimeIndex #1681
18781878
return False
@@ -3533,7 +3533,7 @@ def equals(self, other):
35333533
return True
35343534

35353535
if not isinstance(other, MultiIndex):
3536-
return np.array_equal(self.values, _ensure_index(other))
3536+
return array_equivalent(self.values, _ensure_index(other))
35373537

35383538
if self.nlevels != other.nlevels:
35393539
return False
@@ -3546,7 +3546,7 @@ def equals(self, other):
35463546
allow_fill=False)
35473547
ovalues = com.take_nd(other.levels[i].values, other.labels[i],
35483548
allow_fill=False)
3549-
if not np.array_equal(svalues, ovalues):
3549+
if not array_equivalent(svalues, ovalues):
35503550
return False
35513551

35523552
return True

pandas/core/series.py

+14-5
Original file line numberDiff line numberDiff line change
@@ -1743,24 +1743,32 @@ def rank(self, method='average', na_option='keep', ascending=True,
17431743
ascending=ascending, pct=pct)
17441744
return self._constructor(ranks, index=self.index).__finalize__(self)
17451745

1746-
def order(self, na_last=True, ascending=True, kind='mergesort'):
1746+
def order(self, na_last=None, ascending=True, kind='mergesort', na_position='last'):
17471747
"""
17481748
Sorts Series object, by value, maintaining index-value link
17491749
17501750
Parameters
17511751
----------
1752-
na_last : boolean (optional, default=True)
1752+
na_last : boolean (optional, default=None) (DEPRECATED; use na_position)
17531753
Put NaN's at beginning or end
17541754
ascending : boolean, default True
17551755
Sort ascending. Passing False sorts descending
17561756
kind : {'mergesort', 'quicksort', 'heapsort'}, default 'mergesort'
17571757
Choice of sorting algorithm. See np.sort for more
17581758
information. 'mergesort' is the only stable algorithm
1759+
na_position : {'first', 'last'} (optional, default='last')
1760+
'first' puts NaNs at the beginning
1761+
'last' puts NaNs at the end
17591762
17601763
Returns
17611764
-------
17621765
y : Series
17631766
"""
1767+
if na_last is not None:
1768+
warnings.warn(("na_last is deprecated. Please use na_position instead"),
1769+
FutureWarning)
1770+
na_position = 'last' if na_last else 'first'
1771+
17641772
def _try_kind_sort(arr):
17651773
# easier to ask forgiveness than permission
17661774
try:
@@ -1784,15 +1792,16 @@ def _try_kind_sort(arr):
17841792
if not ascending:
17851793
argsorted = argsorted[::-1]
17861794

1787-
if na_last:
1795+
if na_position == 'last':
17881796
n = good.sum()
17891797
sortedIdx[:n] = idx[good][argsorted]
17901798
sortedIdx[n:] = idx[bad]
1791-
else:
1799+
elif na_position == 'first':
17921800
n = bad.sum()
17931801
sortedIdx[n:] = idx[good][argsorted]
17941802
sortedIdx[:n] = idx[bad]
1795-
1803+
else:
1804+
raise ValueError('invalid na_position: {!r}'.format(na_position))
17961805
return self._constructor(arr[sortedIdx], index=self.index[sortedIdx])\
17971806
.__finalize__(self)
17981807

pandas/hashtable.pyx

+8-5
Original file line numberDiff line numberDiff line change
@@ -835,20 +835,23 @@ cdef class Factorizer:
835835
return self.count
836836

837837
def factorize(self, ndarray[object] values, sort=False, na_sentinel=-1):
838+
"""
839+
Factorize values with nans replaced by na_sentinel
840+
>>> factorize(np.array([1,2,np.nan], dtype='O'), na_sentinel=20)
841+
array([ 0, 1, 20])
842+
"""
838843
labels = self.table.get_labels(values, self.uniques,
839844
self.count, na_sentinel)
840-
845+
mask = (labels == na_sentinel)
841846
# sort on
842847
if sort:
843848
if labels.dtype != np.int_:
844849
labels = labels.astype(np.int_)
845-
846850
sorter = self.uniques.to_array().argsort()
847851
reverse_indexer = np.empty(len(sorter), dtype=np.int_)
848852
reverse_indexer.put(sorter, np.arange(len(sorter)))
849-
850-
labels = reverse_indexer.take(labels)
851-
853+
labels = reverse_indexer.take(labels, mode='clip')
854+
labels[mask] = na_sentinel
852855
self.count = len(self.uniques)
853856
return labels
854857

0 commit comments

Comments
 (0)