Skip to content

Commit 32ef84b

Browse files
staftermathjreback
authored andcommitted
BUG: df.sort_values() not respecting na_position with categoricals #22556 (#22640)
1 parent 0c51d4d commit 32ef84b

File tree

5 files changed

+106
-30
lines changed

5 files changed

+106
-30
lines changed

doc/source/whatsnew/v0.24.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -791,6 +791,7 @@ Categorical
791791
^^^^^^^^^^^
792792

793793
- Bug in :meth:`Categorical.from_codes` where ``NaN`` values in ``codes`` were silently converted to ``0`` (:issue:`21767`). In the future this will raise a ``ValueError``. Also changes the behavior of ``.from_codes([1.1, 2.0])``.
794+
- Bug in :meth:`Categorical.sort_values` where ``NaN`` values were always positioned in front regardless of ``na_position`` value. (:issue:`22556`).
794795
- Bug when indexing with a boolean-valued ``Categorical``. Now a boolean-valued ``Categorical`` is treated as a boolean mask (:issue:`22665`)
795796
- Constructing a :class:`CategoricalIndex` with empty values and boolean categories was raising a ``ValueError`` after a change to dtype coercion (:issue:`22702`).
796797

pandas/core/arrays/categorical.py

+9-24
Original file line numberDiff line numberDiff line change
@@ -45,6 +45,8 @@
4545

4646
import pandas.core.algorithms as algorithms
4747

48+
from pandas.core.sorting import nargsort
49+
4850
from pandas.io.formats import console
4951
from pandas.io.formats.terminal import get_terminal_size
5052
from pandas.util._validators import validate_bool_kwarg, validate_fillna_kwargs
@@ -1605,32 +1607,15 @@ def sort_values(self, inplace=False, ascending=True, na_position='last'):
16051607
msg = 'invalid na_position: {na_position!r}'
16061608
raise ValueError(msg.format(na_position=na_position))
16071609

1608-
codes = np.sort(self._codes)
1609-
if not ascending:
1610-
codes = codes[::-1]
1611-
1612-
# NaN handling
1613-
na_mask = (codes == -1)
1614-
if na_mask.any():
1615-
n_nans = len(codes[na_mask])
1616-
if na_position == "first":
1617-
# in this case sort to the front
1618-
new_codes = codes.copy()
1619-
new_codes[0:n_nans] = -1
1620-
new_codes[n_nans:] = codes[~na_mask]
1621-
codes = new_codes
1622-
elif na_position == "last":
1623-
# ... and to the end
1624-
new_codes = codes.copy()
1625-
pos = len(codes) - n_nans
1626-
new_codes[0:pos] = codes[~na_mask]
1627-
new_codes[pos:] = -1
1628-
codes = new_codes
1610+
sorted_idx = nargsort(self,
1611+
ascending=ascending,
1612+
na_position=na_position)
1613+
16291614
if inplace:
1630-
self._codes = codes
1631-
return
1615+
self._codes = self._codes[sorted_idx]
16321616
else:
1633-
return self._constructor(values=codes, dtype=self.dtype,
1617+
return self._constructor(values=self._codes[sorted_idx],
1618+
dtype=self.dtype,
16341619
fastpath=True)
16351620

16361621
def _values_for_rank(self):

pandas/core/sorting.py

+13-1
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,19 @@ def nargsort(items, kind='quicksort', ascending=True, na_position='last'):
241241

242242
# specially handle Categorical
243243
if is_categorical_dtype(items):
244-
return items.argsort(ascending=ascending, kind=kind)
244+
if na_position not in {'first', 'last'}:
245+
raise ValueError('invalid na_position: {!r}'.format(na_position))
246+
247+
mask = isna(items)
248+
cnt_null = mask.sum()
249+
sorted_idx = items.argsort(ascending=ascending, kind=kind)
250+
if ascending and na_position == 'last':
251+
# NaN is coded as -1 and is listed in front after sorting
252+
sorted_idx = np.roll(sorted_idx, -cnt_null)
253+
elif not ascending and na_position == 'first':
254+
# NaN is coded as -1 and is listed in the end after sorting
255+
sorted_idx = np.roll(sorted_idx, cnt_null)
256+
return sorted_idx
245257

246258
items = np.asanyarray(items)
247259
idx = np.arange(len(items))

pandas/tests/frame/test_sorting.py

+82-4
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from pandas.compat import lrange
1111
from pandas.api.types import CategoricalDtype
1212
from pandas import (DataFrame, Series, MultiIndex, Timestamp,
13-
date_range, NaT, IntervalIndex)
13+
date_range, NaT, IntervalIndex, Categorical)
1414

1515
from pandas.util.testing import assert_series_equal, assert_frame_equal
1616

@@ -161,7 +161,7 @@ def test_sort_nan(self):
161161
'B': [5, 9, 2, nan, 5, 5, 4]},
162162
index=[2, 0, 3, 1, 6, 4, 5])
163163
sorted_df = df.sort_values(['A', 'B'], ascending=[
164-
1, 0], na_position='first')
164+
1, 0], na_position='first')
165165
assert_frame_equal(sorted_df, expected)
166166

167167
# na_position='last', not order
@@ -170,7 +170,7 @@ def test_sort_nan(self):
170170
'B': [4, 5, 5, nan, 2, 9, 5]},
171171
index=[5, 4, 6, 1, 3, 0, 2])
172172
sorted_df = df.sort_values(['A', 'B'], ascending=[
173-
0, 1], na_position='last')
173+
0, 1], na_position='last')
174174
assert_frame_equal(sorted_df, expected)
175175

176176
# Test DataFrame with nan label
@@ -514,7 +514,7 @@ def test_sort_index_categorical_index(self):
514514

515515
df = (DataFrame({'A': np.arange(6, dtype='int64'),
516516
'B': Series(list('aabbca'))
517-
.astype(CategoricalDtype(list('cab')))})
517+
.astype(CategoricalDtype(list('cab')))})
518518
.set_index('B'))
519519

520520
result = df.sort_index()
@@ -598,3 +598,81 @@ def test_sort_index_intervalindex(self):
598598
closed='right')
599599
result = result.columns.levels[1].categories
600600
tm.assert_index_equal(result, expected)
601+
602+
def test_sort_index_na_position_with_categories(self):
603+
# GH 22556
604+
# Positioning missing value properly when column is Categorical.
605+
categories = ['A', 'B', 'C']
606+
category_indices = [0, 2, 4]
607+
list_of_nans = [np.nan, np.nan]
608+
na_indices = [1, 3]
609+
na_position_first = 'first'
610+
na_position_last = 'last'
611+
column_name = 'c'
612+
613+
reversed_categories = sorted(categories, reverse=True)
614+
reversed_category_indices = sorted(category_indices, reverse=True)
615+
reversed_na_indices = sorted(na_indices, reverse=True)
616+
617+
df = pd.DataFrame({
618+
column_name: pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
619+
categories=categories,
620+
ordered=True)})
621+
# sort ascending with na first
622+
result = df.sort_values(by=column_name,
623+
ascending=True,
624+
na_position=na_position_first)
625+
expected = DataFrame({
626+
column_name: Categorical(list_of_nans + categories,
627+
categories=categories,
628+
ordered=True)
629+
}, index=na_indices + category_indices)
630+
631+
assert_frame_equal(result, expected)
632+
633+
# sort ascending with na last
634+
result = df.sort_values(by=column_name,
635+
ascending=True,
636+
na_position=na_position_last)
637+
expected = DataFrame({
638+
column_name: Categorical(categories + list_of_nans,
639+
categories=categories,
640+
ordered=True)
641+
}, index=category_indices + na_indices)
642+
643+
assert_frame_equal(result, expected)
644+
645+
# sort descending with na first
646+
result = df.sort_values(by=column_name,
647+
ascending=False,
648+
na_position=na_position_first)
649+
expected = DataFrame({
650+
column_name: Categorical(list_of_nans + reversed_categories,
651+
categories=categories,
652+
ordered=True)
653+
}, index=reversed_na_indices + reversed_category_indices)
654+
655+
assert_frame_equal(result, expected)
656+
657+
# sort descending with na last
658+
result = df.sort_values(by=column_name,
659+
ascending=False,
660+
na_position=na_position_last)
661+
expected = DataFrame({
662+
column_name: Categorical(reversed_categories + list_of_nans,
663+
categories=categories,
664+
ordered=True)
665+
}, index=reversed_category_indices + reversed_na_indices)
666+
667+
assert_frame_equal(result, expected)
668+
669+
def test_sort_index_na_position_with_categories_raises(self):
670+
df = pd.DataFrame({
671+
'c': pd.Categorical(['A', np.nan, 'B', np.nan, 'C'],
672+
categories=['A', 'B', 'C'],
673+
ordered=True)})
674+
675+
with pytest.raises(ValueError):
676+
df.sort_values(by='c',
677+
ascending=False,
678+
na_position='bad_position')

pandas/tests/io/test_stata.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -995,7 +995,7 @@ def test_categorical_sorting(self, file):
995995
parsed = read_stata(getattr(self, file))
996996

997997
# Sort based on codes, not strings
998-
parsed = parsed.sort_values("srh")
998+
parsed = parsed.sort_values("srh", na_position='first')
999999

10001000
# Don't sort index
10011001
parsed.index = np.arange(parsed.shape[0])

0 commit comments

Comments
 (0)