Skip to content

Commit 9857a0e

Browse files
committed
Merge pull request #7850 from jreback/cat_sort
BUG: fix multi-column sort that includes Categoricals / concat (GH7848/GH7864)
2 parents 381a289 + 620462b commit 9857a0e

File tree

5 files changed

+122
-52
lines changed

5 files changed

+122
-52
lines changed

doc/source/v0.15.0.txt

+2-1
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,8 @@ Categoricals in Series/DataFrame
116116
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
117117

118118
:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
119-
methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, :issue:`7444`, :issue:`7839`).
119+
methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
120+
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`).
120121

121122
For full docs, see the :ref:`Categorical introduction <categorical>` and the :ref:`API documentation <api.categorical>`.
122123

pandas/core/categorical.py

+31-21
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
from pandas.core.config import get_option
1717
from pandas.core import format as fmt
1818

19-
2019
def _cat_compare_op(op):
2120
def f(self, other):
2221
if isinstance(other, (Categorical, np.ndarray)):
@@ -45,16 +44,6 @@ def _maybe_to_categorical(array):
4544
return array
4645

4746

48-
def _get_codes_for_values(values, levels):
49-
from pandas.core.algorithms import _get_data_algo, _hashtables
50-
if values.dtype != levels.dtype:
51-
values = com._ensure_object(values)
52-
levels = com._ensure_object(levels)
53-
(hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
54-
t = hash_klass(len(levels))
55-
t.map_locations(levels)
56-
return com._ensure_platform_int(t.lookup(values))
57-
5847
_codes_doc = """The level codes of this categorical.
5948
6049
Level codes are an array of integers which are the positions of the real
@@ -484,7 +473,7 @@ def argsort(self, ascending=True, **kwargs):
484473
result = result[::-1]
485474
return result
486475

487-
def order(self, inplace=False, ascending=True, **kwargs):
476+
def order(self, inplace=False, ascending=True, na_position='last', **kwargs):
488477
""" Sorts the Category by level value returning a new Categorical by default.
489478
490479
Only ordered Categoricals can be sorted!
@@ -495,11 +484,11 @@ def order(self, inplace=False, ascending=True, **kwargs):
495484
----------
496485
ascending : boolean, default True
497486
Sort ascending. Passing False sorts descending
487+
inplace : boolean, default False
488+
Do operation in place.
498489
na_position : {'first', 'last'} (optional, default='last')
499490
'first' puts NaNs at the beginning
500491
'last' puts NaNs at the end
501-
inplace : boolean, default False
502-
Do operation in place.
503492
504493
Returns
505494
-------
@@ -511,18 +500,22 @@ def order(self, inplace=False, ascending=True, **kwargs):
511500
"""
512501
if not self.ordered:
513502
raise TypeError("Categorical not ordered")
514-
_sorted = np.sort(self._codes.copy())
503+
if na_position not in ['last','first']:
504+
raise ValueError('invalid na_position: {!r}'.format(na_position))
505+
506+
codes = np.sort(self._codes.copy())
515507
if not ascending:
516-
_sorted = _sorted[::-1]
508+
codes = codes[::-1]
509+
517510
if inplace:
518-
self._codes = _sorted
511+
self._codes = codes
519512
return
520513
else:
521-
return Categorical(values=_sorted,levels=self.levels, ordered=self.ordered,
514+
return Categorical(values=codes,levels=self.levels, ordered=self.ordered,
522515
name=self.name, fastpath=True)
523516

524517

525-
def sort(self, inplace=True, ascending=True, **kwargs):
518+
def sort(self, inplace=True, ascending=True, na_position='last', **kwargs):
526519
""" Sorts the Category inplace by level value.
527520
528521
Only ordered Categoricals can be sorted!
@@ -533,11 +526,11 @@ def sort(self, inplace=True, ascending=True, **kwargs):
533526
----------
534527
ascending : boolean, default True
535528
Sort ascending. Passing False sorts descending
529+
inplace : boolean, default True
530+
Do operation in place.
536531
na_position : {'first', 'last'} (optional, default='last')
537532
'first' puts NaNs at the beginning
538533
'last' puts NaNs at the end
539-
inplace : boolean, default False
540-
Do operation in place.
541534
542535
Returns
543536
-------
@@ -932,3 +925,20 @@ def describe(self):
932925
result.index.name = 'levels'
933926
result.columns = ['counts','freqs']
934927
return result
928+
929+
##### utility routines #####
930+
931+
def _get_codes_for_values(values, levels):
932+
""""
933+
utility routine to turn values into codes given the specified levels
934+
"""
935+
936+
from pandas.core.algorithms import _get_data_algo, _hashtables
937+
if values.dtype != levels.dtype:
938+
values = com._ensure_object(values)
939+
levels = com._ensure_object(levels)
940+
(hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
941+
t = hash_klass(len(levels))
942+
t.map_locations(levels)
943+
return com._ensure_platform_int(t.lookup(values))
944+

pandas/core/groupby.py

+21-17
Original file line numberDiff line numberDiff line change
@@ -3415,34 +3415,38 @@ def _lexsort_indexer(keys, orders=None, na_position='last'):
34153415
orders = [True] * len(keys)
34163416

34173417
for key, order in zip(keys, orders):
3418-
key = np.asanyarray(key)
3419-
rizer = _hash.Factorizer(len(key))
34203418

3421-
if not key.dtype == np.object_:
3422-
key = key.astype('O')
3419+
# we are already a Categorical
3420+
if is_categorical_dtype(key):
3421+
c = key
34233422

3424-
# factorize maps nans to na_sentinel=-1
3425-
ids = rizer.factorize(key, sort=True)
3426-
n = len(rizer.uniques)
3427-
mask = (ids == -1)
3423+
# create the Categorical
3424+
else:
3425+
c = Categorical(key,ordered=True)
3426+
3427+
if na_position not in ['last','first']:
3428+
raise ValueError('invalid na_position: {!r}'.format(na_position))
3429+
3430+
n = len(c.levels)
3431+
codes = c.codes.copy()
3432+
3433+
mask = (c.codes == -1)
34283434
if order: # ascending
34293435
if na_position == 'last':
3430-
ids = np.where(mask, n, ids)
3436+
codes = np.where(mask, n, codes)
34313437
elif na_position == 'first':
3432-
ids += 1
3433-
else:
3434-
raise ValueError('invalid na_position: {!r}'.format(na_position))
3438+
codes += 1
34353439
else: # not order means descending
34363440
if na_position == 'last':
3437-
ids = np.where(mask, n, n-ids-1)
3441+
codes = np.where(mask, n, n-codes-1)
34383442
elif na_position == 'first':
3439-
ids = np.where(mask, 0, n-ids)
3440-
else:
3441-
raise ValueError('invalid na_position: {!r}'.format(na_position))
3443+
codes = np.where(mask, 0, n-codes)
34423444
if mask.any():
34433445
n += 1
3446+
34443447
shape.append(n)
3445-
labels.append(ids)
3448+
labels.append(codes)
3449+
34463450
return _indexer_from_factorized(labels, shape)
34473451

34483452
def _nargsort(items, kind='quicksort', ascending=True, na_position='last'):

pandas/core/internals.py

+11-13
Original file line numberDiff line numberDiff line change
@@ -451,9 +451,9 @@ def to_native_types(self, slicer=None, na_rep='', **kwargs):
451451
values[mask] = na_rep
452452
return values.tolist()
453453

454-
def _validate_merge(self, blocks):
455-
""" validate that we can merge these blocks """
456-
return True
454+
def _concat_blocks(self, blocks, values):
455+
""" return the block concatenation """
456+
return self._holder(values[0])
457457

458458
# block actions ####
459459
def copy(self, deep=True):
@@ -1639,15 +1639,19 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
16391639
ndim=self.ndim,
16401640
placement=self.mgr_locs)
16411641

1642-
def _validate_merge(self, blocks):
1643-
""" validate that we can merge these blocks """
1642+
def _concat_blocks(self, blocks, values):
1643+
"""
1644+
validate that we can merge these blocks
1645+
1646+
return the block concatenation
1647+
"""
16441648

16451649
levels = self.values.levels
16461650
for b in blocks:
16471651
if not levels.equals(b.values.levels):
16481652
raise ValueError("incompatible levels in categorical block merge")
16491653

1650-
return True
1654+
return self._holder(values[0], levels=levels)
16511655

16521656
def to_native_types(self, slicer=None, na_rep='', **kwargs):
16531657
""" convert to our native types format, slicing if desired """
@@ -4026,17 +4030,11 @@ def concatenate_join_units(join_units, concat_axis, copy):
40264030
else:
40274031
concat_values = com._concat_compat(to_concat, axis=concat_axis)
40284032

4029-
# FIXME: optimization potential: if len(join_units) == 1, single join unit
4030-
# is densified and sparsified back.
40314033
if any(unit.needs_block_conversion for unit in join_units):
40324034

40334035
# need to ask the join unit block to convert to the underlying repr for us
40344036
blocks = [ unit.block for unit in join_units if unit.block is not None ]
4035-
4036-
# may need to validate this combination
4037-
blocks[0]._validate_merge(blocks)
4038-
4039-
return blocks[0]._holder(concat_values[0])
4037+
return blocks[0]._concat_blocks(blocks, concat_values)
40404038
else:
40414039
return concat_values
40424040

pandas/tests/test_categorical.py

+57
Original file line numberDiff line numberDiff line change
@@ -983,6 +983,47 @@ def f():
983983
df.sort(columns=["unsort"], ascending=False)
984984
self.assertRaises(TypeError, f)
985985

986+
# multi-columns sort
987+
# GH 7848
988+
df = DataFrame({"id":[6,5,4,3,2,1], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
989+
df["grade"] = pd.Categorical(df["raw_grade"])
990+
df['grade'].cat.reorder_levels(['b', 'e', 'a'])
991+
992+
# sorts 'grade' according to the order of the levels
993+
result = df.sort(columns=['grade'])
994+
expected = df.iloc[[1,2,5,0,3,4]]
995+
tm.assert_frame_equal(result,expected)
996+
997+
# multi
998+
result = df.sort(columns=['grade', 'id'])
999+
expected = df.iloc[[2,1,5,4,3,0]]
1000+
tm.assert_frame_equal(result,expected)
1001+
1002+
# reverse
1003+
cat = Categorical(["a","c","c","b","d"], ordered=True)
1004+
res = cat.order(ascending=False)
1005+
exp_val = np.array(["d","c", "c", "b","a"],dtype=object)
1006+
exp_levels = np.array(["a","b","c","d"],dtype=object)
1007+
self.assert_numpy_array_equal(res.__array__(), exp_val)
1008+
self.assert_numpy_array_equal(res.levels, exp_levels)
1009+
1010+
# some NaN positions
1011+
1012+
cat = Categorical(["a","c","b","d", np.nan], ordered=True)
1013+
res = cat.order(ascending=False, na_position='last')
1014+
exp_val = np.array(["d","c","b","a", np.nan],dtype=object)
1015+
exp_levels = np.array(["a","b","c","d"],dtype=object)
1016+
# FIXME: IndexError: Out of bounds on buffer access (axis 0)
1017+
#self.assert_numpy_array_equal(res.__array__(), exp_val)
1018+
#self.assert_numpy_array_equal(res.levels, exp_levels)
1019+
1020+
cat = Categorical(["a","c","b","d", np.nan], ordered=True)
1021+
res = cat.order(ascending=False, na_position='first')
1022+
exp_val = np.array([np.nan, "d","c","b","a"],dtype=object)
1023+
exp_levels = np.array(["a","b","c","d"],dtype=object)
1024+
# FIXME: IndexError: Out of bounds on buffer access (axis 0)
1025+
#self.assert_numpy_array_equal(res.__array__(), exp_val)
1026+
#self.assert_numpy_array_equal(res.levels, exp_levels)
9861027

9871028
def test_slicing(self):
9881029
cat = Series(Categorical([1,2,3,4]))
@@ -1429,6 +1470,22 @@ def f():
14291470
pd.concat([df,df_wrong_levels])
14301471
self.assertRaises(ValueError, f)
14311472

1473+
# GH 7864
1474+
# make sure ordering is preserved
1475+
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
1476+
df["grade"] = pd.Categorical(df["raw_grade"])
1477+
df['grade'].cat.reorder_levels(['e', 'a', 'b'])
1478+
1479+
df1 = df[0:3]
1480+
df2 = df[3:]
1481+
1482+
self.assert_numpy_array_equal(df['grade'].cat.levels, df1['grade'].cat.levels)
1483+
self.assert_numpy_array_equal(df['grade'].cat.levels, df2['grade'].cat.levels)
1484+
1485+
dfx = pd.concat([df1, df2])
1486+
dfx['grade'].cat.levels
1487+
self.assert_numpy_array_equal(df['grade'].cat.levels, dfx['grade'].cat.levels)
1488+
14321489
def test_append(self):
14331490
cat = pd.Categorical(["a","b"], levels=["a","b"])
14341491
vals = [1,2]

0 commit comments

Comments
 (0)