Skip to content

BUG: fix multi-column sort that includes Categoricals / concat (GH7848/GH7864) #7850

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 29, 2014
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion doc/source/v0.15.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,8 @@ Categoricals in Series/DataFrame
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

:class:`~pandas.Categorical` can now be included in `Series` and `DataFrames` and gained new
methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`, :issue:`7444`, :issue:`7839`).
methods to manipulate. Thanks to Jan Schultz for much of this API/implementation. (:issue:`3943`, :issue:`5313`, :issue:`5314`,
:issue:`7444`, :issue:`7839`, :issue:`7848`, :issue:`7864`).

For full docs, see the :ref:`Categorical introduction <categorical>` and the :ref:`API documentation <api.categorical>`.

Expand Down
52 changes: 31 additions & 21 deletions pandas/core/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
from pandas.core.config import get_option
from pandas.core import format as fmt


def _cat_compare_op(op):
def f(self, other):
if isinstance(other, (Categorical, np.ndarray)):
Expand Down Expand Up @@ -45,16 +44,6 @@ def _maybe_to_categorical(array):
return array


# Utility: turn `values` into integer codes by looking each one up in a
# hash table built from `levels`.
def _get_codes_for_values(values, levels):
from pandas.core.algorithms import _get_data_algo, _hashtables
# When the two dtypes differ, both arrays are converted with
# com._ensure_object before hashing.
if values.dtype != levels.dtype:
values = com._ensure_object(values)
levels = com._ensure_object(levels)
# Only hash_klass is used below; vec_klass and vals are unpacked but unused.
(hash_klass, vec_klass), vals = _get_data_algo(values, _hashtables)
# Table is sized by len(levels) and populated from levels.
t = hash_klass(len(levels))
t.map_locations(levels)
# NOTE(review): what lookup() yields for values absent from levels is not
# visible here -- presumably -1, confirm against the hashtable implementation.
return com._ensure_platform_int(t.lookup(values))

_codes_doc = """The level codes of this categorical.

Level codes are an array of integers which are the positions of the real
Expand Down Expand Up @@ -484,7 +473,7 @@ def argsort(self, ascending=True, **kwargs):
result = result[::-1]
return result

def order(self, inplace=False, ascending=True, **kwargs):
def order(self, inplace=False, ascending=True, na_position='last', **kwargs):
""" Sorts the Category by level value returning a new Categorical by default.

Only ordered Categoricals can be sorted!
Expand All @@ -495,11 +484,11 @@ def order(self, inplace=False, ascending=True, **kwargs):
----------
ascending : boolean, default True
Sort ascending. Passing False sorts descending
inplace : boolean, default False
Do operation in place.
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end
inplace : boolean, default False
Do operation in place.

Returns
-------
Expand All @@ -511,18 +500,22 @@ def order(self, inplace=False, ascending=True, **kwargs):
"""
if not self.ordered:
raise TypeError("Categorical not ordered")
_sorted = np.sort(self._codes.copy())
if na_position not in ['last','first']:
raise ValueError('invalid na_position: {!r}'.format(na_position))

codes = np.sort(self._codes.copy())
if not ascending:
_sorted = _sorted[::-1]
codes = codes[::-1]

if inplace:
self._codes = _sorted
self._codes = codes
return
else:
return Categorical(values=_sorted,levels=self.levels, ordered=self.ordered,
return Categorical(values=codes,levels=self.levels, ordered=self.ordered,
name=self.name, fastpath=True)


def sort(self, inplace=True, ascending=True, **kwargs):
def sort(self, inplace=True, ascending=True, na_position='last', **kwargs):
""" Sorts the Category inplace by level value.

Only ordered Categoricals can be sorted!
Expand All @@ -533,11 +526,11 @@ def sort(self, inplace=True, ascending=True, **kwargs):
----------
ascending : boolean, default True
Sort ascending. Passing False sorts descending
inplace : boolean, default False
Do operation in place.
na_position : {'first', 'last'} (optional, default='last')
'first' puts NaNs at the beginning
'last' puts NaNs at the end
inplace : boolean, default True
Do operation in place.

Returns
-------
Expand Down Expand Up @@ -932,3 +925,20 @@ def describe(self):
result.index.name = 'levels'
result.columns = ['counts','freqs']
return result

##### utility routines #####

def _get_codes_for_values(values, levels):
    """
    utility routine to turn values into codes given the specified levels

    Parameters
    ----------
    values : array-like exposing ``dtype``
        The entries to encode.
    levels : array-like exposing ``dtype``
        The levels the hash table is built from.

    Returns
    -------
    codes : result of looking each value up in the table built from
        ``levels``, passed through ``com._ensure_platform_int``
    """
    from pandas.core.algorithms import _get_data_algo, _hashtables

    # When the dtypes differ, convert both sides to object before hashing
    # (presumably so values and levels hash comparably -- confirm).
    if values.dtype != levels.dtype:
        values = com._ensure_object(values)
        levels = com._ensure_object(levels)

    # Only the hash-table class is needed; the other returned pieces are
    # deliberately discarded.
    (hash_klass, _), _ = _get_data_algo(values, _hashtables)
    table = hash_klass(len(levels))
    table.map_locations(levels)
    return com._ensure_platform_int(table.lookup(values))

38 changes: 21 additions & 17 deletions pandas/core/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -3415,34 +3415,38 @@ def _lexsort_indexer(keys, orders=None, na_position='last'):
orders = [True] * len(keys)

for key, order in zip(keys, orders):
key = np.asanyarray(key)
rizer = _hash.Factorizer(len(key))

if not key.dtype == np.object_:
key = key.astype('O')
# we are already a Categorical
if is_categorical_dtype(key):
c = key

# factorize maps nans to na_sentinel=-1
ids = rizer.factorize(key, sort=True)
n = len(rizer.uniques)
mask = (ids == -1)
# create the Categorical
else:
c = Categorical(key,ordered=True)

if na_position not in ['last','first']:
raise ValueError('invalid na_position: {!r}'.format(na_position))

n = len(c.levels)
codes = c.codes.copy()

mask = (c.codes == -1)
if order: # ascending
if na_position == 'last':
ids = np.where(mask, n, ids)
codes = np.where(mask, n, codes)
elif na_position == 'first':
ids += 1
else:
raise ValueError('invalid na_position: {!r}'.format(na_position))
codes += 1
else: # not order means descending
if na_position == 'last':
ids = np.where(mask, n, n-ids-1)
codes = np.where(mask, n, n-codes-1)
elif na_position == 'first':
ids = np.where(mask, 0, n-ids)
else:
raise ValueError('invalid na_position: {!r}'.format(na_position))
codes = np.where(mask, 0, n-codes)
if mask.any():
n += 1

shape.append(n)
labels.append(ids)
labels.append(codes)

return _indexer_from_factorized(labels, shape)

def _nargsort(items, kind='quicksort', ascending=True, na_position='last'):
Expand Down
24 changes: 11 additions & 13 deletions pandas/core/internals.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,9 +451,9 @@ def to_native_types(self, slicer=None, na_rep='', **kwargs):
values[mask] = na_rep
return values.tolist()

def _validate_merge(self, blocks):
""" validate that we can merge these blocks """
return True
def _concat_blocks(self, blocks, values):
""" return the block concatenation """
return self._holder(values[0])

# block actions ####
def copy(self, deep=True):
Expand Down Expand Up @@ -1639,15 +1639,19 @@ def _astype(self, dtype, copy=False, raise_on_error=True, values=None,
ndim=self.ndim,
placement=self.mgr_locs)

def _validate_merge(self, blocks):
""" validate that we can merge these blocks """
def _concat_blocks(self, blocks, values):
"""
validate that we can merge these blocks

return the block concatenation
"""

levels = self.values.levels
for b in blocks:
if not levels.equals(b.values.levels):
raise ValueError("incompatible levels in categorical block merge")

return True
return self._holder(values[0], levels=levels)

def to_native_types(self, slicer=None, na_rep='', **kwargs):
""" convert to our native types format, slicing if desired """
Expand Down Expand Up @@ -4026,17 +4030,11 @@ def concatenate_join_units(join_units, concat_axis, copy):
else:
concat_values = com._concat_compat(to_concat, axis=concat_axis)

# FIXME: optimization potential: if len(join_units) == 1, single join unit
# is densified and sparsified back.
if any(unit.needs_block_conversion for unit in join_units):

# need to ask the join unit block to convert to the underlying repr for us
blocks = [ unit.block for unit in join_units if unit.block is not None ]

# may need to validate this combination
blocks[0]._validate_merge(blocks)

return blocks[0]._holder(concat_values[0])
return blocks[0]._concat_blocks(blocks, concat_values)
else:
return concat_values

Expand Down
57 changes: 57 additions & 0 deletions pandas/tests/test_categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -983,6 +983,47 @@ def f():
df.sort(columns=["unsort"], ascending=False)
self.assertRaises(TypeError, f)

# multi-columns sort
# GH 7848
df = DataFrame({"id":[6,5,4,3,2,1], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
df["grade"] = pd.Categorical(df["raw_grade"])
df['grade'].cat.reorder_levels(['b', 'e', 'a'])

# sorts 'grade' according to the order of the levels
result = df.sort(columns=['grade'])
expected = df.iloc[[1,2,5,0,3,4]]
tm.assert_frame_equal(result,expected)

# multi
result = df.sort(columns=['grade', 'id'])
expected = df.iloc[[2,1,5,4,3,0]]
tm.assert_frame_equal(result,expected)

# reverse
cat = Categorical(["a","c","c","b","d"], ordered=True)
res = cat.order(ascending=False)
exp_val = np.array(["d","c", "c", "b","a"],dtype=object)
exp_levels = np.array(["a","b","c","d"],dtype=object)
self.assert_numpy_array_equal(res.__array__(), exp_val)
self.assert_numpy_array_equal(res.levels, exp_levels)

# some NaN positions

cat = Categorical(["a","c","b","d", np.nan], ordered=True)
res = cat.order(ascending=False, na_position='last')
exp_val = np.array(["d","c","b","a", np.nan],dtype=object)
exp_levels = np.array(["a","b","c","d"],dtype=object)
# FIXME: IndexError: Out of bounds on buffer access (axis 0)
#self.assert_numpy_array_equal(res.__array__(), exp_val)
#self.assert_numpy_array_equal(res.levels, exp_levels)

cat = Categorical(["a","c","b","d", np.nan], ordered=True)
res = cat.order(ascending=False, na_position='first')
exp_val = np.array([np.nan, "d","c","b","a"],dtype=object)
exp_levels = np.array(["a","b","c","d"],dtype=object)
# FIXME: IndexError: Out of bounds on buffer access (axis 0)
#self.assert_numpy_array_equal(res.__array__(), exp_val)
#self.assert_numpy_array_equal(res.levels, exp_levels)

def test_slicing(self):
cat = Series(Categorical([1,2,3,4]))
Expand Down Expand Up @@ -1429,6 +1470,22 @@ def f():
pd.concat([df,df_wrong_levels])
self.assertRaises(ValueError, f)

# GH 7864
# make sure ordering is preserved
df = pd.DataFrame({"id":[1,2,3,4,5,6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']})
df["grade"] = pd.Categorical(df["raw_grade"])
df['grade'].cat.reorder_levels(['e', 'a', 'b'])

df1 = df[0:3]
df2 = df[3:]

self.assert_numpy_array_equal(df['grade'].cat.levels, df1['grade'].cat.levels)
self.assert_numpy_array_equal(df['grade'].cat.levels, df2['grade'].cat.levels)

dfx = pd.concat([df1, df2])
dfx['grade'].cat.levels
self.assert_numpy_array_equal(df['grade'].cat.levels, dfx['grade'].cat.levels)

def test_append(self):
cat = pd.Categorical(["a","b"], levels=["a","b"])
vals = [1,2]
Expand Down