From 139209aee25b7c9f5e68fcd751403ae8ac713aff Mon Sep 17 00:00:00 2001 From: Jan Schulz Date: Fri, 23 Jan 2015 22:45:50 +0100 Subject: [PATCH] Categorical: don't sort the categoricals if Categorical(..., ordered=False) In https://github.com/mwaskom/seaborn/issues/361 it was discussed that lexicographically sorting the categories is only appropriate if an order is specified/implied. If this is explicitly not done, e.g. with `Categorical(..., ordered=False)` then the order should be taken from the order of appearance, similar to the current `Series.unique()` implementation. --- pandas/core/categorical.py | 2 +- pandas/tests/test_categorical.py | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/pandas/core/categorical.py b/pandas/core/categorical.py index fe8b1079f0942..8d7d8e2dbb947 100644 --- a/pandas/core/categorical.py +++ b/pandas/core/categorical.py @@ -268,7 +268,7 @@ def __init__(self, values, categories=None, ordered=None, name=None, fastpath=Fa if categories is None: try: - codes, categories = factorize(values, sort=True) + codes, categories = factorize(values, sort=ordered if not ordered is None else True) # If the underlying data structure was sortable, and the user doesn't want to # "forget" this order, the categorical also is sorted/ordered if ordered is None: diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index 4852e142d2f29..9e61fa5c27cb8 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -169,6 +169,20 @@ def f(): c_old2 = Categorical([0, 1, 2, 0, 1, 2], [1, 2, 3]) cat = Categorical([1,2], categories=[1,2,3]) + # if the categorical is constructed without ordering, use the "order of appearance" in + # the categories instead of sorting them lexicographically. 
+ # see https://github.com/mwaskom/seaborn/issues/361 for a discussion on this topic + c1 = Categorical(["a", "c", "b", "a"], ordered=False) + self.assert_numpy_array_equal(c1.categories, np.array(["a","c","b"])) + # make sure that construction with (implicit) ordered=True sorts the categories + c2 = Categorical(["a", "c", "b", "a"]) + self.assert_numpy_array_equal(c2.categories, np.array(["a","b","c"])) + c2 = Categorical(["a", "c", "b", "a"], ordered=True) + self.assert_numpy_array_equal(c2.categories, np.array(["a","b","c"])) + # ensure that the order in the categories is preserved when setting ordered=False + c2.ordered = False + self.assert_numpy_array_equal(c2.categories, np.array(["a","b","c"])) + def test_constructor_with_generator(self): # This was raising an Error in isnull(single_val).any() because isnull returned a scalar # for a generator