From 6de5608c7c2772b8ce85032d5d07d3432fb2fef7 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 19:56:21 +0000 Subject: [PATCH 01/14] ENH: Categorical.unique can keep same dtype --- doc/source/whatsnew/v1.2.0.rst | 1 + pandas/core/groupby/categorical.py | 5 ++ .../arrays/categorical/test_analytics.py | 79 +++++-------------- 3 files changed, 25 insertions(+), 60 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 6dd011c588702..7d801fa3c07b0 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -585,6 +585,7 @@ Categorical - :meth:`Categorical.fillna` will always return a copy, validate a passed fill value regardless of whether there are any NAs to fill, and disallow an ``NaT`` as a fill value for numeric categories (:issue:`36530`) - Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`) - Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`) +- Bug in :meth:`Categorical.unique` where dtype was changed, it there were unused categories (:issue:`xxxxx`). 
- Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`) - Bug in :meth:`Categorical.to_numpy` and ``np.array(categorical)`` with tz-aware ``datetime64`` categories incorrectly dropping the time zone information instead of casting to object dtype (:issue:`38136`) diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 6de8c1d789097..8c740fe0e5a89 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -75,6 +75,11 @@ def recode_for_groupby( # sort=False should order groups in as-encountered order (GH-8868) cat = c.unique() + # exclude nan from indexer for categories + take_codes = cat.codes[cat.codes != -1] + if cat.ordered: + take_codes = np.sort(take_codes) + cat = cat.set_categories(cat.categories.take(take_codes)) # But for groupby to work, all categories should be present, # including those missing from the data (GH-13179), which .unique() diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 6899d821f80ad..f4108df52d1a0 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -6,13 +6,7 @@ from pandas.compat import PYPY -from pandas import ( - Categorical, - Index, - NaT, - Series, - date_range, -) +from pandas import Categorical, CategoricalDtype, Index, NaT, Series, date_range import pandas._testing as tm from pandas.api.types import is_scalar @@ -196,84 +190,49 @@ def test_searchsorted(self, ordered): with pytest.raises(KeyError, match="cucumber"): ser.searchsorted(["bread", "cucumber"]) - def test_unique(self): + def test_unique(self, ordered): + # GHXXXXX + dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered) + # categories are reordered based on value when ordered=False - cat = Categorical(["a", "b"]) - exp = Index(["a", "b"]) + cat = Categorical(["a", "b", 
"c"], dtype=dtype) res = cat.unique() - tm.assert_index_equal(res.categories, exp) tm.assert_categorical_equal(res, cat) - cat = Categorical(["a", "b", "a", "a"], categories=["a", "b", "c"]) + cat = Categorical(["a", "b", "a", "a"], dtype=dtype) res = cat.unique() - tm.assert_index_equal(res.categories, exp) - tm.assert_categorical_equal(res, Categorical(exp)) + tm.assert_categorical_equal(res, Categorical(["a", "b"], dtype=dtype)) - cat = Categorical(["c", "a", "b", "a", "a"], categories=["a", "b", "c"]) - exp = Index(["c", "a", "b"]) + cat = Categorical(["c", "a", "b", "a", "a"], dtype=dtype) res = cat.unique() - tm.assert_index_equal(res.categories, exp) - exp_cat = Categorical(exp, categories=["c", "a", "b"]) + exp_cat = Categorical(["c", "a", "b"], dtype=dtype) tm.assert_categorical_equal(res, exp_cat) # nan must be removed - cat = Categorical(["b", np.nan, "b", np.nan, "a"], categories=["a", "b", "c"]) - res = cat.unique() - exp = Index(["b", "a"]) - tm.assert_index_equal(res.categories, exp) - exp_cat = Categorical(["b", np.nan, "a"], categories=["b", "a"]) - tm.assert_categorical_equal(res, exp_cat) - - def test_unique_ordered(self): - # keep categories order when ordered=True - cat = Categorical(["b", "a", "b"], categories=["a", "b"], ordered=True) + cat = Categorical(["b", np.nan, "b", np.nan, "a"], dtype=dtype) res = cat.unique() - exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) + exp_cat = Categorical(["b", np.nan, "a"], dtype=dtype) tm.assert_categorical_equal(res, exp_cat) - cat = Categorical( - ["c", "b", "a", "a"], categories=["a", "b", "c"], ordered=True - ) - res = cat.unique() - exp_cat = Categorical(["c", "b", "a"], categories=["a", "b", "c"], ordered=True) - tm.assert_categorical_equal(res, exp_cat) - - cat = Categorical(["b", "a", "a"], categories=["a", "b", "c"], ordered=True) - res = cat.unique() - exp_cat = Categorical(["b", "a"], categories=["a", "b"], ordered=True) - tm.assert_categorical_equal(res, exp_cat) + def 
test_unique_index_series(self, ordered): + # GHXXXXX + dtype = CategoricalDtype([3, 2, 1], ordered=ordered) - cat = Categorical( - ["b", "b", np.nan, "a"], categories=["a", "b", "c"], ordered=True - ) - res = cat.unique() - exp_cat = Categorical(["b", np.nan, "a"], categories=["a", "b"], ordered=True) - tm.assert_categorical_equal(res, exp_cat) - - def test_unique_index_series(self): - c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1]) + c = Categorical([3, 1, 2, 2, 1], dtype=dtype) # Categorical.unique sorts categories by appearance order # if ordered=False - exp = Categorical([3, 1, 2], categories=[3, 1, 2]) + exp = Categorical([3, 1, 2], dtype=dtype) tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) tm.assert_categorical_equal(Series(c).unique(), exp) - c = Categorical([1, 1, 2, 2], categories=[3, 2, 1]) - exp = Categorical([1, 2], categories=[1, 2]) + c = Categorical([1, 1, 2, 2], dtype=dtype) + exp = Categorical([1, 2], dtype=dtype) tm.assert_categorical_equal(c.unique(), exp) tm.assert_index_equal(Index(c).unique(), Index(exp)) tm.assert_categorical_equal(Series(c).unique(), exp) - c = Categorical([3, 1, 2, 2, 1], categories=[3, 2, 1], ordered=True) - # Categorical.unique keeps categories order if ordered=True - exp = Categorical([3, 1, 2], categories=[3, 2, 1], ordered=True) - tm.assert_categorical_equal(c.unique(), exp) - - tm.assert_index_equal(Index(c).unique(), Index(exp)) - tm.assert_categorical_equal(Series(c).unique(), exp) - def test_shift(self): # GH 9416 cat = Categorical(["a", "b", "c", "d", "a"]) From b0aed5c551b2f9c7f2b71d29b4a2f40ac18b451d Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 20:10:05 +0000 Subject: [PATCH 02/14] fixes --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/tests/arrays/categorical/test_analytics.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 
7d801fa3c07b0..ac57dcebcf494 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -585,7 +585,7 @@ Categorical - :meth:`Categorical.fillna` will always return a copy, validate a passed fill value regardless of whether there are any NAs to fill, and disallow an ``NaT`` as a fill value for numeric categories (:issue:`36530`) - Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`) - Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`) -- Bug in :meth:`Categorical.unique` where dtype was changed, it there were unused categories (:issue:`xxxxx`). +- Bug in :meth:`Categorical.unique` where the dtype changes in the unique array if there are unused categories in the original array (:issue:`38140`). - Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`) - Bug in :meth:`Categorical.to_numpy` and ``np.array(categorical)`` with tz-aware ``datetime64`` categories incorrectly dropping the time zone information instead of casting to object dtype (:issue:`38136`) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index f4108df52d1a0..7fd15aa02b40c 100644 --- a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -191,7 +191,7 @@ def test_searchsorted(self, ordered): ser.searchsorted(["bread", "cucumber"]) def test_unique(self, ordered): - # GHXXXXX + # GH38140 dtype = CategoricalDtype(["a", "b", "c"], ordered=ordered) # categories are reordered based on value when ordered=False @@ -215,7 +215,7 @@ def test_unique(self, ordered): tm.assert_categorical_equal(res, exp_cat) def test_unique_index_series(self, ordered): - # GHXXXXX + # GH38140 dtype = CategoricalDtype([3, 2, 1], ordered=ordered) c = Categorical([3, 1, 2, 2, 1], 
dtype=dtype) From 9135f458658c61ada499f7d671a7968e6cd67c31 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 20:35:05 +0000 Subject: [PATCH 03/14] fix doc string --- pandas/core/arrays/categorical.py | 29 ++++++++--------------------- 1 file changed, 8 insertions(+), 21 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f2b5ad447a0cf..84c22298a8a9a 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2127,16 +2127,15 @@ def mode(self, dropna=True): def unique(self): """ Return the ``Categorical`` which ``categories`` and ``codes`` are - unique. Unused categories are NOT returned. + unique. - - unordered category: values and categories are sorted by appearance - order. - - ordered category: values are sorted by appearance order, categories - keeps existing order. + .. versionchanged:: 1.2.0 + + Previously unused categories were dropped. Returns ------- - unique values : ``Categorical`` + Categorical See Also -------- @@ -2146,23 +2145,11 @@ def unique(self): Examples -------- - An unordered Categorical will return categories in the - order of appearance. - >>> pd.Categorical(list("baabc")).unique() ['b', 'a', 'c'] - Categories (3, object): ['b', 'a', 'c'] - - >>> pd.Categorical(list("baabc"), categories=list("abc")).unique() - ['b', 'a', 'c'] - Categories (3, object): ['b', 'a', 'c'] - - An ordered Categorical preserves the category ordering. - - >>> pd.Categorical( - ... list("baabc"), categories=list("abc"), ordered=True - ... 
).unique() - ['b', 'a', 'c'] + Categories (3, object): ['a', 'b', 'c'] + >>> pd.Categorical(list("baab"), categories=list("abc"), ordered=True).unique() + ['b', 'a'] Categories (3, object): ['a' < 'b' < 'c'] """ # unlike np.unique, unique1d does not sort From 8fcf4e1d5070214ff2d2baf7ad042e3d24ae57d8 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 21:18:23 +0000 Subject: [PATCH 04/14] fix doc strings --- doc/source/whatsnew/v1.2.0.rst | 2 +- pandas/core/arrays/categorical.py | 2 +- pandas/core/groupby/categorical.py | 2 ++ pandas/core/series.py | 9 +++------ 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index ac57dcebcf494..934f0413f6eb6 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -585,7 +585,7 @@ Categorical - :meth:`Categorical.fillna` will always return a copy, validate a passed fill value regardless of whether there are any NAs to fill, and disallow an ``NaT`` as a fill value for numeric categories (:issue:`36530`) - Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`) - Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`) -- Bug in :meth:`Categorical.unique` where the dtype changes in the unique array if there are unused categories in the original array (:issue:`38140`). +- Bug in :meth:`Categorical.unique`, where the dtype changed in the unique array if there were unused categories in the original array (:issue:`38140`). 
- Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`) - Bug in :meth:`Categorical.to_numpy` and ``np.array(categorical)`` with tz-aware ``datetime64`` categories incorrectly dropping the time zone information instead of casting to object dtype (:issue:`38136`) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 84c22298a8a9a..f6067714c960b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2131,7 +2131,7 @@ def unique(self): .. versionchanged:: 1.2.0 - Previously unused categories were dropped. + Previously, unused categories were dropped from the new categories. Returns ------- diff --git a/pandas/core/groupby/categorical.py b/pandas/core/groupby/categorical.py index 8c740fe0e5a89..297681f1e10f5 100644 --- a/pandas/core/groupby/categorical.py +++ b/pandas/core/groupby/categorical.py @@ -75,6 +75,8 @@ def recode_for_groupby( # sort=False should order groups in as-encountered order (GH-8868) cat = c.unique() + + # See GH-38140 for block below # exclude nan from indexer for categories take_codes = cat.codes[cat.codes != -1] if cat.ordered: diff --git a/pandas/core/series.py b/pandas/core/series.py index 5c605a6b441c6..33e3bfb6ee3aa 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1993,15 +1993,12 @@ def unique(self) -> ArrayLike: ['2016-01-01 00:00:00-05:00'] Length: 1, dtype: datetime64[ns, US/Eastern] - An unordered Categorical will return categories in the order of - appearance. + A Categorical will return categories in the order of + appearance and with the same dtype. >>> pd.Series(pd.Categorical(list('baabc'))).unique() ['b', 'a', 'c'] - Categories (3, object): ['b', 'a', 'c'] - - An ordered Categorical preserves the category ordering. - + Categories (3, object): ['a', 'b', 'c'] >>> pd.Series(pd.Categorical(list('baabc'), categories=list('abc'), ... 
ordered=True)).unique() ['b', 'a', 'c'] From 356267b74886b0407da482f678ecd97fd8f9b4fb Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 28 Nov 2020 23:49:51 +0000 Subject: [PATCH 05/14] fix categorical tests --- pandas/tests/base/test_unique.py | 2 -- .../indexes/categorical/test_category.py | 19 ++++++++++--------- pandas/tests/test_algos.py | 4 ++-- 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py index 4aefa4be176fb..26e785a2796b1 100644 --- a/pandas/tests/base/test_unique.py +++ b/pandas/tests/base/test_unique.py @@ -67,8 +67,6 @@ def test_unique_null(null_obj, index_or_series_obj): if is_datetime64tz_dtype(obj.dtype): result = result.normalize() expected = expected.normalize() - elif isinstance(obj, pd.CategoricalIndex): - expected = expected.set_categories(unique_values_not_null) tm.assert_index_equal(result, expected) else: expected = np.array(unique_values, dtype=obj.dtype) diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index d3c9b02b3ba23..bd2382b062aae 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -4,7 +4,7 @@ from pandas._libs import index as libindex import pandas as pd -from pandas import Categorical +from pandas import Categorical, CategoricalDtype import pandas._testing as tm from pandas.core.indexes.api import ( CategoricalIndex, @@ -186,18 +186,19 @@ def test_drop_duplicates(self, data, categories, expected): tm.assert_index_equal(result, e) @pytest.mark.parametrize( - "data, categories, expected_data, expected_categories", + "data, categories, expected_data", [ - ([1, 1, 1], [1, 2, 3], [1], [1]), - ([1, 1, 1], list("abc"), [np.nan], []), - ([1, 2, "a"], [1, 2, 3], [1, 2, np.nan], [1, 2]), - ([2, "a", "b"], list("abc"), [np.nan, "a", "b"], ["a", "b"]), + ([1, 1, 1], [1, 2, 3], [1]), + ([1, 1, 1], list("abc"), [np.nan]), + ([1, 2, 
"a"], [1, 2, 3], [1, 2, np.nan]), + ([2, "a", "b"], list("abc"), [np.nan, "a", "b"]), ], ) - def test_unique(self, data, categories, expected_data, expected_categories): + def test_unique(self, data, categories, expected_data, ordered): + dtype = CategoricalDtype(categories, ordered=ordered) - idx = CategoricalIndex(data, categories=categories) - expected = CategoricalIndex(expected_data, categories=expected_categories) + idx = CategoricalIndex(data, dtype=dtype) + expected = CategoricalIndex(expected_data, dtype=dtype) tm.assert_index_equal(idx.unique(), expected) def test_repr_roundtrip(self): diff --git a/pandas/tests/test_algos.py b/pandas/tests/test_algos.py index 127baae6e9352..c9d034361d8c4 100644 --- a/pandas/tests/test_algos.py +++ b/pandas/tests/test_algos.py @@ -602,7 +602,7 @@ def test_categorical(self): # we are expecting to return in the order # of appearance - expected = Categorical(list("bac"), categories=list("bac")) + expected = Categorical(list("bac")) # we are expecting to return in the order # of the categories @@ -632,7 +632,7 @@ def test_categorical(self): tm.assert_categorical_equal(result, expected) # CI -> return CI - ci = CategoricalIndex(Categorical(list("baabc"), categories=list("bac"))) + ci = CategoricalIndex(Categorical(list("baabc"), categories=list("abc"))) expected = CategoricalIndex(expected) result = ci.unique() tm.assert_index_equal(result, expected) From 1c8f4f9e83f72c7b0a196f152c8b84884755634b Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 29 Nov 2020 00:52:52 +0000 Subject: [PATCH 06/14] fix test failure --- pandas/tests/extension/base/methods.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 3ea5c34201b5c..589b4e3b71db5 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -40,10 +40,10 @@ def test_value_counts_with_normalize(self, data): # GH 33172 data = 
data[:10].unique() values = np.array(data[~data.isna()]) + ser = pd.Series(data, dtype=data.dtype) - result = ( - pd.Series(data, dtype=data.dtype).value_counts(normalize=True).sort_index() - ) + result = ser.value_counts(normalize=True).sort_index() + result = result[result > 0] expected = pd.Series([1 / len(values)] * len(values), index=result.index) self.assert_series_equal(result, expected) From f31837c9d096cc980167797829dc171e119cb182 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 29 Nov 2020 08:28:18 +0000 Subject: [PATCH 07/14] fix value_count test --- pandas/tests/extension/base/methods.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 589b4e3b71db5..856bd1ac13706 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -38,12 +38,10 @@ def test_value_counts(self, all_data, dropna): def test_value_counts_with_normalize(self, data): # GH 33172 - data = data[:10].unique() + data = data[:10].unique().remove_unused_categories() values = np.array(data[~data.isna()]) - ser = pd.Series(data, dtype=data.dtype) - result = ser.value_counts(normalize=True).sort_index() - result = result[result > 0] + result = pd.Series(data).value_counts(normalize=True).sort_index() expected = pd.Series([1 / len(values)] * len(values), index=result.index) self.assert_series_equal(result, expected) From e261f3c916cbab5470b80739e12d209bf0e98158 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 29 Nov 2020 08:58:56 +0000 Subject: [PATCH 08/14] values_count fix --- pandas/tests/extension/base/methods.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 856bd1ac13706..ca9c2acb9fd12 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -38,12 +38,18 @@ def test_value_counts(self, 
all_data, dropna): def test_value_counts_with_normalize(self, data): # GH 33172 - data = data[:10].unique().remove_unused_categories() + data = data[:10].unique() values = np.array(data[~data.isna()]) + ser = pd.Series(data, dtype=data.dtype) - result = pd.Series(data).value_counts(normalize=True).sort_index() + result = ser.value_counts(normalize=True).sort_index() + + if not isinstance(data, pd.Categorical): + expected = pd.Series([1 / len(values)] * len(values), index=result.index) + else: + expected = pd.Series(0.0, index=result.index) + expected[result > 0] = 1 / len(values) - expected = pd.Series([1 / len(values)] * len(values), index=result.index) self.assert_series_equal(result, expected) def test_count(self, data_missing): From a9859b6cf229bfef03bdc03d794555e4b16bd996 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 12 Dec 2020 10:32:58 +0000 Subject: [PATCH 09/14] update --- doc/source/whatsnew/v1.2.0.rst | 1 - doc/source/whatsnew/v1.3.0.rst | 31 +++++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 934f0413f6eb6..6dd011c588702 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -585,7 +585,6 @@ Categorical - :meth:`Categorical.fillna` will always return a copy, validate a passed fill value regardless of whether there are any NAs to fill, and disallow an ``NaT`` as a fill value for numeric categories (:issue:`36530`) - Bug in :meth:`Categorical.__setitem__` that incorrectly raised when trying to set a tuple value (:issue:`20439`) - Bug in :meth:`CategoricalIndex.equals` incorrectly casting non-category entries to ``np.nan`` (:issue:`37667`) -- Bug in :meth:`Categorical.unique`, where the dtype changed in the unique array if there were unused categories in the original array (:issue:`38140`). 
- Bug in :meth:`CategoricalIndex.where` incorrectly setting non-category entries to ``np.nan`` instead of raising ``TypeError`` (:issue:`37977`) - Bug in :meth:`Categorical.to_numpy` and ``np.array(categorical)`` with tz-aware ``datetime64`` categories incorrectly dropping the time zone information instead of casting to object dtype (:issue:`38136`) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 2b0b62ab7facf..dbe008015d9b5 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -230,6 +230,37 @@ Notable bug fixes These are bug fixes that might have notable behavior changes. +``Categorical.unique`` now always maintains same dtype as original +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Previously, when calling :meth:`Categorical.unique`, unused categories in the new array +would be removed, meaning that the dtype of the new array would be different than the +original, if some categories are not present in the unique array: + +As an example of this, given: + +.. ipython:: python + + dtype = pd.CategoricalDtype(['bad', 'neutral', 'good'], ordered=True) + original = pd.Categorical(['good','good', 'bad', 'bad'], dtype=dtype) + unique = original.unique() + +*pandas < 1.2.0*: + +.. code-block:: ipython + + In [1]: unique + ['good', 'bad'] + Categories (2, object): ['bad' < 'good'] + In [2]: original.dtype == unique.dtype + False + +*pandas >= 1.2.0* + +.. 
ipython:: python + + unique + original.dtype == unique.dtype Preserve dtypes in :meth:`~pandas.DataFrame.combine_first` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From 9e29a11aaa5ef656413761fcbea2c9bdb0c23685 Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 12 Dec 2020 11:28:14 +0000 Subject: [PATCH 10/14] fixes --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index dbe008015d9b5..385b0577a86d9 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -242,7 +242,7 @@ As an example of this, given: .. ipython:: python dtype = pd.CategoricalDtype(['bad', 'neutral', 'good'], ordered=True) - original = pd.Categorical(['good','good', 'bad', 'bad'], dtype=dtype) + original = pd.Categorical(['good', 'good', 'bad', 'bad'], dtype=dtype) unique = original.unique() *pandas < 1.2.0*: From 5ed054cf1510227da741a1bc01701fea43e678da Mon Sep 17 00:00:00 2001 From: tp Date: Wed, 16 Dec 2020 06:59:59 +0000 Subject: [PATCH 11/14] Use series in whatsnew example --- doc/source/whatsnew/v1.3.0.rst | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 385b0577a86d9..b07730aaf2293 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -233,16 +233,17 @@ These are bug fixes that might have notable behavior changes. 
``Categorical.unique`` now always maintains same dtype as original ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -Previously, when calling :meth:`Categorical.unique`, unused categories in the new array +Previously, when calling :meth:`~Categorical.unique` with categorical data, unused categories in the new array would be removed, meaning that the dtype of the new array would be different than the -original, if some categories are not present in the unique array: +original, if some categories are not present in the unique array (:issue:`18291`) As an example of this, given: .. ipython:: python dtype = pd.CategoricalDtype(['bad', 'neutral', 'good'], ordered=True) - original = pd.Categorical(['good', 'good', 'bad', 'bad'], dtype=dtype) + cat = pd.Categorical(['good', 'good', 'bad', 'bad'], dtype=dtype) + original = pd.Series(cat) unique = original.unique() *pandas < 1.2.0*: From f68a38b40d72c71ab4c40ec7dd96640e7e5648ac Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 22 Dec 2020 11:15:21 +0000 Subject: [PATCH 12/14] Update version in docs to v1.3.0 --- doc/source/whatsnew/v1.3.0.rst | 4 ++-- pandas/core/arrays/categorical.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index b07730aaf2293..6631a175ecb72 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -246,7 +246,7 @@ As an example of this, given: original = pd.Series(cat) unique = original.unique() -*pandas < 1.2.0*: +*pandas < 1.3.0*: .. code-block:: ipython @@ -256,7 +256,7 @@ As an example of this, given: In [2]: original.dtype == unique.dtype False -*pandas >= 1.2.0* +*pandas >= 1.3.0* .. 
ipython:: python diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f6067714c960b..f97a6390e7f8e 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2129,7 +2129,7 @@ def unique(self): Return the ``Categorical`` which ``categories`` and ``codes`` are unique. - .. versionchanged:: 1.2.0 + .. versionchanged:: 1.3.0 Previously, unused categories were dropped from the new categories. From a5e5096a18e1886ac38960441e4d3563e455322c Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 4 Mar 2021 18:00:46 +0000 Subject: [PATCH 13/14] diff from rebase --- pandas/core/arrays/categorical.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f97a6390e7f8e..ba36e4a630e1f 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -2152,18 +2152,8 @@ def unique(self): ['b', 'a'] Categories (3, object): ['a' < 'b' < 'c'] """ - # unlike np.unique, unique1d does not sort unique_codes = unique1d(self.codes) - cat = self.copy() - - # keep nan in codes - cat._ndarray = unique_codes - - # exclude nan from indexer for categories - take_codes = unique_codes[unique_codes != -1] - if self.ordered: - take_codes = np.sort(take_codes) - return cat.set_categories(cat.categories.take(take_codes)) + return self._from_backing_data(unique_codes) def _values_for_factorize(self): return self._ndarray, -1 From 0616c202f3ccd9407fea32d625689fec02f4b6f9 Mon Sep 17 00:00:00 2001 From: tp Date: Thu, 4 Mar 2021 18:24:21 +0000 Subject: [PATCH 14/14] isort cleanup --- pandas/tests/arrays/categorical/test_analytics.py | 9 ++++++++- pandas/tests/indexes/categorical/test_category.py | 5 ++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/pandas/tests/arrays/categorical/test_analytics.py b/pandas/tests/arrays/categorical/test_analytics.py index 7fd15aa02b40c..56d474497a166 100644 --- 
a/pandas/tests/arrays/categorical/test_analytics.py +++ b/pandas/tests/arrays/categorical/test_analytics.py @@ -6,7 +6,14 @@ from pandas.compat import PYPY -from pandas import Categorical, CategoricalDtype, Index, NaT, Series, date_range +from pandas import ( + Categorical, + CategoricalDtype, + Index, + NaT, + Series, + date_range, +) import pandas._testing as tm from pandas.api.types import is_scalar diff --git a/pandas/tests/indexes/categorical/test_category.py b/pandas/tests/indexes/categorical/test_category.py index bd2382b062aae..678344f5b6909 100644 --- a/pandas/tests/indexes/categorical/test_category.py +++ b/pandas/tests/indexes/categorical/test_category.py @@ -4,7 +4,10 @@ from pandas._libs import index as libindex import pandas as pd -from pandas import Categorical, CategoricalDtype +from pandas import ( + Categorical, + CategoricalDtype, +) import pandas._testing as tm from pandas.core.indexes.api import ( CategoricalIndex,