From 93bb4294c85e25f6052577d99bf2db5676249d77 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Oct 2017 10:15:00 +0200 Subject: [PATCH 1/7] BUG: Categorical(Index) passed as categories --- pandas/core/dtypes/dtypes.py | 14 +++++++++----- pandas/tests/dtypes/test_dtypes.py | 7 +++++++ pandas/tests/test_categorical.py | 24 ++++++++++++++++++++++++ 3 files changed, 40 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 2fdbad93fa63b..2da8a5a860963 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -174,12 +174,12 @@ def _finalize(self, categories, ordered, fastpath=False): if ordered is None: ordered = False + else: + self._validate_ordered(ordered) if categories is not None: - categories = Index(categories, tupleize_cols=False) - # validation - self._validate_categories(categories) - self._validate_ordered(ordered) + categories = self._validate_categories(categories) + self._categories = categories self._ordered = ordered @@ -316,7 +316,11 @@ def _validate_categories(categories, fastpath=False): from pandas import Index if not isinstance(categories, ABCIndexClass): - categories = Index(categories) + categories = Index(categories, tupleize_cols=False) + + if hasattr(categories, 'categories'): + # I am a CategoricalIndex + categories = categories.categories if not fastpath: diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index 0b9e2c9fe5ffc..dd78c4995dcd4 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -657,3 +657,10 @@ def test_str_vs_repr(self): # Py2 will have unicode prefixes pat = r"CategoricalDtype\(categories=\[.*\], ordered=False\)" assert re.match(pat, repr(c1)) + + def test_categorical_categories(self): + # GH17884 + c1 = CategoricalDtype(pd.Categorical(['a', 'b'])) + tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) + c1 = CategoricalDtype(pd.CategoricalIndex(['a', 'b'])) + tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index e1d0b756fed1c..bedadacbc4f6c 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -519,6 +519,18 @@ def test_contructor_from_categorical_string(self): result = Categorical(values, categories=['a', 'b', 'c'], ordered=True) tm.assert_categorical_equal(result, expected) + def test_constructor_with_categorical_categories(self): + # GH17884 + expected = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + + result = pd.Categorical( + ['a', 'b'], categories=pd.Categorical(['a', 'b', 'c'])) + tm.assert_categorical_equal(result, expected) + + result = pd.Categorical( + ['a', 'b'], categories=pd.CategoricalIndex(['a', 'b', 'c'])) + tm.assert_categorical_equal(result, expected) + def test_from_codes(self): # too few categories @@ -560,6 +572,18 @@ def f(): codes = np.random.choice([0, 1], 5, p=[0.9, 0.1]) pd.Categorical.from_codes(codes, categories=["train", "test"]) + def test_from_codes_with_categorical_categories(self): + # GH17884 + expected = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + + result = pd.Categorical.from_codes( + [0, 1], categories=pd.Categorical(['a', 'b', 'c'])) + tm.assert_categorical_equal(result, expected) + + result = pd.Categorical.from_codes( + [0, 1], categories=pd.CategoricalIndex(['a', 'b', 'c'])) + tm.assert_categorical_equal(result, expected) + @pytest.mark.parametrize('dtype', [None, 'category']) def test_from_inferred_categories(self, dtype): cats = ['a', 'b'] From 7bc64bdf198c6994a01679ccab485519145c54a1 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Oct 2017 10:20:37 +0200 Subject: [PATCH 2/7] better check for categoricalIndex --- pandas/core/dtypes/dtypes.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 2da8a5a860963..9ec9a76ed9d05 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -3,7 +3,7 @@ import re import numpy as np from pandas import compat -from pandas.core.dtypes.generic import ABCIndexClass +from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex class ExtensionDtype(object): @@ -318,8 +318,7 @@ def _validate_categories(categories, fastpath=False): if not isinstance(categories, ABCIndexClass): categories = Index(categories, tupleize_cols=False) - if hasattr(categories, 'categories'): - # I am a CategoricalIndex + if isinstance(categories, ABCCategoricalIndex): categories = categories.categories if not fastpath: From 938e4e8e31be200ad18dfc3f956bb7c076bcae43 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Oct 2017 11:06:33 +0200 Subject: [PATCH 3/7] fix pep8 --- pandas/core/dtypes/dtypes.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 9ec9a76ed9d05..26bfb940ce8fa 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -170,7 +170,6 @@ def _from_categorical_dtype(cls, dtype, categories=None, ordered=None): return cls(categories, ordered) def _finalize(self, categories, ordered, fastpath=False): - from pandas.core.indexes.base import Index if ordered is None: ordered = False From 7fa85785f1bb1764854234ebce43a2e3f84b3455 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Oct 2017 15:21:30 +0200 Subject: [PATCH 4/7] add whatsnew --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 642ee3c8e54c7..eb7cf21d1bc27 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -1023,6 +1023,7 @@ Categorical - Bug in the categorical constructor with empty values and categories causing the ``.categories`` to be an empty ``Float64Index`` rather than an empty ``Index`` with object dtype (:issue:`17248`) - Bug in categorical operations with :ref:`Series.cat ` not preserving the original Series' name (:issue:`17509`) - Bug in :func:`DataFrame.merge` failing for categorical columns with boolean/int data types (:issue:`17187`) +- Bug in constructing a ``Categorical``/``CategoricalDtype`` when the specified ``categories`` where of categorical type (:issue:`17884`). .. _whatsnew_0210.pypy: From b02049845d7481a395e3f692aef1311f43009670 Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Oct 2017 20:34:44 +0200 Subject: [PATCH 5/7] fixup feedback --- pandas/core/dtypes/dtypes.py | 10 +++++----- pandas/tests/dtypes/test_dtypes.py | 7 ++++--- pandas/tests/test_categorical.py | 20 ++++++++++---------- 3 files changed, 19 insertions(+), 18 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index bf1490dcfade1..7b63af7553b9b 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -3,7 +3,8 @@ import re import numpy as np from pandas import compat -from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex +from pandas.core.dtypes.generic import ( + ABCIndexClass, ABCCategoricalIndex, ABCCategorical) class ExtensionDtype(object): @@ -315,11 +316,10 @@ def _validate_categories(categories, fastpath=False): """ from pandas import Index - if not isinstance(categories, ABCIndexClass): - categories = Index(categories, tupleize_cols=False) - - if isinstance(categories, ABCCategoricalIndex): + if isinstance(categories, (ABCCategorical, ABCCategoricalIndex)): categories = categories.categories + elif not isinstance(categories, ABCIndexClass): + categories = Index(categories, tupleize_cols=False) if not fastpath: diff --git a/pandas/tests/dtypes/test_dtypes.py b/pandas/tests/dtypes/test_dtypes.py index dd78c4995dcd4..84e6f0d4f5a7a 100644 --- a/pandas/tests/dtypes/test_dtypes.py +++ b/pandas/tests/dtypes/test_dtypes.py @@ -6,7 +6,8 @@ import numpy as np import pandas as pd -from pandas import Series, Categorical, IntervalIndex, date_range +from pandas import ( + Series, Categorical, CategoricalIndex, IntervalIndex, date_range) from pandas.core.dtypes.dtypes import ( DatetimeTZDtype, PeriodDtype, @@ -660,7 +661,7 @@ def test_str_vs_repr(self): def test_categorical_categories(self): # GH17884 - c1 = CategoricalDtype(pd.Categorical(['a', 'b'])) + c1 = CategoricalDtype(Categorical(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) - c1 = CategoricalDtype(pd.CategoricalIndex(['a', 'b'])) + c1 = CategoricalDtype(CategoricalIndex(['a', 'b'])) tm.assert_index_equal(c1.categories, pd.Index(['a', 'b'])) diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index bedadacbc4f6c..ee5129850a442 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -521,14 +521,14 @@ def test_contructor_from_categorical_string(self): def test_constructor_with_categorical_categories(self): # GH17884 - expected = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + expected = Categorical(['a', 'b'], categories=['a', 'b', 'c']) - result = pd.Categorical( - ['a', 'b'], categories=pd.Categorical(['a', 'b', 'c'])) + result = Categorical( + ['a', 'b'], categories=Categorical(['a', 'b', 'c'])) tm.assert_categorical_equal(result, expected) - result = pd.Categorical( - ['a', 'b'], categories=pd.CategoricalIndex(['a', 'b', 'c'])) + result = Categorical( + ['a', 'b'], categories=CategoricalIndex(['a', 'b', 'c'])) tm.assert_categorical_equal(result, expected) def test_from_codes(self): @@ -574,14 +574,14 @@ def f(): def test_from_codes_with_categorical_categories(self): # GH17884 - expected = pd.Categorical(['a', 'b'], categories=['a', 'b', 'c']) + expected = Categorical(['a', 'b'], categories=['a', 'b', 'c']) - result = pd.Categorical.from_codes( - [0, 1], categories=pd.Categorical(['a', 'b', 'c'])) + result = Categorical.from_codes( + [0, 1], categories=Categorical(['a', 'b', 'c'])) tm.assert_categorical_equal(result, expected) - result = pd.Categorical.from_codes( - [0, 1], categories=pd.CategoricalIndex(['a', 'b', 'c'])) + result = Categorical.from_codes( + [0, 1], categories=CategoricalIndex(['a', 'b', 'c'])) tm.assert_categorical_equal(result, expected) @pytest.mark.parametrize('dtype', [None, 'category']) From ba532ee9cb2fa0105fd88b5e2c36a8975d8309db Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Oct 2017 20:47:41 +0200 Subject: [PATCH 6/7] move check to revert allowing non-unique Categoricals --- pandas/core/dtypes/dtypes.py | 10 +++++----- pandas/tests/test_categorical.py | 4 ++++ 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index 7b63af7553b9b..b4467f0f9733b 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -3,8 +3,7 @@ import re import numpy as np from pandas import compat -from pandas.core.dtypes.generic import ( - ABCIndexClass, ABCCategoricalIndex, ABCCategorical) +from pandas.core.dtypes.generic import ABCIndexClass, ABCCategoricalIndex class ExtensionDtype(object): @@ -316,9 +315,7 @@ def _validate_categories(categories, fastpath=False): """ from pandas import Index - if isinstance(categories, (ABCCategorical, ABCCategoricalIndex)): - categories = categories.categories - elif not isinstance(categories, ABCIndexClass): + if not isinstance(categories, ABCIndexClass): categories = Index(categories, tupleize_cols=False) if not fastpath: @@ -329,6 +326,9 @@ def _validate_categories(categories, fastpath=False): if not categories.is_unique: raise ValueError('Categorical categories must be unique') + if isinstance(categories, ABCCategoricalIndex): + categories = categories.categories + return categories @property diff --git a/pandas/tests/test_categorical.py b/pandas/tests/test_categorical.py index ee5129850a442..d88e92a39a6c5 100644 --- a/pandas/tests/test_categorical.py +++ b/pandas/tests/test_categorical.py @@ -584,6 +584,10 @@ def test_from_codes_with_categorical_categories(self): [0, 1], categories=CategoricalIndex(['a', 'b', 'c'])) tm.assert_categorical_equal(result, expected) + # non-unique Categorical still raises + with pytest.raises(ValueError): + Categorical.from_codes([0, 1], Categorical(['a', 'b', 'a'])) + @pytest.mark.parametrize('dtype', [None, 'category']) def test_from_inferred_categories(self, dtype): cats = ['a', 'b'] From 1c56abeb8866698c0961f21ce13be180b191131d Mon Sep 17 00:00:00 2001 From: Joris Van den Bossche Date: Mon, 16 Oct 2017 21:14:13 +0200 Subject: [PATCH 7/7] typo in whatsnew --- doc/source/whatsnew/v0.21.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index eb7cf21d1bc27..30fbe2fd387b6 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -1023,7 +1023,7 @@ Categorical - Bug in the categorical constructor with empty values and categories causing the ``.categories`` to be an empty ``Float64Index`` rather than an empty ``Index`` with object dtype (:issue:`17248`) - Bug in categorical operations with :ref:`Series.cat ` not preserving the original Series' name (:issue:`17509`) - Bug in :func:`DataFrame.merge` failing for categorical columns with boolean/int data types (:issue:`17187`) -- Bug in constructing a ``Categorical``/``CategoricalDtype`` when the specified ``categories`` where of categorical type (:issue:`17884`). +- Bug in constructing a ``Categorical``/``CategoricalDtype`` when the specified ``categories`` are of categorical type (:issue:`17884`). .. _whatsnew_0210.pypy: