From 812eca4cc784d431a76479d06bef62755ab0464c Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 2 May 2021 11:40:47 +0100 Subject: [PATCH 1/5] BUG: RangeIndex.astype('category') --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/core/indexes/base.py | 4 +--- pandas/tests/indexes/common.py | 6 +++--- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 794a7025fe218..41ab7923bbfd9 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -780,6 +780,7 @@ Indexing - Bug in :meth:`DatetimeIndex.insert` when inserting ``np.datetime64("NaT")`` into a timezone-aware index incorrectly treating the timezone-naive value as timezone-aware (:issue:`39769`) - Bug in incorrectly raising in :meth:`Index.insert`, when setting a new column that cannot be held in the existing ``frame.columns``, or in :meth:`Series.reset_index` or :meth:`DataFrame.reset_index` instead of casting to a compatible dtype (:issue:`39068`) - Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`) +- Bug in :meth:`RangeIndex.astype` where when converting to :class:`CategoricalIndex`, the categories became a :class:`Int64Index` instead of a :class:`RangeIndex` (:issue:`xxxxx`) - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) - Bug in :meth:`DataFrame.__setitem__` and :meth:`DataFrame.iloc.__setitem__` raising ``ValueError`` when trying to index with a row-slice and setting a list as values (:issue:`40440`) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 7779335bfd3ba..6f414c91ce94c 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -907,9 +907,7 @@ def astype(self, dtype, 
copy=True): elif is_categorical_dtype(dtype): from pandas.core.indexes.category import CategoricalIndex - return CategoricalIndex( - self._values, name=self.name, dtype=dtype, copy=copy - ) + return CategoricalIndex(self, name=self.name, dtype=dtype, copy=copy) elif is_extension_array_dtype(dtype): return Index(np.asarray(self), name=self.name, dtype=dtype, copy=copy) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 6139d8af48d98..5a3583c17d4d0 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -667,19 +667,19 @@ def test_astype_category(self, copy, name, ordered, simple_index): # standard categories dtype = CategoricalDtype(ordered=ordered) result = idx.astype(dtype, copy=copy) - expected = CategoricalIndex(idx.values, name=name, ordered=ordered) + expected = CategoricalIndex(idx, name=name, ordered=ordered) tm.assert_index_equal(result, expected) # non-standard categories dtype = CategoricalDtype(idx.unique().tolist()[:-1], ordered) result = idx.astype(dtype, copy=copy) - expected = CategoricalIndex(idx.values, name=name, dtype=dtype) + expected = CategoricalIndex(idx, name=name, dtype=dtype) tm.assert_index_equal(result, expected) if ordered is False: # dtype='category' defaults to ordered=False, so only test once result = idx.astype("category", copy=copy) - expected = CategoricalIndex(idx.values, name=name) + expected = CategoricalIndex(idx, name=name) tm.assert_index_equal(result, expected) def test_is_unique(self, simple_index): From 359befae5410ae5e6f227188ac16807c8b1fe561 Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 2 May 2021 11:47:25 +0100 Subject: [PATCH 2/5] add gh number --- doc/source/whatsnew/v1.3.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 41ab7923bbfd9..181989d8b73ff 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -780,7 +780,7 @@ Indexing - Bug in 
:meth:`DatetimeIndex.insert` when inserting ``np.datetime64("NaT")`` into a timezone-aware index incorrectly treating the timezone-naive value as timezone-aware (:issue:`39769`) - Bug in incorrectly raising in :meth:`Index.insert`, when setting a new column that cannot be held in the existing ``frame.columns``, or in :meth:`Series.reset_index` or :meth:`DataFrame.reset_index` instead of casting to a compatible dtype (:issue:`39068`) - Bug in :meth:`RangeIndex.append` where a single object of length 1 was concatenated incorrectly (:issue:`39401`) -- Bug in :meth:`RangeIndex.astype` where when converting to :class:`CategoricalIndex`, the categories became a :class:`Int64Index` instead of a :class:`RangeIndex` (:issue:`xxxxx`) +- Bug in :meth:`RangeIndex.astype` where, when converting to :class:`CategoricalIndex`, the categories became an :class:`Int64Index` instead of a :class:`RangeIndex` (:issue:`41263`) - Bug in setting ``numpy.timedelta64`` values into an object-dtype :class:`Series` using a boolean indexer (:issue:`39488`) - Bug in setting numeric values into a into a boolean-dtypes :class:`Series` using ``at`` or ``iat`` failing to cast to object-dtype (:issue:`39582`) - Bug in :meth:`DataFrame.__setitem__` and :meth:`DataFrame.iloc.__setitem__` raising ``ValueError`` when trying to index with a row-slice and setting a list as values (:issue:`40440`) From 7840030471663b41e5d806240f46c0105c747c8b Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 3 May 2021 06:29:27 +0100 Subject: [PATCH 3/5] add explicit test --- pandas/tests/indexes/ranges/test_range.py | 27 +++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index f7313f100d429..995895f09a3fa 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -5,6 +5,8 @@ import pandas as pd from pandas import ( + CategoricalDtype, + CategoricalIndex, Float64Index, Index, 
Int64Index, @@ -533,3 +535,28 @@ def test_isin_range(self, base): result = base.isin(values) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) + + @pytest.mark.parametrize("copy", [True, False]) + @pytest.mark.parametrize("name", [None, "foo"]) + @pytest.mark.parametrize("ordered", [True, False]) + def test_astype_category(self, copy, name, ordered, simple_index): + super().test_astype_category(copy, name, ordered, simple_index) + + # GH 41263 + idx = simple_index + if name: + idx = idx.rename(name) + + # standard categories + dtype = CategoricalDtype(ordered=ordered) + result = idx.astype(dtype, copy=copy) + assert isinstance(result, CategoricalIndex) + assert isinstance(result.categories, RangeIndex) + assert (result.categories == idx).all() + + if ordered is False: + # dtype='category' defaults to ordered=False, so only test once + result = idx.astype("category", copy=copy) + assert isinstance(result, CategoricalIndex) + assert isinstance(result.categories, RangeIndex) + assert (result.categories == idx).all() From ebf9febc6c5c78da90a8c3c7a91a02cdd8e62e53 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 3 May 2021 06:44:59 +0100 Subject: [PATCH 4/5] simplify test --- pandas/tests/indexes/ranges/test_range.py | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 995895f09a3fa..743c4d21d2c50 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -543,20 +543,16 @@ def test_astype_category(self, copy, name, ordered, simple_index): super().test_astype_category(copy, name, ordered, simple_index) # GH 41263 - idx = simple_index - if name: - idx = idx.rename(name) - - # standard categories - dtype = CategoricalDtype(ordered=ordered) - result = idx.astype(dtype, copy=copy) - assert isinstance(result, CategoricalIndex) - assert isinstance(result.categories, RangeIndex) - 
assert (result.categories == idx).all() + idx = simple_index if not name else simple_index.rename(name) + dtypes = [CategoricalDtype(ordered=ordered)] if ordered is False: # dtype='category' defaults to ordered=False, so only test once - result = idx.astype("category", copy=copy) + dtypes.append("category") + + for dtype in dtypes: + # standard categories + result = idx.astype(dtype, copy=copy) assert isinstance(result, CategoricalIndex) assert isinstance(result.categories, RangeIndex) assert (result.categories == idx).all() From d6edb5287beac63cef751ffadf50a051a37d219f Mon Sep 17 00:00:00 2001 From: tp Date: Tue, 4 May 2021 22:36:41 +0100 Subject: [PATCH 5/5] fixing assert_index_equal with exact=True --- doc/source/whatsnew/v1.3.0.rst | 1 + pandas/_testing/asserters.py | 24 +++++++++++-------- .../indexes/categorical/test_constructors.py | 2 +- pandas/tests/indexes/common.py | 6 ++--- pandas/tests/indexes/ranges/test_range.py | 23 ------------------ pandas/tests/util/test_assert_index_equal.py | 24 +++++++++++++++++++ 6 files changed, 43 insertions(+), 37 deletions(-) diff --git a/doc/source/whatsnew/v1.3.0.rst b/doc/source/whatsnew/v1.3.0.rst index 181989d8b73ff..7def8f6bdb2bc 100644 --- a/doc/source/whatsnew/v1.3.0.rst +++ b/doc/source/whatsnew/v1.3.0.rst @@ -946,6 +946,7 @@ Other - Bug in :meth:`Series.where` with numeric dtype and ``other = None`` not casting to ``nan`` (:issue:`39761`) - :meth:`Index.where` behavior now mirrors :meth:`Index.putmask` behavior, i.e. 
``index.where(mask, other)`` matches ``index.putmask(~mask, other)`` (:issue:`39412`) - Bug in :func:`pandas.testing.assert_series_equal`, :func:`pandas.testing.assert_frame_equal`, :func:`pandas.testing.assert_index_equal` and :func:`pandas.testing.assert_extension_array_equal` incorrectly raising when an attribute has an unrecognized NA type (:issue:`39461`) +- Bug in :func:`pandas.testing.assert_index_equal` with ``exact=True`` not raising when comparing :class:`CategoricalIndex` instances with ``Int64Index`` and ``RangeIndex`` categories (:issue:`41263`) - Bug in :meth:`DataFrame.equals`, :meth:`Series.equals`, :meth:`Index.equals` with object-dtype containing ``np.datetime64("NaT")`` or ``np.timedelta64("NaT")`` (:issue:`39650`) - Bug in :func:`pandas.util.show_versions` where console JSON output was not proper JSON (:issue:`39701`) - Bug in :meth:`DataFrame.convert_dtypes` incorrectly raised ValueError when called on an empty DataFrame (:issue:`40393`) diff --git a/pandas/_testing/asserters.py b/pandas/_testing/asserters.py index 912039b7571bc..2d695458e32e6 100644 --- a/pandas/_testing/asserters.py +++ b/pandas/_testing/asserters.py @@ -309,18 +309,22 @@ def assert_index_equal( __tracebackhide__ = True def _check_types(left, right, obj="Index"): - if exact: - assert_class_equal(left, right, exact=exact, obj=obj) + if not exact: + return - # Skip exact dtype checking when `check_categorical` is False - if check_categorical: - assert_attr_equal("dtype", left, right, obj=obj) + assert_class_equal(left, right, exact=exact, obj=obj) - # allow string-like to have different inferred_types - if left.inferred_type in ("string"): - assert right.inferred_type in ("string") - else: - assert_attr_equal("inferred_type", left, right, obj=obj) + # Skip exact dtype checking when `check_categorical` is False + if check_categorical: + assert_attr_equal("dtype", left, right, obj=obj) + if is_categorical_dtype(left.dtype) and is_categorical_dtype(right.dtype): + 
assert_index_equal(left.categories, right.categories, exact=exact) + + # allow string-like to have different inferred_types + if left.inferred_type in ("string"): + assert right.inferred_type in ("string") + else: + assert_attr_equal("inferred_type", left, right, obj=obj) def _get_ilevel_values(index, level): # accept level number only diff --git a/pandas/tests/indexes/categorical/test_constructors.py b/pandas/tests/indexes/categorical/test_constructors.py index 2acf79ee0bced..35620875d5a1a 100644 --- a/pandas/tests/indexes/categorical/test_constructors.py +++ b/pandas/tests/indexes/categorical/test_constructors.py @@ -108,8 +108,8 @@ def test_construction_with_dtype(self): tm.assert_index_equal(result, ci, exact=True) # make sure indexes are handled - expected = CategoricalIndex([0, 1, 2], categories=[0, 1, 2], ordered=True) idx = Index(range(3)) + expected = CategoricalIndex([0, 1, 2], categories=idx, ordered=True) result = CategoricalIndex(idx, categories=idx, ordered=True) tm.assert_index_equal(result, expected, exact=True) diff --git a/pandas/tests/indexes/common.py b/pandas/tests/indexes/common.py index 5a3583c17d4d0..8bbe8f9b9e0e2 100644 --- a/pandas/tests/indexes/common.py +++ b/pandas/tests/indexes/common.py @@ -668,19 +668,19 @@ def test_astype_category(self, copy, name, ordered, simple_index): dtype = CategoricalDtype(ordered=ordered) result = idx.astype(dtype, copy=copy) expected = CategoricalIndex(idx, name=name, ordered=ordered) - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected, exact=True) # non-standard categories dtype = CategoricalDtype(idx.unique().tolist()[:-1], ordered) result = idx.astype(dtype, copy=copy) expected = CategoricalIndex(idx, name=name, dtype=dtype) - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected, exact=True) if ordered is False: # dtype='category' defaults to ordered=False, so only test once result = idx.astype("category", copy=copy) expected = 
CategoricalIndex(idx, name=name) - tm.assert_index_equal(result, expected) + tm.assert_index_equal(result, expected, exact=True) def test_is_unique(self, simple_index): # initialize a unique index diff --git a/pandas/tests/indexes/ranges/test_range.py b/pandas/tests/indexes/ranges/test_range.py index 743c4d21d2c50..f7313f100d429 100644 --- a/pandas/tests/indexes/ranges/test_range.py +++ b/pandas/tests/indexes/ranges/test_range.py @@ -5,8 +5,6 @@ import pandas as pd from pandas import ( - CategoricalDtype, - CategoricalIndex, Float64Index, Index, Int64Index, @@ -535,24 +533,3 @@ def test_isin_range(self, base): result = base.isin(values) expected = np.array([True, False]) tm.assert_numpy_array_equal(result, expected) - - @pytest.mark.parametrize("copy", [True, False]) - @pytest.mark.parametrize("name", [None, "foo"]) - @pytest.mark.parametrize("ordered", [True, False]) - def test_astype_category(self, copy, name, ordered, simple_index): - super().test_astype_category(copy, name, ordered, simple_index) - - # GH 41263 - idx = simple_index if not name else simple_index.rename(name) - - dtypes = [CategoricalDtype(ordered=ordered)] - if ordered is False: - # dtype='category' defaults to ordered=False, so only test once - dtypes.append("category") - - for dtype in dtypes: - # standard categories - result = idx.astype(dtype, copy=copy) - assert isinstance(result, CategoricalIndex) - assert isinstance(result.categories, RangeIndex) - assert (result.categories == idx).all() diff --git a/pandas/tests/util/test_assert_index_equal.py b/pandas/tests/util/test_assert_index_equal.py index 82a3a223b442b..1778b6fb9d832 100644 --- a/pandas/tests/util/test_assert_index_equal.py +++ b/pandas/tests/util/test_assert_index_equal.py @@ -3,9 +3,11 @@ from pandas import ( Categorical, + CategoricalIndex, Index, MultiIndex, NaT, + RangeIndex, ) import pandas._testing as tm @@ -199,6 +201,28 @@ def test_index_equal_category_mismatch(check_categorical): tm.assert_index_equal(idx1, idx2, 
check_categorical=check_categorical) +@pytest.mark.parametrize("exact", [False, True]) +def test_index_equal_range_categories(check_categorical, exact): + # GH41263 + msg = """\ +Index are different + +Index classes are different +\\[left\\]: RangeIndex\\(start=0, stop=10, step=1\\) +\\[right\\]: Int64Index\\(\\[0, 1, 2, 3, 4, 5, 6, 7, 8, 9\\], dtype='int64'\\)""" + + rcat = CategoricalIndex(RangeIndex(10)) + icat = CategoricalIndex(list(range(10))) + + if check_categorical and exact: + with pytest.raises(AssertionError, match=msg): + tm.assert_index_equal(rcat, icat, check_categorical=True, exact=True) + else: + tm.assert_index_equal( + rcat, icat, check_categorical=check_categorical, exact=exact + ) + + def test_assert_index_equal_mixed_dtype(): # GH#39168 idx = Index(["foo", "bar", 42])