From 1df4dd7437576803b71d950886b0dd97dbdfdee4 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 15 Sep 2020 11:05:36 -0500 Subject: [PATCH 1/6] BUG: Always cast to Categorical in lexsort_indexer --- doc/source/whatsnew/v1.1.3.rst | 1 + pandas/core/sorting.py | 8 +------- .../tests/frame/methods/test_sort_values.py | 19 +++++++++++++++++++ 3 files changed, 21 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index 8e283aec39786..dc17b58ac7898 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -29,6 +29,7 @@ Bug fixes - Bug in :func:`read_spss` where passing a ``pathlib.Path`` as ``path`` would raise a ``TypeError`` (:issue:`33666`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating ``na`` parameter (:issue:`36241`) - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) +- Bug in :meth:`DataFrame.sort_values` raising an ``AttributeError`` when sorting on a key that casts a column to categorical dtype (:issue:`36383`) .. --------------------------------------------------------------------------- diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index dd6aadf570baa..a32f47e2cd297 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -294,13 +294,7 @@ def lexsort_indexer( keys = [ensure_key_mapped(k, key) for k in keys] for k, order in zip(keys, orders): - # we are already a Categorical - if is_categorical_dtype(k): - cat = k - - # create the Categorical - else: - cat = Categorical(k, ordered=True) + cat = Categorical(k, ordered=True) if na_position not in ["last", "first"]: raise ValueError(f"invalid na_position: {na_position}") diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index c60e7e3b1bdb6..e2ed487476736 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -691,3 +691,22 @@ def test_sort_values_key_dict_axis(self): result = df.sort_values(1, key=lambda col: -col, axis=1) expected = df.loc[:, ::-1] tm.assert_frame_equal(result, expected) + + def test_sort_values_key_casts_to_categorical(self): + # https://github.com/pandas-dev/pandas/issues/36383 + categories = ["a", "b", "c"] + df = pd.DataFrame({"x": [3, 2, 1], "y": ["c", "b", "a"]}) + + def sorter(key): + if key.name == "y": + return pd.Series( + pd.Categorical(key, categories=categories, ordered=True) + ) + return key + + result = df.sort_values(by=["x", "y"], key=sorter) + expected = pd.DataFrame( + {"x": [1, 2, 3], "y": ["a", "b", "c"]}, index=pd.Index([2, 1, 0]) + ) + + tm.assert_frame_equal(result, expected) From 96520429abd57e5e50f285efad062ea8351c06e1 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 15 Sep 2020 11:07:24 -0500 Subject: [PATCH 2/6] Nit --- doc/source/whatsnew/v1.1.3.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v1.1.3.rst b/doc/source/whatsnew/v1.1.3.rst index dc17b58ac7898..f49c6771e6d86 100644 --- a/doc/source/whatsnew/v1.1.3.rst +++ b/doc/source/whatsnew/v1.1.3.rst @@ -29,7 +29,7 @@ Bug fixes - Bug in :func:`read_spss` where passing a ``pathlib.Path`` as ``path`` would raise a ``TypeError`` (:issue:`33666`) - Bug in :meth:`Series.str.startswith` and :meth:`Series.str.endswith` with ``category`` dtype not propagating ``na`` parameter (:issue:`36241`) - Bug in :class:`Series` constructor where integer overflow would occur for sufficiently large scalar inputs when an index was provided (:issue:`36291`) -- Bug in :meth:`DataFrame.sort_values` raising an ``AttributeError`` when sorting on a key that casts a column to categorical dtype (:issue:`36383`) +- Bug in :meth:`DataFrame.sort_values` raising an ``AttributeError`` when sorting on a key that casts column to categorical dtype (:issue:`36383`) .. --------------------------------------------------------------------------- From d45ae2dcb20258164bfff17d3a17472bfc2992e2 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 15 Sep 2020 11:08:51 -0500 Subject: [PATCH 3/6] Edit test --- pandas/tests/frame/methods/test_sort_values.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index e2ed487476736..7a133b3e98e39 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -695,7 +695,7 @@ def test_sort_values_key_dict_axis(self): def test_sort_values_key_casts_to_categorical(self): # https://github.com/pandas-dev/pandas/issues/36383 categories = ["a", "b", "c"] - df = pd.DataFrame({"x": [3, 2, 1], "y": ["c", "b", "a"]}) + df = pd.DataFrame({"x": [1, 1, 1], "y": ["c", "b", "a"]}) def sorter(key): if key.name == "y": @@ -706,7 +706,7 @@ def sorter(key): result = df.sort_values(by=["x", "y"], key=sorter) expected = pd.DataFrame( - {"x": [1, 2, 3], "y": ["a", "b", "c"]}, index=pd.Index([2, 1, 0]) + {"x": [1, 1, 1], "y": ["a", "b", "c"]}, index=pd.Index([2, 1, 0]) ) tm.assert_frame_equal(result, expected) From ab13b9890493270e8d5ac6fa01d242d0ebe80549 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 15 Sep 2020 11:10:26 -0500 Subject: [PATCH 4/6] Diff order --- pandas/tests/frame/methods/test_sort_values.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index 7a133b3e98e39..b15c24ad2368f 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -694,8 +694,8 @@ def test_sort_values_key_dict_axis(self): def test_sort_values_key_casts_to_categorical(self): # https://github.com/pandas-dev/pandas/issues/36383 - categories = ["a", "b", "c"] - df = pd.DataFrame({"x": [1, 1, 1], "y": ["c", "b", "a"]}) + categories = ["c", "b", "a"] + df = pd.DataFrame({"x": [1, 1, 1], "y": ["a", "b", "c"]}) def sorter(key): if key.name == "y": @@ -706,7 +706,7 @@ def sorter(key): result = df.sort_values(by=["x", "y"], key=sorter) expected = pd.DataFrame( - {"x": [1, 1, 1], "y": ["a", "b", "c"]}, index=pd.Index([2, 1, 0]) + {"x": [1, 1, 1], "y": ["c", "b", "a"]}, index=pd.Index([2, 1, 0]) ) tm.assert_frame_equal(result, expected) From 595791b6fb6db429ea2b15cc2d26f0a25921132e Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 15 Sep 2020 11:36:37 -0500 Subject: [PATCH 5/6] Drop import --- pandas/core/sorting.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index a32f47e2cd297..ec62192464665 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -20,7 +20,6 @@ from pandas.core.dtypes.common import ( ensure_int64, ensure_platform_int, - is_categorical_dtype, is_extension_array_dtype, ) from pandas.core.dtypes.generic import ABCMultiIndex From 578bb3d44f88885a7a8fd1b9ac8452676f0a23e7 Mon Sep 17 00:00:00 2001 From: Daniel Saxton Date: Tue, 15 Sep 2020 18:28:49 -0500 Subject: [PATCH 6/6] Param --- pandas/tests/frame/methods/test_sort_values.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/frame/methods/test_sort_values.py b/pandas/tests/frame/methods/test_sort_values.py index b15c24ad2368f..0ca232ec433e7 100644 --- a/pandas/tests/frame/methods/test_sort_values.py +++ b/pandas/tests/frame/methods/test_sort_values.py @@ -692,7 +692,8 @@ def test_sort_values_key_dict_axis(self): expected = df.loc[:, ::-1] tm.assert_frame_equal(result, expected) - def test_sort_values_key_casts_to_categorical(self): + @pytest.mark.parametrize("ordered", [True, False]) + def test_sort_values_key_casts_to_categorical(self, ordered): # https://github.com/pandas-dev/pandas/issues/36383 categories = ["c", "b", "a"] df = pd.DataFrame({"x": [1, 1, 1], "y": ["a", "b", "c"]}) @@ -700,7 +701,7 @@ def test_sort_values_key_casts_to_categorical(self): def sorter(key): if key.name == "y": return pd.Series( - pd.Categorical(key, categories=categories, ordered=True) + pd.Categorical(key, categories=categories, ordered=ordered) ) return key