From 2f4ecb2481c20963222f64cfa0d82780596ea057 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 16 Apr 2020 11:35:10 +0100 Subject: [PATCH 1/4] TST: added test for GH28597 Added test to ensure that categories stay ordered when grouping with missing values. --- pandas/tests/groupby/test_groupby.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index c88d16e34eab8..6ebc45c23fbe4 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2012,3 +2012,21 @@ def test_groups_repr_truncates(max_seq_items, expected): result = df.groupby(np.array(df.a)).groups.__repr__() assert result == expected + + +def test_sorted_missing_category_values(): + # GH 28597 + df = pd.DataFrame( + { + 'foo' : ['small', 'large', 'large', 'large', 'medium', 'large', 'large', 'medium'], + 'bar' : ['C', 'A', 'A', 'C', 'A', 'C', 'A', 'C'] + }) + df['foo'] = df['foo'].astype('category').cat.set_categories(['tiny','small', 'medium', 'large'], ordered=True) + + expected = pd.DataFrame({'tiny': {'A': 0, 'C': 0}, 'small': {'A': 0, 'C': 1}, 'medium': {'A': 1, 'C': 1}, 'large': {'A': 3, 'C': 2}}) + expected = expected.rename_axis('bar', axis='index') + expected.columns = pd.CategoricalIndex(['tiny', 'small', 'medium', 'large'], categories=['tiny', 'small', 'medium', 'large'], ordered=True, name='foo', dtype='category') + + result = df.groupby(['bar', 'foo']).size().unstack() + + tm.assert_frame_equal(result, expected) From a9d1952510cdaffd79bf2ce1a3ab1ba1da546a23 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 16 Apr 2020 12:09:19 +0100 Subject: [PATCH 2/4] fixed code formatting --- pandas/tests/groupby/test_groupby.py | 45 ++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 9 deletions(-) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 6ebc45c23fbe4..90986a7585280 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2017,16 +2017,43 @@ def test_groups_repr_truncates(max_seq_items, expected): def test_sorted_missing_category_values(): # GH 28597 df = pd.DataFrame( - { - 'foo' : ['small', 'large', 'large', 'large', 'medium', 'large', 'large', 'medium'], - 'bar' : ['C', 'A', 'A', 'C', 'A', 'C', 'A', 'C'] - }) - df['foo'] = df['foo'].astype('category').cat.set_categories(['tiny','small', 'medium', 'large'], ordered=True) + { + "foo": [ + "small", + "large", + "large", + "large", + "medium", + "large", + "large", + "medium", + ], + "bar": ["C", "A", "A", "C", "A", "C", "A", "C"], + } + ) + df["foo"] = ( + df["foo"] + .astype("category") + .cat.set_categories(["tiny", "small", "medium", "large"], ordered=True) + ) - expected = pd.DataFrame({'tiny': {'A': 0, 'C': 0}, 'small': {'A': 0, 'C': 1}, 'medium': {'A': 1, 'C': 1}, 'large': {'A': 3, 'C': 2}}) - expected = expected.rename_axis('bar', axis='index') - expected.columns = pd.CategoricalIndex(['tiny', 'small', 'medium', 'large'], categories=['tiny', 'small', 'medium', 'large'], ordered=True, name='foo', dtype='category') + expected = pd.DataFrame( + { + "tiny": {"A": 0, "C": 0}, + "small": {"A": 0, "C": 1}, + "medium": {"A": 1, "C": 1}, + "large": {"A": 3, "C": 2}, + } + ) + expected = expected.rename_axis("bar", axis="index") + expected.columns = pd.CategoricalIndex( + ["tiny", "small", "medium", "large"], + categories=["tiny", "small", "medium", "large"], + ordered=True, + name="foo", + dtype="category", + ) - result = df.groupby(['bar', 'foo']).size().unstack() + result = df.groupby(["bar", "foo"]).size().unstack() tm.assert_frame_equal(result, expected) From e529e2deefcb773ab20bc000c5a50d733064bd62 Mon Sep 17 00:00:00 2001 From: Your Name Date: Thu, 16 Apr 2020 15:04:54 +0100 Subject: [PATCH 3/4] moved test to pandas/tests/groupby/test_categorical.py --- pandas/tests/groupby/test_categorical.py | 44 +++++++++++++++++++++++ pandas/tests/groupby/test_groupby.py | 45 ------------------------ 2 files changed, 44 insertions(+), 45 deletions(-) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index da8327f64e26f..8d11e66dbe649 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1392,3 +1392,47 @@ def test_read_only_category_no_sort(): expected = DataFrame(data={"a": [2, 6]}, index=CategoricalIndex([1, 2], name="b")) result = df.groupby("b", sort=False).mean() tm.assert_frame_equal(result, expected) + +def test_sorted_missing_category_values(): + # GH 28597 + df = pd.DataFrame( + { + "foo": [ + "small", + "large", + "large", + "large", + "medium", + "large", + "large", + "medium", + ], + "bar": ["C", "A", "A", "C", "A", "C", "A", "C"], + } + ) + df["foo"] = ( + df["foo"] + .astype("category") + .cat.set_categories(["tiny", "small", "medium", "large"], ordered=True) + ) + + expected = pd.DataFrame( + { + "tiny": {"A": 0, "C": 0}, + "small": {"A": 0, "C": 1}, + "medium": {"A": 1, "C": 1}, + "large": {"A": 3, "C": 2}, + } + ) + expected = expected.rename_axis("bar", axis="index") + expected.columns = pd.CategoricalIndex( + ["tiny", "small", "medium", "large"], + categories=["tiny", "small", "medium", "large"], + ordered=True, + name="foo", + dtype="category", + ) + + result = df.groupby(["bar", "foo"]).size().unstack() + + tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index 90986a7585280..c88d16e34eab8 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -2012,48 +2012,3 @@ def test_groups_repr_truncates(max_seq_items, expected): result = df.groupby(np.array(df.a)).groups.__repr__() assert result == expected - - -def test_sorted_missing_category_values(): - # GH 28597 - df = pd.DataFrame( - { - "foo": [ - "small", - "large", - "large", - "large", - "medium", - "large", - "large", - "medium", - ], - "bar": ["C", "A", "A", "C", "A", "C", "A", "C"], - } - ) - df["foo"] = ( - df["foo"] - .astype("category") - .cat.set_categories(["tiny", "small", "medium", "large"], ordered=True) - ) - - expected = pd.DataFrame( - { - "tiny": {"A": 0, "C": 0}, - "small": {"A": 0, "C": 1}, - "medium": {"A": 1, "C": 1}, - "large": {"A": 3, "C": 2}, - } - ) - expected = expected.rename_axis("bar", axis="index") - expected.columns = pd.CategoricalIndex( - ["tiny", "small", "medium", "large"], - categories=["tiny", "small", "medium", "large"], - ordered=True, - name="foo", - dtype="category", - ) - - result = df.groupby(["bar", "foo"]).size().unstack() - - tm.assert_frame_equal(result, expected) From 731d3c009dc038f47abe027a78113ddf7b47c472 Mon Sep 17 00:00:00 2001 From: Your Name Date: Fri, 17 Apr 2020 09:43:07 +0100 Subject: [PATCH 4/4] added blank line to please linter --- pandas/tests/groupby/test_categorical.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/groupby/test_categorical.py b/pandas/tests/groupby/test_categorical.py index 8d11e66dbe649..b2545e0e1b4d2 100644 --- a/pandas/tests/groupby/test_categorical.py +++ b/pandas/tests/groupby/test_categorical.py @@ -1393,6 +1393,7 @@ def test_read_only_category_no_sort(): result = df.groupby("b", sort=False).mean() tm.assert_frame_equal(result, expected) + def test_sorted_missing_category_values(): # GH 28597 df = pd.DataFrame(