From 64b7cbaed1d1d08ac7c5c6964aba85f4c5c20411 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 27 Jul 2020 10:40:43 +0100 Subject: [PATCH 1/5] adding test for .describe() with duplicate columns --- pandas/tests/groupby/test_function.py | 57 +++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index e693962e57ac3..97e99337f5ffd 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -992,6 +992,63 @@ def test_frame_describe_unstacked_format(): tm.assert_frame_equal(result, expected) +@pytest.mark.parametrize("as_index", [True, False]) +def test_describe_with_duplicate_output_column_names(as_index): + # GH #35314 + df = pd.DataFrame( + { + "a": [99, 99, 99, 88, 88, 88], + "b": [1, 2, 3, 4, 5, 6], + "c": [10, 20, 30, 40, 50, 60], + }, + columns=["a", "b", "b"], + ) + + expected = ( + pd.DataFrame.from_records( + [ + ("a", "count", 3.0, 3.0), + ("a", "mean", 88.0, 99.0), + ("a", "std", 0.0, 0.0), + ("a", "min", 88.0, 99.0), + ("a", "25%", 88.0, 99.0), + ("a", "50%", 88.0, 99.0), + ("a", "75%", 88.0, 99.0), + ("a", "max", 88.0, 99.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ("b", "count", 3.0, 3.0), + ("b", "mean", 5.0, 2.0), + ("b", "std", 1.0, 1.0), + ("b", "min", 4.0, 1.0), + ("b", "25%", 4.5, 1.5), + ("b", "50%", 5.0, 2.0), + ("b", "75%", 5.5, 2.5), + ("b", "max", 6.0, 3.0), + ], + ) + .set_index([0, 1]) + .T + ) + expected.columns.names = [None, None] + expected.index = pd.Index([88, 99], name="a") + + if as_index: + expected = expected.drop(columns=["a"], level=0) + else: + expected = expected.reset_index(drop=True) + + result = df.groupby("a", as_index=as_index).describe() + + tm.assert_frame_equal(result, expected) + + def test_groupby_mean_no_overflow(): # Regression test for (#22487) df = pd.DataFrame( From 9bef2dde81890dca489d9d007f9074edd36f79b1 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sat, 1 Aug 2020 23:46:43 +0100 Subject: [PATCH 2/5] addressing PerformanceWarning in test --- pandas/tests/groupby/test_function.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 97e99337f5ffd..01a7ea867105e 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -1046,6 +1046,11 @@ def test_describe_with_duplicate_output_column_names(as_index): result = df.groupby("a", as_index=as_index).describe() + tm.assert_index_equal(result.columns, expected.columns) + + result.columns = pd.RangeIndex(result.shape[1]) + expected.columns = pd.RangeIndex(expected.shape[1]) + tm.assert_frame_equal(result, expected) From 3ec647edc327d3f6407d4264a916602c81ca74aa Mon Sep 17 00:00:00 2001 From: smithto1 Date: Sun, 2 Aug 2020 19:38:33 +0100 Subject: [PATCH 3/5] amend comment to start tests --- pandas/tests/groupby/test_function.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index 01a7ea867105e..f2113f8967e72 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -994,7 +994,7 @@ def test_frame_describe_unstacked_format(): @pytest.mark.parametrize("as_index", [True, False]) def test_describe_with_duplicate_output_column_names(as_index): - # GH #35314 + # GH 35314 df = pd.DataFrame( { "a": [99, 99, 99, 88, 88, 88], From 351726a6c143a88f79260647da8b0e45f4a061d9 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 3 Aug 2020 21:45:00 +0100 Subject: [PATCH 4/5] pytest.mark.filterwarnings --- pandas/tests/groupby/test_function.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index f2113f8967e72..d58aad79e1982 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -992,6 +992,9 @@ def test_frame_describe_unstacked_format(): tm.assert_frame_equal(result, expected) +@pytest.mark.filterwarnings( + "ignore:indexing past lexsort depth may impact performance:pandas.errors.PerformanceWarning" +) @pytest.mark.parametrize("as_index", [True, False]) def test_describe_with_duplicate_output_column_names(as_index): # GH 35314 @@ -1046,11 +1049,6 @@ def test_describe_with_duplicate_output_column_names(as_index): result = df.groupby("a", as_index=as_index).describe() - tm.assert_index_equal(result.columns, expected.columns) - - result.columns = pd.RangeIndex(result.shape[1]) - expected.columns = pd.RangeIndex(expected.shape[1]) - tm.assert_frame_equal(result, expected) From 75208184b46470bda4e7e30e34c11ce078c53071 Mon Sep 17 00:00:00 2001 From: smithto1 Date: Mon, 3 Aug 2020 21:57:58 +0100 Subject: [PATCH 5/5] fix PEP8 violation --- pandas/tests/groupby/test_function.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/groupby/test_function.py b/pandas/tests/groupby/test_function.py index d58aad79e1982..cbfba16223f74 100644 --- a/pandas/tests/groupby/test_function.py +++ b/pandas/tests/groupby/test_function.py @@ -993,7 +993,9 @@ def test_frame_describe_unstacked_format(): @pytest.mark.filterwarnings( - "ignore:indexing past lexsort depth may impact performance:pandas.errors.PerformanceWarning" + "ignore:" + "indexing past lexsort depth may impact performance:" + "pandas.errors.PerformanceWarning" ) @pytest.mark.parametrize("as_index", [True, False]) def test_describe_with_duplicate_output_column_names(as_index):