From b31e07a44d469a731c0221bc87af44308fbef75a Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Mon, 6 Jul 2020 22:06:43 +0200 Subject: [PATCH 1/7] correct wrong if statement in crosstab and add tests for #35144 --- pandas/core/reshape/pivot.py | 2 +- pandas/tests/reshape/test_crosstab.py | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index ea5916eff3afa..85e9dbecc746a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -673,7 +673,7 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): # check if margin name is in (for MI cases) or equal to last # index/column and save the column and index margin - if (margins_name not in table.iloc[-1, :].name) | ( + if (margins_name not in table.iloc[-1, :].name) & ( margins_name != table.iloc[:, -1].name ): raise ValueError(f"{margins_name} not in pivoted DataFrame") diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 8795af2e11122..e063ac9b022e6 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -698,3 +698,26 @@ def test_margin_normalize(self): names=["A", "B"], ) tm.assert_frame_equal(result, expected) + + def test_crosstab_multiple_columns_normalize(self): + df = DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", + "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", + "one", "one", "two", "two"], + "C": ["small", "large", "large", "small", + "small", "large", "small", "small", + "large"], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) + result = crosstab(index=df.C, + columns=[df.A, df.B], + margins=True, + margins_name='margin', normalize=True) + expected = DataFrame([[0.111111, 0.111111, 0.222222, 0.000000, 0.444444], + [0.111111, 0.111111, 0.111111, 0.222222, 0.555556], + [0.222222, 0.222222, 0.333333, 0.222222, 1.]], + index=['large', 
'small', 'margin']) + expected.columns = MultiIndex( levels=[['bar', 'foo', 'margin'], ['', 'one', 'two']], codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]], names=["A", "B"], ) + expected.index.name = 'C' + tm.assert_frame_equal(result, expected) + From df57d87cab0f8ee0784afe8f473bdea21e969259 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Mon, 6 Jul 2020 22:09:33 +0200 Subject: [PATCH 2/7] pass "black pandas" --- pandas/tests/reshape/test_crosstab.py | 60 ++++++++++++++++++--------- 1 file changed, 40 insertions(+), 20 deletions(-) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index e063ac9b022e6..5ea5d470df997 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -700,24 +700,44 @@ def test_margin_normalize(self): tm.assert_frame_equal(result, expected) def test_crosstab_multiple_columns_normalize(self): - df = DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", - "bar", "bar", "bar", "bar"], - "B": ["one", "one", "one", "two", "two", - "one", "one", "two", "two"], - "C": ["small", "large", "large", "small", - "small", "large", "small", "small", - "large"], - "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], - "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) - result = crosstab(index=df.C, - columns=[df.A, df.B], - margins=True, - margins_name='margin', normalize=True) - expected = DataFrame([[0.111111, 0.111111, 0.222222, 0.000000, 0.444444], - [0.111111, 0.111111, 0.111111, 0.222222, 0.555556], - [0.222222, 0.222222, 0.333333, 0.222222, 1.]], - index=['large', 'small', 'margin']) - expected.columns = MultiIndex( levels=[['bar', 'foo', 'margin'], ['', 'one', 'two']], codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]], names=["A", "B"], ) - expected.index.name = 'C' + df = DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + 
"small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + result = crosstab( + index=df.C, + columns=[df.A, df.B], + margins=True, + margins_name="margin", + normalize=True, + ) + expected = DataFrame( + [ + [0.111111, 0.111111, 0.222222, 0.000000, 0.444444], + [0.111111, 0.111111, 0.111111, 0.222222, 0.555556], + [0.222222, 0.222222, 0.333333, 0.222222, 1.0], + ], + index=["large", "small", "margin"], + ) + expected.columns = MultiIndex( + levels=[["bar", "foo", "margin"], ["", "one", "two"]], + codes=[[0, 0, 1, 1, 2], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + expected.index.name = "C" tm.assert_frame_equal(result, expected) - From 01217bc6c6ba0a6d0971c9cafbdbdaab0b2fc28f Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Tue, 14 Jul 2020 09:23:37 +0200 Subject: [PATCH 3/7] add whatsnew entry, update comment and unit tests in test_crosstab.py --- doc/source/whatsnew/v1.1.0.rst | 1 + pandas/core/reshape/pivot.py | 2 +- pandas/tests/reshape/test_crosstab.py | 22 ++-------------------- 3 files changed, 4 insertions(+), 21 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 85b29a58a1f15..aa3fb9ee7e7bf 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -1125,6 +1125,7 @@ Reshaping - Bug in :meth:`Series.where` with an empty Series and empty ``cond`` having non-bool dtype (:issue:`34592`) - Fixed regression where :meth:`DataFrame.apply` would raise ``ValueError`` for elements whth ``S`` dtype (:issue:`34529`) - Bug in :meth:`DataFrame.append` leading to sorting columns even when ``sort=False`` is specified (:issue:`35092`) +- Bug in func :meth:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) Sparse ^^^^^^ diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 85e9dbecc746a..818153535c4d8 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ 
-671,7 +671,7 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): table_index = table.index table_columns = table.columns - # check if margin name is in (for MI cases) or equal to last + # check if margin name is not in (for MI cases) and not equal to last # index/column and save the column and index margin if (margins_name not in table.iloc[-1, :].name) & ( margins_name != table.iloc[:, -1].name diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 5ea5d470df997..4a904e57a2eb2 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -699,26 +699,8 @@ def test_margin_normalize(self): ) tm.assert_frame_equal(result, expected) - def test_crosstab_multiple_columns_normalize(self): - df = DataFrame( - { - "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], - "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], - "C": [ - "small", - "large", - "large", - "small", - "small", - "large", - "small", - "small", - "large", - ], - "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], - "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], - } - ) + # GH 35144 + # use multiple columns with margins and normalization result = crosstab( index=df.C, columns=[df.A, df.B], From 05303bbbe30ca7b038488e1ef13dfc3aa5a5ef84 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Mon, 20 Jul 2020 20:43:11 +0200 Subject: [PATCH 4/7] split up tests --- pandas/tests/reshape/test_crosstab.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/pandas/tests/reshape/test_crosstab.py b/pandas/tests/reshape/test_crosstab.py index 4a904e57a2eb2..6f5550a6f8209 100644 --- a/pandas/tests/reshape/test_crosstab.py +++ b/pandas/tests/reshape/test_crosstab.py @@ -699,8 +699,28 @@ def test_margin_normalize(self): ) tm.assert_frame_equal(result, expected) + def test_margin_normalize_multiple_columns(self): # GH 35144 # use multiple columns with margins and normalization + df = DataFrame( + { + 
"A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) result = crosstab( index=df.C, columns=[df.A, df.B], From bb00a4b2d15d41127ef5677c8543e2a6c5d4134d Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sat, 15 Aug 2020 10:12:50 +0200 Subject: [PATCH 5/7] add whatsnew/v1.2.0.rst entry --- doc/source/whatsnew/v1.2.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 42f95d88d74ac..f27c83fafef55 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -258,6 +258,7 @@ Reshaping - Bug in :meth:`DataFrame.pivot_table` with ``aggfunc='count'`` or ``aggfunc='sum'`` returning ``NaN`` for missing categories when pivoted on a ``Categorical``. Now returning ``0`` (:issue:`31422`) - Bug in :func:`union_indexes` where input index names are not preserved in some cases. 
Affects :func:`concat` and :class:`DataFrame` constructor (:issue:`13475`) +- Bug in :func:`crosstab` when using multiple columns with ``margins=True`` and ``normalize=True`` (:issue:`35144`) - Sparse From fa48fffe504f6211218acb41bf68b2134c7040a8 Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sat, 15 Aug 2020 10:16:29 +0200 Subject: [PATCH 6/7] assign variable for duplicated call to last index or column --- pandas/core/reshape/pivot.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 818153535c4d8..2e09180a270fb 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -670,11 +670,12 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): # keep index and column of pivoted table table_index = table.index table_columns = table.columns + last_ind_or_col = table.iloc[-1, :].name # check if margin name is not in (for MI cases) and not equal to last # index/column and save the column and index margin - if (margins_name not in table.iloc[-1, :].name) & ( - margins_name != table.iloc[:, -1].name + if (margins_name not in last_ind_or_col) & ( + margins_name != last_ind_or_col ): raise ValueError(f"{margins_name} not in pivoted DataFrame") column_margin = table.iloc[:-1, -1] From f33e23d3a4c4cd796b293e4163510e7fcc9aec0e Mon Sep 17 00:00:00 2001 From: Tobias Pitters Date: Sat, 15 Aug 2020 12:38:45 +0200 Subject: [PATCH 7/7] apply `black pandas` --- pandas/core/reshape/pivot.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 2e09180a270fb..64a9e2dbf6d99 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -674,9 +674,7 @@ def _normalize(table, normalize, margins: bool, margins_name="All"): # check if margin name is not in (for MI cases) and not equal to last # index/column and save the column and index margin - if (margins_name 
not in last_ind_or_col) & ( - margins_name != last_ind_or_col - ): + if (margins_name not in last_ind_or_col) & (margins_name != last_ind_or_col): raise ValueError(f"{margins_name} not in pivoted DataFrame") column_margin = table.iloc[:-1, -1] index_margin = table.iloc[-1, :-1]