diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 943a6adb7944e..792dfe4be055e 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -125,6 +125,7 @@ Reshaping ^^^^^^^^^ - A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`) +- Bug in :meth:`DataFrame.crosstab` when ``margins`` set to ``True`` and ``normalize`` is not ``False``, an error is raised. (:issue:`27500`) - :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`) - diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 79716520f6654..d653dd87308cf 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -611,13 +611,21 @@ def _normalize(table, normalize, margins, margins_name="All"): table = table.fillna(0) elif margins is True: - - column_margin = table.loc[:, margins_name].drop(margins_name) - index_margin = table.loc[margins_name, :].drop(margins_name) - table = table.drop(margins_name, axis=1).drop(margins_name) - # to keep index and columns names - table_index_names = table.index.names - table_columns_names = table.columns.names + # keep index and column of pivoted table + table_index = table.index + table_columns = table.columns + + # check if margin name is in (for MI cases) or equal to last + # index/column and save the column and index margin + if (margins_name not in table.iloc[-1, :].name) | ( + margins_name != table.iloc[:, -1].name + ): + raise ValueError("{} not in pivoted DataFrame".format(margins_name)) + column_margin = table.iloc[:-1, -1] + index_margin = table.iloc[-1, :-1] + + # keep the core table + table = table.iloc[:-1, :-1] # Normalize core table = _normalize(table, normalize=normalize, margins=False) @@ -627,11 +635,13 @@ def _normalize(table, normalize, margins, margins_name="All"): column_margin = column_margin / column_margin.sum() table = concat([table, column_margin], axis=1) table = table.fillna(0) + table.columns = table_columns elif normalize == "index": index_margin = index_margin / index_margin.sum() table = table.append(index_margin) table = table.fillna(0) + table.index = table_index elif normalize == "all" or normalize is True: column_margin = column_margin / column_margin.sum() @@ -641,13 +651,12 @@ def _normalize(table, normalize, margins, margins_name="All"): table = table.append(index_margin) table = table.fillna(0) + table.index = table_index + table.columns = table_columns else: raise ValueError("Not a valid normalize argument") - table.index.names = table_index_names - table.columns.names = table_columns_names - else: raise ValueError("Not a valid margins argument") diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index be82e7f595f8c..03b15d2df1a26 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2447,3 +2447,84 @@ def test_crosstab_unsorted_order(self): [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns ) tm.assert_frame_equal(result, expected) + + def test_margin_normalize(self): + # GH 27500 + df = pd.DataFrame( + { + "A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"], + "C": [ + "small", + "large", + "large", + "small", + "small", + "large", + "small", + "small", + "large", + ], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9], + } + ) + # normalize on index + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0 + ) + expected = pd.DataFrame( + [[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]] + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + expected.columns = Index(["large", "small"], dtype="object", name="C") + tm.assert_frame_equal(result, expected) + + # normalize on columns + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1 + ) + expected = pd.DataFrame( + [ + [0.25, 0.2, 0.222222], + [0.25, 0.2, 0.222222], + [0.5, 0.2, 0.333333], + [0, 0.4, 0.222222], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["bar", "foo"], ["one", "two"]], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected) + + # normalize on both index and column + result = pd.crosstab( + [df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True + ) + expected = pd.DataFrame( + [ + [0.111111, 0.111111, 0.222222], + [0.111111, 0.111111, 0.222222], + [0.222222, 0.111111, 0.333333], + [0.000000, 0.222222, 0.222222], + [0.444444, 0.555555, 1], + ] + ) + expected.columns = Index( + ["large", "small", "Sub-Total"], dtype="object", name="C" + ) + expected.index = MultiIndex( + levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=["A", "B"], + ) + tm.assert_frame_equal(result, expected)