diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index eb60272246ebb..c0484f9217396 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -128,7 +128,7 @@ Groupby/resample/rolling Reshaping ^^^^^^^^^ -- +- Bug in :func:`crosstab` where an error was raised when ``margins`` was set to ``True`` and ``normalize`` was not ``False`` (:issue:`27500`) - - diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 79716520f6654..1112f0393908a 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -611,13 +611,17 @@ def _normalize(table, normalize, margins, margins_name="All"): table = table.fillna(0) elif margins is True: - + # keep index and column of pivoted table + table_index = table.index + table_columns = table.columns + # drop margins created in pivot_table and only keep the core column_margin = table.loc[:, margins_name].drop(margins_name) - index_margin = table.loc[margins_name, :].drop(margins_name) + # separate cases between multiindex and index + if isinstance(table_index, MultiIndex): + index_margin = table.loc[margins_name, :].drop(margins_name, axis=1) + else: + index_margin = table.loc[margins_name, :].drop(margins_name) table = table.drop(margins_name, axis=1).drop(margins_name) - # to keep index and columns names - table_index_names = table.index.names - table_columns_names = table.columns.names # Normalize core table = _normalize(table, normalize=normalize, margins=False) @@ -627,11 +631,19 @@ def _normalize(table, normalize, margins, margins_name="All"): column_margin = column_margin / column_margin.sum() table = concat([table, column_margin], axis=1) table = table.fillna(0) + table.columns = table_columns elif normalize == "index": - index_margin = index_margin / index_margin.sum() + # index_margin is a dataframe, and use a hacky way: sum(axis=1)[0] + # to get the normalized result, and use sum() instead for series + if isinstance(index_margin, ABCDataFrame): + 
sum_index_margin = index_margin.sum(axis=1)[0] + else: + sum_index_margin = index_margin.sum() + index_margin = index_margin / sum_index_margin table = table.append(index_margin) table = table.fillna(0) + table.index = table_index elif normalize == "all" or normalize is True: column_margin = column_margin / column_margin.sum() @@ -641,13 +653,12 @@ def _normalize(table, normalize, margins, margins_name="All"): table = table.append(index_margin) table = table.fillna(0) + table.index = table_index + table.columns = table_columns else: raise ValueError("Not a valid normalize argument") - table.index.names = table_index_names - table.columns.names = table_columns_names - else: raise ValueError("Not a valid margins argument") diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index be82e7f595f8c..ac12165aa0fd1 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -2447,3 +2447,38 @@ def test_crosstab_unsorted_order(self): [[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns ) tm.assert_frame_equal(result, expected) + + def test_margin_normalize(self): + df = pd.DataFrame({"A": ["foo", "foo", "foo", "foo", "foo", + "bar", "bar", "bar", "bar"], + "B": ["one", "one", "one", "two", "two", + "one", "one", "two", "two"], + "C": ["small", "large", "large", "small", + "small", "large", "small", "small", + "large"], + "D": [1, 2, 2, 3, 3, 4, 5, 6, 7], + "E": [2, 4, 5, 5, 6, 6, 8, 9, 9]}) + # normalize on index + result = pd.crosstab([df.A, df.B], df.C, margins=True, margins_name='Sub-Total', + normalize=0) + expected = pd.DataFrame([[0.5, 0.5], [0.5, 0.5], + [0.666667, 0.333333], [0, 1], + [0.444444, 0.555556]]) + expected.index = MultiIndex(levels=[['Sub-Total', 'bar', 'foo'], + ['', 'one', 'two']], + codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]], + names=['A', 'B']) + expected.columns = Index(['large', 'small'], dtype='object', name='C') + tm.assert_frame_equal(result, expected) + + # normalize on 
columns + result = pd.crosstab([df.A, df.B], df.C, margins=True, margins_name='Sub-Total', + normalize=1) + expected = pd.DataFrame([[0.25, 0.2, 0.222222], [0.25, 0.2, 0.222222], + [0.5, 0.2, 0.333333], [0, 0.4, 0.222222]]) + expected.columns = Index(['large', 'small', 'Sub-Total'], dtype='object', + name='C') + expected.index = MultiIndex(levels=[['bar', 'foo'], ['one', 'two']], + codes=[[0, 0, 1, 1], [0, 1, 0, 1]], + names=['A', 'B']) + tm.assert_frame_equal(result, expected)