Skip to content

Commit 486ade0

Browse files
meeseeksmachine authored and WillAyd committed
Backport PR #27663: BUG: pd.crosstab not working when margin and normalize are set together (#27783)
1 parent 4341157 commit 486ade0

File tree

3 files changed

+101
-10
lines changed

3 files changed

+101
-10
lines changed

doc/source/whatsnew/v0.25.1.rst

+1
Original file line number | Diff line number | Diff line change
@@ -126,6 +126,7 @@ Reshaping
126126
^^^^^^^^^
127127

128128
- A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`)
129+
- Bug in :func:`crosstab` where an error was raised when ``margins`` was set to ``True`` and ``normalize`` was not ``False`` (:issue:`27500`)
129130
- :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`)
130131
-
131132

pandas/core/reshape/pivot.py

+19-10
Original file line number | Diff line number | Diff line change
@@ -615,13 +615,21 @@ def _normalize(table, normalize, margins, margins_name="All"):
615615
table = table.fillna(0)
616616

617617
elif margins is True:
618-
619-
column_margin = table.loc[:, margins_name].drop(margins_name)
620-
index_margin = table.loc[margins_name, :].drop(margins_name)
621-
table = table.drop(margins_name, axis=1).drop(margins_name)
622-
# to keep index and columns names
623-
table_index_names = table.index.names
624-
table_columns_names = table.columns.names
618+
# keep index and column of pivoted table
619+
table_index = table.index
620+
table_columns = table.columns
621+
622+
# check if margin name is in (for MI cases) or equal to last
623+
# index/column and save the column and index margin
624+
if (margins_name not in table.iloc[-1, :].name) | (
625+
margins_name != table.iloc[:, -1].name
626+
):
627+
raise ValueError("{} not in pivoted DataFrame".format(margins_name))
628+
column_margin = table.iloc[:-1, -1]
629+
index_margin = table.iloc[-1, :-1]
630+
631+
# keep the core table
632+
table = table.iloc[:-1, :-1]
625633

626634
# Normalize core
627635
table = _normalize(table, normalize=normalize, margins=False)
@@ -631,11 +639,13 @@ def _normalize(table, normalize, margins, margins_name="All"):
631639
column_margin = column_margin / column_margin.sum()
632640
table = concat([table, column_margin], axis=1)
633641
table = table.fillna(0)
642+
table.columns = table_columns
634643

635644
elif normalize == "index":
636645
index_margin = index_margin / index_margin.sum()
637646
table = table.append(index_margin)
638647
table = table.fillna(0)
648+
table.index = table_index
639649

640650
elif normalize == "all" or normalize is True:
641651
column_margin = column_margin / column_margin.sum()
@@ -645,13 +655,12 @@ def _normalize(table, normalize, margins, margins_name="All"):
645655
table = table.append(index_margin)
646656

647657
table = table.fillna(0)
658+
table.index = table_index
659+
table.columns = table_columns
648660

649661
else:
650662
raise ValueError("Not a valid normalize argument")
651663

652-
table.index.names = table_index_names
653-
table.columns.names = table_columns_names
654-
655664
else:
656665
raise ValueError("Not a valid margins argument")
657666

pandas/tests/reshape/test_pivot.py

+81
Original file line number | Diff line number | Diff line change
@@ -2447,3 +2447,84 @@ def test_crosstab_unsorted_order(self):
24472447
[[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns
24482448
)
24492449
tm.assert_frame_equal(result, expected)
2450+
2451+
def test_margin_normalize(self):
2452+
# GH 27500
2453+
df = pd.DataFrame(
2454+
{
2455+
"A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
2456+
"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
2457+
"C": [
2458+
"small",
2459+
"large",
2460+
"large",
2461+
"small",
2462+
"small",
2463+
"large",
2464+
"small",
2465+
"small",
2466+
"large",
2467+
],
2468+
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
2469+
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
2470+
}
2471+
)
2472+
# normalize on index
2473+
result = pd.crosstab(
2474+
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0
2475+
)
2476+
expected = pd.DataFrame(
2477+
[[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]]
2478+
)
2479+
expected.index = MultiIndex(
2480+
levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
2481+
codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
2482+
names=["A", "B"],
2483+
)
2484+
expected.columns = Index(["large", "small"], dtype="object", name="C")
2485+
tm.assert_frame_equal(result, expected)
2486+
2487+
# normalize on columns
2488+
result = pd.crosstab(
2489+
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1
2490+
)
2491+
expected = pd.DataFrame(
2492+
[
2493+
[0.25, 0.2, 0.222222],
2494+
[0.25, 0.2, 0.222222],
2495+
[0.5, 0.2, 0.333333],
2496+
[0, 0.4, 0.222222],
2497+
]
2498+
)
2499+
expected.columns = Index(
2500+
["large", "small", "Sub-Total"], dtype="object", name="C"
2501+
)
2502+
expected.index = MultiIndex(
2503+
levels=[["bar", "foo"], ["one", "two"]],
2504+
codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
2505+
names=["A", "B"],
2506+
)
2507+
tm.assert_frame_equal(result, expected)
2508+
2509+
# normalize on both index and column
2510+
result = pd.crosstab(
2511+
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True
2512+
)
2513+
expected = pd.DataFrame(
2514+
[
2515+
[0.111111, 0.111111, 0.222222],
2516+
[0.111111, 0.111111, 0.222222],
2517+
[0.222222, 0.111111, 0.333333],
2518+
[0.000000, 0.222222, 0.222222],
2519+
[0.444444, 0.555555, 1],
2520+
]
2521+
)
2522+
expected.columns = Index(
2523+
["large", "small", "Sub-Total"], dtype="object", name="C"
2524+
)
2525+
expected.index = MultiIndex(
2526+
levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
2527+
codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
2528+
names=["A", "B"],
2529+
)
2530+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)