Skip to content

Commit 640d9e1

Browse files
charlesdong1991 authored and jreback committed
BUG: pd.crosstab not working when margin and normalize are set together (#27663)
1 parent 341043d commit 640d9e1

File tree

3 files changed

+101
-10
lines changed

3 files changed

+101
-10
lines changed

doc/source/whatsnew/v0.25.1.rst

+1
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,7 @@ Reshaping
126126
^^^^^^^^^
127127

128128
- A ``KeyError`` is now raised if ``.unstack()`` is called on a :class:`Series` or :class:`DataFrame` with a flat :class:`Index` passing a name which is not the correct one (:issue:`18303`)
129+
- Bug in :func:`crosstab` where an error was raised when ``margins`` was set to ``True`` and ``normalize`` was not ``False`` (:issue:`27500`)
129130
- :meth:`DataFrame.join` now suppresses the ``FutureWarning`` when the sort parameter is specified (:issue:`21952`)
130131
-
131132

pandas/core/reshape/pivot.py

+19-10
Original file line numberDiff line numberDiff line change
@@ -611,13 +611,21 @@ def _normalize(table, normalize, margins, margins_name="All"):
611611
table = table.fillna(0)
612612

613613
elif margins is True:
614-
615-
column_margin = table.loc[:, margins_name].drop(margins_name)
616-
index_margin = table.loc[margins_name, :].drop(margins_name)
617-
table = table.drop(margins_name, axis=1).drop(margins_name)
618-
# to keep index and columns names
619-
table_index_names = table.index.names
620-
table_columns_names = table.columns.names
614+
# keep index and column of pivoted table
615+
table_index = table.index
616+
table_columns = table.columns
617+
618+
# check if margin name is in (for MI cases) or equal to last
619+
# index/column and save the column and index margin
620+
if (margins_name not in table.iloc[-1, :].name) | (
621+
margins_name != table.iloc[:, -1].name
622+
):
623+
raise ValueError("{} not in pivoted DataFrame".format(margins_name))
624+
column_margin = table.iloc[:-1, -1]
625+
index_margin = table.iloc[-1, :-1]
626+
627+
# keep the core table
628+
table = table.iloc[:-1, :-1]
621629

622630
# Normalize core
623631
table = _normalize(table, normalize=normalize, margins=False)
@@ -627,11 +635,13 @@ def _normalize(table, normalize, margins, margins_name="All"):
627635
column_margin = column_margin / column_margin.sum()
628636
table = concat([table, column_margin], axis=1)
629637
table = table.fillna(0)
638+
table.columns = table_columns
630639

631640
elif normalize == "index":
632641
index_margin = index_margin / index_margin.sum()
633642
table = table.append(index_margin)
634643
table = table.fillna(0)
644+
table.index = table_index
635645

636646
elif normalize == "all" or normalize is True:
637647
column_margin = column_margin / column_margin.sum()
@@ -641,13 +651,12 @@ def _normalize(table, normalize, margins, margins_name="All"):
641651
table = table.append(index_margin)
642652

643653
table = table.fillna(0)
654+
table.index = table_index
655+
table.columns = table_columns
644656

645657
else:
646658
raise ValueError("Not a valid normalize argument")
647659

648-
table.index.names = table_index_names
649-
table.columns.names = table_columns_names
650-
651660
else:
652661
raise ValueError("Not a valid margins argument")
653662

pandas/tests/reshape/test_pivot.py

+81
Original file line numberDiff line numberDiff line change
@@ -2447,3 +2447,84 @@ def test_crosstab_unsorted_order(self):
24472447
[[1, 0, 0], [0, 1, 0], [0, 0, 1]], index=e_idx, columns=e_columns
24482448
)
24492449
tm.assert_frame_equal(result, expected)
2450+
2451+
def test_margin_normalize(self):
2452+
# GH 27500
2453+
df = pd.DataFrame(
2454+
{
2455+
"A": ["foo", "foo", "foo", "foo", "foo", "bar", "bar", "bar", "bar"],
2456+
"B": ["one", "one", "one", "two", "two", "one", "one", "two", "two"],
2457+
"C": [
2458+
"small",
2459+
"large",
2460+
"large",
2461+
"small",
2462+
"small",
2463+
"large",
2464+
"small",
2465+
"small",
2466+
"large",
2467+
],
2468+
"D": [1, 2, 2, 3, 3, 4, 5, 6, 7],
2469+
"E": [2, 4, 5, 5, 6, 6, 8, 9, 9],
2470+
}
2471+
)
2472+
# normalize on index
2473+
result = pd.crosstab(
2474+
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=0
2475+
)
2476+
expected = pd.DataFrame(
2477+
[[0.5, 0.5], [0.5, 0.5], [0.666667, 0.333333], [0, 1], [0.444444, 0.555556]]
2478+
)
2479+
expected.index = MultiIndex(
2480+
levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
2481+
codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
2482+
names=["A", "B"],
2483+
)
2484+
expected.columns = Index(["large", "small"], dtype="object", name="C")
2485+
tm.assert_frame_equal(result, expected)
2486+
2487+
# normalize on columns
2488+
result = pd.crosstab(
2489+
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=1
2490+
)
2491+
expected = pd.DataFrame(
2492+
[
2493+
[0.25, 0.2, 0.222222],
2494+
[0.25, 0.2, 0.222222],
2495+
[0.5, 0.2, 0.333333],
2496+
[0, 0.4, 0.222222],
2497+
]
2498+
)
2499+
expected.columns = Index(
2500+
["large", "small", "Sub-Total"], dtype="object", name="C"
2501+
)
2502+
expected.index = MultiIndex(
2503+
levels=[["bar", "foo"], ["one", "two"]],
2504+
codes=[[0, 0, 1, 1], [0, 1, 0, 1]],
2505+
names=["A", "B"],
2506+
)
2507+
tm.assert_frame_equal(result, expected)
2508+
2509+
# normalize on both index and column
2510+
result = pd.crosstab(
2511+
[df.A, df.B], df.C, margins=True, margins_name="Sub-Total", normalize=True
2512+
)
2513+
expected = pd.DataFrame(
2514+
[
2515+
[0.111111, 0.111111, 0.222222],
2516+
[0.111111, 0.111111, 0.222222],
2517+
[0.222222, 0.111111, 0.333333],
2518+
[0.000000, 0.222222, 0.222222],
2519+
[0.444444, 0.555555, 1],
2520+
]
2521+
)
2522+
expected.columns = Index(
2523+
["large", "small", "Sub-Total"], dtype="object", name="C"
2524+
)
2525+
expected.index = MultiIndex(
2526+
levels=[["Sub-Total", "bar", "foo"], ["", "one", "two"]],
2527+
codes=[[1, 1, 2, 2, 0], [1, 2, 1, 2, 0]],
2528+
names=["A", "B"],
2529+
)
2530+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)