Skip to content

Commit bbb979c

Browse files
author
Christoph Möhl
committed
pandas-dev#15150: added conditional calculation of crosstab margins based on the normalization type; corrected the expected margin values in test_margin_dropna
1 parent e456ab3 commit bbb979c

File tree

2 files changed

+59
-50
lines changed

2 files changed

+59
-50
lines changed

pandas/core/reshape/pivot.py

+48-41
Original file line numberDiff line numberDiff line change
@@ -199,22 +199,11 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
199199

200200
def _add_margins(table, data, values, rows, cols, aggfunc,
201201
margins_name='All'):
202-
if not isinstance(margins_name, compat.string_types):
203-
raise ValueError('margins_name argument must be a string')
204202

205-
exception_msg = 'Conflicting name "{0}" in margins'.format(margins_name)
206-
for level in table.index.names:
207-
if margins_name in table.index.get_level_values(level):
208-
raise ValueError(exception_msg)
203+
_check_margins_name(margins_name, table)
209204

210205
grand_margin = _compute_grand_margin(data, values, aggfunc, margins_name)
211206

212-
# could be passed a Series object with no 'columns'
213-
if hasattr(table, 'columns'):
214-
for level in table.columns.names[1:]:
215-
if margins_name in table.columns.get_level_values(level):
216-
raise ValueError(exception_msg)
217-
218207
if len(rows) > 1:
219208
key = (margins_name,) + ('',) * (len(rows) - 1)
220209
else:
@@ -263,6 +252,21 @@ def _add_margins(table, data, values, rows, cols, aggfunc,
263252
return result
264253

265254

255+
def _check_margins_name(margins_name, table):
    """Validate ``margins_name`` against the labels already used by ``table``.

    Parameters
    ----------
    margins_name : object
        Label intended for the margins row/column; must be a string.
    table : object
        Object exposing ``index`` and, optionally, ``columns``; each must
        provide ``names`` and ``get_level_values``.

    Raises
    ------
    ValueError
        If ``margins_name`` is not a string, if it already occurs in any
        level of ``table.index``, or if it occurs in any level of
        ``table.columns`` other than the first.
    """
    if not isinstance(margins_name, compat.string_types):
        raise ValueError('margins_name argument must be a string')

    conflict = 'Conflicting name "{0}" in margins'.format(margins_name)

    row_axis = table.index
    if any(margins_name in row_axis.get_level_values(name)
           for name in row_axis.names):
        raise ValueError(conflict)

    # A Series may be passed in; it has no 'columns' to inspect.
    if not hasattr(table, 'columns'):
        return

    col_axis = table.columns
    # The first column level is deliberately left out of the check.
    if any(margins_name in col_axis.get_level_values(name)
           for name in col_axis.names[1:]):
        raise ValueError(conflict)
268+
269+
266270
def _compute_grand_margin(data, values, aggfunc,
267271
margins_name='All'):
268272

@@ -507,17 +511,32 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
507511
df = DataFrame(data)
508512
df['__dummy__'] = 0
509513
table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
510-
aggfunc=len, margins=margins,
514+
aggfunc=len, margins=False,
511515
margins_name=margins_name, dropna=dropna)
512516
table = table.fillna(0).astype(np.int64)
513517

514518
else:
515519
data['__dummy__'] = values
516520
df = DataFrame(data)
517521
table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
518-
aggfunc=aggfunc, margins=margins,
522+
aggfunc=aggfunc, margins=False,
519523
margins_name=margins_name, dropna=dropna)
520524

525+
if margins:
526+
_check_margins_name(margins_name, table)
527+
528+
if normalize != 'index':
529+
# add margin column
530+
table[margins_name] = table.sum(axis=1)
531+
532+
if normalize != 'columns':
533+
# add margin row
534+
if type(table.index) is MultiIndex:
535+
table = table.transpose()
536+
table[margins_name] = table.sum(axis=1)
537+
table = table.transpose()
538+
else:
539+
table.loc[margins_name] = table.sum(axis=0)
521540
# Post-process
522541
if normalize is not False:
523542
table = _normalize(table, normalize=normalize, margins=margins,
@@ -534,49 +553,37 @@ def _normalize(table, normalize, margins, margins_name='All'):
534553
try:
535554
normalize = axis_subs[normalize]
536555
except KeyError:
537-
raise ValueError("Not a valid normalize argument")
556+
raise ValueError(
557+
"Not a valid normalize argument: {!r}".format(normalize))
558+
559+
# Actual Normalizations
560+
normalizers = {
561+
'columns': lambda x: x / x.sum(),
562+
'index': lambda x: x.div(x.sum(axis=1), axis=0)
563+
}
538564

539565
if margins is False:
540-
# Actual Normalizations
541-
normalizers = {
542-
'all': lambda x: x / x.sum(axis=1).sum(axis=0),
543-
'columns': lambda x: x / x.sum(),
544-
'index': lambda x: x.div(x.sum(axis=1), axis=0)
545-
}
566+
normalizers['all'] = lambda x: x / x.sum(axis=1).sum(axis=0)
546567

547568
elif margins is True:
548-
# skip margin rows and/or cols for normalization
549-
normalizers = {
550-
'all': lambda x: x / x.iloc[:-1, :-1].sum(axis=1).sum(axis=0),
551-
'columns': lambda x: x.div(x.iloc[:-1, :].sum()).iloc[:-1, :],
552-
'index': lambda x: (x.div(x.iloc[:, :-1].sum(axis=1),
553-
axis=0)).iloc[:, :-1]
554-
}
569+
# skip margin rows and cols for normalization
570+
normalizers['all'] = lambda x: x / x.iloc[:-1, :-1].sum(axis=1)\
571+
.sum(axis=0)
555572

556573
else:
557-
raise ValueError("Not a valid margins argument")
574+
raise ValueError("Not a valid margins argument: {!r}".format(margins))
558575

559576
normalizers[True] = normalizers['all']
560577

561578
try:
562579
f = normalizers[normalize]
563580
except KeyError:
564-
raise ValueError("Not a valid normalize argument")
581+
raise ValueError(
582+
"Not a valid normalize argument: {!r}".format(normalize))
565583

566584
table = f(table)
567585
table = table.fillna(0)
568586

569-
if margins is True:
570-
# reset index to ensure default index dtype
571-
if normalize == 'index':
572-
colnames = table.columns.names
573-
table.columns = Index(table.columns.tolist())
574-
table.columns.names = colnames
575-
if normalize == 'columns':
576-
rownames = table.index.names
577-
table.index = Index(table.index.tolist())
578-
table.index.names = rownames
579-
580587
return table
581588

582589

pandas/tests/reshape/test_pivot.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -1184,15 +1184,15 @@ def test_margin_dropna(self):
11841184
df = pd.DataFrame({'a': [1, 2, 2, 2, 2, np.nan],
11851185
'b': [3, 3, 4, 4, 4, 4]})
11861186
actual = pd.crosstab(df.a, df.b, margins=True, dropna=False)
1187-
expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 4, 6]])
1187+
expected = pd.DataFrame([[1, 0, 1], [1, 3, 4], [2, 3, 5]])
11881188
expected.index = Index([1.0, 2.0, 'All'], name='a')
11891189
expected.columns = Index([3, 4, 'All'], name='b')
11901190
tm.assert_frame_equal(actual, expected)
11911191

11921192
df = DataFrame({'a': [1, np.nan, np.nan, np.nan, 2, np.nan],
11931193
'b': [3, np.nan, 4, 4, 4, 4]})
11941194
actual = pd.crosstab(df.a, df.b, margins=True, dropna=False)
1195-
expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 4, 6]])
1195+
expected = pd.DataFrame([[1, 0, 1], [0, 1, 1], [1, 1, 2]])
11961196
expected.index = Index([1.0, 2.0, 'All'], name='a')
11971197
expected.columns = Index([3.0, 4.0, 'All'], name='b')
11981198
tm.assert_frame_equal(actual, expected)
@@ -1209,8 +1209,8 @@ def test_margin_dropna(self):
12091209
m = MultiIndex.from_arrays([['one', 'one', 'two', 'two', 'All'],
12101210
['dull', 'shiny', 'dull', 'shiny', '']],
12111211
names=['b', 'c'])
1212-
expected = DataFrame([[1, 0, 1, 0, 2], [2, 0, 1, 1, 5],
1213-
[3, 0, 2, 1, 7]], columns=m)
1212+
expected = DataFrame([[1, 0, 1, 0, 2], [2, 0, 1, 1, 4],
1213+
[3, 0, 2, 1, 6]], columns=m)
12141214
expected.index = Index(['bar', 'foo', 'All'], name='a')
12151215
tm.assert_frame_equal(actual, expected)
12161216

@@ -1220,7 +1220,7 @@ def test_margin_dropna(self):
12201220
['one', 'two', 'one', 'two', '']],
12211221
names=['a', 'b'])
12221222
expected = DataFrame([[1, 0, 1], [1, 0, 1], [2, 0, 2], [1, 1, 2],
1223-
[5, 2, 7]], index=m)
1223+
[5, 1, 6]], index=m)
12241224
expected.columns = Index(['dull', 'shiny', 'All'], name='c')
12251225
tm.assert_frame_equal(actual, expected)
12261226

@@ -1421,22 +1421,23 @@ def test_crosstab_errors(self):
14211421
df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
14221422
'c': [1, 1, np.nan, 1, 1]})
14231423

1424-
error = 'values cannot be used without an aggfunc.'
1424+
error = "values cannot be used without an aggfunc."
14251425
with tm.assert_raises_regex(ValueError, error):
14261426
pd.crosstab(df.a, df.b, values=df.c)
14271427

1428-
error = 'aggfunc cannot be used without values'
1428+
error = "aggfunc cannot be used without values"
14291429
with tm.assert_raises_regex(ValueError, error):
14301430
pd.crosstab(df.a, df.b, aggfunc=np.mean)
14311431

1432-
error = 'Not a valid normalize argument'
1432+
error = "Not a valid normalize argument: '42'"
14331433
with tm.assert_raises_regex(ValueError, error):
14341434
pd.crosstab(df.a, df.b, normalize='42')
14351435

1436+
error = "Not a valid normalize argument: 42"
14361437
with tm.assert_raises_regex(ValueError, error):
14371438
pd.crosstab(df.a, df.b, normalize=42)
14381439

1439-
error = 'Not a valid margins argument'
1440+
error = "Not a valid margins argument: 42"
14401441
with tm.assert_raises_regex(ValueError, error):
14411442
pd.crosstab(df.a, df.b, normalize='all', margins=42)
14421443

@@ -1495,6 +1496,7 @@ def test_crosstab_with_numpy_size(self):
14951496
expected = pd.DataFrame(expected_data,
14961497
index=expected_index,
14971498
columns=expected_column)
1499+
14981500
tm.assert_frame_equal(result, expected)
14991501

15001502

0 commit comments

Comments
 (0)