Skip to content

Commit bb494b7

Browse files
Nick Eubank, jreback
Nick Eubank
authored and committed
ENH: add normalization to crosstab
closes #12578 closes #12569
1 parent 445d1c6 commit bb494b7

File tree

4 files changed

+290
-5
lines changed

4 files changed

+290
-5
lines changed

doc/source/reshaping.rst

+47
Original file line numberDiff line numberDiff line change
@@ -383,9 +383,12 @@ calling ``to_string`` if you wish:
383383
384384
Note that ``pivot_table`` is also available as an instance method on DataFrame.
385385

386+
.. _reshaping.crosstabulations:
387+
386388
Cross tabulations
387389
~~~~~~~~~~~~~~~~~
388390

391+
389392
Use the ``crosstab`` function to compute a cross-tabulation of two (or more)
390393
factors. By default ``crosstab`` computes a frequency table of the factors
391394
unless an array of values and an aggregation function are passed.
@@ -402,6 +405,9 @@ It takes a number of arguments
402405
- ``colnames``: sequence, default None, if passed, must match number of column
403406
arrays passed
404407
- ``margins``: boolean, default False, Add row/column margins (subtotals)
408+
- ``normalize``: boolean, {'all', 'index', 'columns'}, or {0,1}, default False.
409+
Normalize by dividing all values by the sum of values.
410+
405411

406412
Any Series passed will have their name attributes used unless row or column
407413
names for the cross-tabulation are specified
@@ -416,6 +422,47 @@ For example:
416422
c = np.array([dull, dull, shiny, dull, dull, shiny], dtype=object)
417423
pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
418424
425+
426+
If ``crosstab`` receives only two Series, it will provide a frequency table.
427+
428+
.. ipython:: python
429+
430+
df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
431+
'c': [1, 1, np.nan, 1, 1]})
432+
df
433+
434+
pd.crosstab(df.a, df.b)
435+
436+
.. versionadded:: 0.18.1
437+
438+
Frequency tables can also be normalized to show proportions rather than counts
439+
using the ``normalize`` argument:
440+
441+
.. ipython:: python
442+
443+
pd.crosstab(df.a, df.b, normalize=True)
444+
445+
``normalize`` can also normalize values within each row or within each column:
446+
447+
.. ipython:: python
448+
449+
pd.crosstab(df.a, df.b, normalize='columns')
450+
451+
``crosstab`` can also be passed a third Series and an aggregation function
452+
(``aggfunc``) that will be applied to the values of the third Series within each
453+
group defined by the first two Series:
454+
455+
.. ipython:: python
456+
457+
pd.crosstab(df.a, df.b, values=df.c, aggfunc=np.sum)
458+
459+
And finally, one can also add margins or normalize this output.
460+
461+
.. ipython:: python
462+
463+
pd.crosstab(df.a, df.b, values=df.c, aggfunc=np.sum, normalize=True,
464+
margins=True)
465+
419466
.. _reshaping.pivot.margins:
420467

421468
Adding margins (partial aggregates)

doc/source/whatsnew/v0.18.1.txt

+4
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,8 @@ Other Enhancements
9494
idx = pd.Index(['a|b', 'a|c', 'b|c'])
9595
idx.str.get_dummies('|')
9696

97+
- ``pd.crosstab()`` has gained a ``normalize`` argument for normalizing frequency tables (:issue:`12569`). See the examples in the updated docs :ref:`here <reshaping.crosstabulations>`.
98+
9799

98100
.. _whatsnew_0181.sparse:
99101

@@ -364,6 +366,8 @@ Bug Fixes
364366
- Bug in ``.concat`` of datetime tz-aware and naive DataFrames (:issue:`12467`)
365367
- Bug in correctly raising a ``ValueError`` in ``.resample(..).fillna(..)`` when passing a non-string (:issue:`12952`)
366368
- Bug fixes in various encoding and header processing issues in ``pd.read_sas()`` (:issue:`12659`, :issue:`12654`, :issue:`12647`, :issue:`12809`)
369+
- Bug in ``pd.crosstab()`` where it would silently ignore ``aggfunc`` if ``values=None`` (:issue:`12569`).
370+
367371

368372
- Bug in ``Timestamp.__repr__`` that caused ``pprint`` to fail in nested structures (:issue:`12622`)
369373
- Bug in ``Timedelta.min`` and ``Timedelta.max``, the properties now report the true minimum/maximum ``timedeltas`` as recognized by Pandas. See :ref:`documentation <timedeltas.limitations>`. (:issue:`12727`)

pandas/tools/pivot.py

+95-5
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ def _convert_by(by):
371371

372372

373373
def crosstab(index, columns, values=None, rownames=None, colnames=None,
374-
aggfunc=None, margins=False, dropna=True):
374+
aggfunc=None, margins=False, dropna=True, normalize=False):
375375
"""
376376
Compute a simple cross-tabulation of two (or more) factors. By default
377377
computes a frequency table of the factors unless an array of values and an
@@ -384,9 +384,10 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
384384
columns : array-like, Series, or list of arrays/Series
385385
Values to group by in the columns
386386
values : array-like, optional
387-
Array of values to aggregate according to the factors
387+
Array of values to aggregate according to the factors.
388+
Requires `aggfunc` be specified.
388389
aggfunc : function, optional
389-
If no values array is passed, computes a frequency table
390+
If specified, requires `values` be specified as well
390391
rownames : sequence, default None
391392
If passed, must match number of row arrays passed
392393
colnames : sequence, default None
@@ -395,6 +396,16 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
395396
Add row/column margins (subtotals)
396397
dropna : boolean, default True
397398
Do not include columns whose entries are all NaN
399+
normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
400+
Normalize by dividing all values by the sum of values.
401+
402+
- If passed 'all' or `True`, will normalize over all values.
403+
- If passed 'index' will normalize over each row.
404+
- If passed 'columns' will normalize over each column.
405+
- If margins is `True`, will also normalize margin values.
406+
407+
.. versionadded:: 0.18.1
408+
398409
399410
Notes
400411
-----
@@ -438,18 +449,97 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
438449
data.update(zip(rownames, index))
439450
data.update(zip(colnames, columns))
440451

452+
if values is None and aggfunc is not None:
453+
raise ValueError("aggfunc cannot be used without values.")
454+
455+
if values is not None and aggfunc is None:
456+
raise ValueError("values cannot be used without an aggfunc.")
457+
441458
if values is None:
442459
df = DataFrame(data)
443460
df['__dummy__'] = 0
444461
table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
445462
aggfunc=len, margins=margins, dropna=dropna)
446-
return table.fillna(0).astype(np.int64)
463+
table = table.fillna(0).astype(np.int64)
464+
447465
else:
448466
data['__dummy__'] = values
449467
df = DataFrame(data)
450468
table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
451469
aggfunc=aggfunc, margins=margins, dropna=dropna)
452-
return table
470+
471+
# Post-process
472+
if normalize is not False:
473+
table = _normalize(table, normalize=normalize, margins=margins)
474+
475+
return table
476+
477+
478+
def _normalize(table, normalize, margins):
    """
    Rescale a cross-tabulation so that its cells are fractions of a total.

    Parameters
    ----------
    table : DataFrame
        Cross-tabulation to rescale. When `margins` is True it must contain
        an 'All' row and an 'All' column; they are split off, rescaled
        separately from the core table and re-attached.
    normalize : boolean, {'all', 'index', 'columns'}, or {0, 1}
        - 'all' or True : divide every cell by the grand total.
        - 'index' or 0 : divide every cell by its row total.
        - 'columns' or 1 : divide every cell by its column total.
    margins : boolean
        Whether `table` carries 'All' margins.

    Returns
    -------
    DataFrame
        The rescaled table, with NaN entries replaced by 0. With margins,
        'columns' re-attaches only the 'All' column, 'index' re-attaches
        only the 'All' row, and 'all' / True re-attaches both.

    Raises
    ------
    ValueError
        If `normalize` is not one of the accepted values (this includes
        unhashable objects such as lists), or if `margins` is not exactly
        True or False.
    """
    # Anything that is neither a bool nor a string is treated as an axis
    # number and translated to the corresponding axis name.
    if not isinstance(normalize, bool) and not isinstance(normalize,
                                                          compat.string_types):
        axis_subs = {0: 'index', 1: 'columns'}
        try:
            normalize = axis_subs[normalize]
        except (KeyError, TypeError):
            # TypeError is raised by the dict lookup for unhashable
            # arguments; report it like any other invalid value.
            raise ValueError("Not a valid normalize argument")

    if margins is False:

        # Actual Normalizations
        normalizers = {
            'all': lambda x: x / x.sum(axis=1).sum(axis=0),
            'columns': lambda x: x / x.sum(),
            'index': lambda x: x.div(x.sum(axis=1), axis=0)
        }

        normalizers[True] = normalizers['all']

        try:
            f = normalizers[normalize]
        except KeyError:
            raise ValueError("Not a valid normalize argument")

        table = f(table)
        table = table.fillna(0)

    elif margins is True:

        # Split the 'All' margins off so the core can be normalized alone.
        column_margin = table.loc[:, 'All'].drop('All')
        index_margin = table.loc['All', :].drop('All')
        table = table.drop('All', axis=1).drop('All')

        # Normalize core
        table = _normalize(table, normalize=normalize, margins=False)

        # Fix Margins
        if normalize == 'columns':
            column_margin = column_margin / column_margin.sum()
            table = concat([table, column_margin], axis=1)
            table = table.fillna(0)

        elif normalize == 'index':
            index_margin = index_margin / index_margin.sum()
            table = table.append(index_margin)
            table = table.fillna(0)

        elif normalize == "all" or normalize is True:
            column_margin = column_margin / column_margin.sum()
            index_margin = index_margin / index_margin.sum()
            # The bottom-right corner is the grand total, i.e. 1 after
            # normalization.
            index_margin.loc['All'] = 1
            table = concat([table, column_margin], axis=1)
            table = table.append(index_margin)

            table = table.fillna(0)

        else:
            raise ValueError("Not a valid normalize argument")

    else:
        raise ValueError("Not a valid margins argument")

    return table
453543

454544

455545
def _get_names(arrs, names, prefix='row'):

pandas/tools/tests/test_pivot.py

+144
Original file line numberDiff line numberDiff line change
@@ -1021,6 +1021,150 @@ def test_margin_dropna(self):
10211021
expected.columns = Index(['dull', 'shiny', 'All'], name='c')
10221022
tm.assert_frame_equal(actual, expected)
10231023

1024+
def test_crosstab_normalize(self):
    # Issue 12578: normalization of crosstab output, with and without
    # margins, for plain counts and for aggregated values.
    frame = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
                          'c': [1, 1, np.nan, 1, 1]})

    def tabulate(data, *extra, **options):
        # Cross-tabulate column 'a' against column 'b' of ``data``.
        return pd.crosstab(data.a, data.b, *extra, **options)

    rows = pd.Index([1, 2], name='a')
    cols = pd.Index([3, 4], name='b')
    whole = pd.DataFrame([[0.2, 0], [0.2, 0.6]], index=rows, columns=cols)
    by_row = pd.DataFrame([[1.0, 0], [0.25, 0.75]],
                          index=rows, columns=cols)
    by_col = pd.DataFrame([[0.5, 0], [0.5, 1.0]],
                          index=rows, columns=cols)

    # Every accepted spelling of the normalize argument
    plain_cases = [('all', whole), (True, whole),
                   ('index', by_row), ('columns', by_col)]
    for how, expected in plain_cases:
        tm.assert_frame_equal(tabulate(frame, normalize=how), expected)

    # Axis numbers must behave like the matching axis names
    for axis_number, axis_name in [(1, 'columns'), (0, 'index')]:
        tm.assert_frame_equal(tabulate(frame, normalize=axis_number),
                              tabulate(frame, normalize=axis_name))

    rows_all = pd.Index([1, 2, 'All'], name='a', dtype='object')
    cols_all = pd.Index([3, 4, 'All'], name='b')

    by_row_margins = pd.DataFrame([[1.0, 0],
                                   [0.25, 0.75],
                                   [0.4, 0.6]],
                                  index=rows_all,
                                  columns=pd.Index([3, 4], name='b'))
    by_col_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
                                  index=pd.Index([1, 2], name='a',
                                                 dtype='object'),
                                  columns=cols_all)
    whole_margins = pd.DataFrame([[0.2, 0, 0.2],
                                  [0.2, 0.6, 0.8],
                                  [0.4, 0.6, 1]],
                                 index=rows_all,
                                 columns=cols_all)

    margin_cases = [('index', by_row_margins),
                    ('columns', by_col_margins),
                    (True, whole_margins)]
    for how, expected in margin_cases:
        tm.assert_frame_equal(
            tabulate(frame, normalize=how, margins=True), expected)

    # Test arrays
    first_level = np.array([1, 1, 2, 2])
    second_level = np.array([1, 2, 1, 2])
    pd.crosstab([first_level, second_level], np.array([1, 2, 1, 2]))

    # Test with aggfunc
    expected_counts = pd.DataFrame([[0.25, 0, 0.25],
                                    [0.25, 0.5, 0.75],
                                    [0.5, 0.5, 1]],
                                   index=rows_all,
                                   columns=cols_all)
    observed = tabulate(frame, frame.c, aggfunc='count',
                        normalize='all', margins=True)
    tm.assert_frame_equal(observed, expected_counts)

    frame = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
                          'c': [0, 4, np.nan, 3, 3]})

    expected_sum = pd.DataFrame([[0, 0, 0.],
                                 [0.4, 0.6, 1],
                                 [0.4, 0.6, 1]],
                                index=rows_all,
                                columns=pd.Index([3, 4, 'All'],
                                                 name='b',
                                                 dtype='object'))
    observed = tabulate(frame, frame.c, aggfunc=np.sum,
                        normalize='all', margins=True)
    tm.assert_frame_equal(observed, expected_sum)
1115+
1116+
def test_crosstab_with_empties(self):
    # A values column holding only NaN: normalized output is all zeros,
    # while the un-normalized output keeps its NaN cell.
    frame = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
                          'c': [np.nan, np.nan, np.nan, np.nan, np.nan]})

    def count_c(how):
        # Count the non-missing 'c' values per ('a', 'b') cell.
        return pd.crosstab(frame.a, frame.b, values=frame.c,
                           aggfunc='count', normalize=how)

    all_zero = pd.DataFrame([[0.0, 0.0], [0.0, 0.0]],
                            index=pd.Index([1, 2],
                                           name='a',
                                           dtype='int64'),
                            columns=pd.Index([3, 4], name='b'))

    for how in [True, 'index', 'columns']:
        tm.assert_frame_equal(all_zero, count_c(how))

    with_nan = pd.DataFrame([[0.0, np.nan], [0.0, 0.0]],
                            index=pd.Index([1, 2],
                                           name='a',
                                           dtype='int64'),
                            columns=pd.Index([3, 4], name='b'))

    tm.assert_frame_equal(with_nan, count_c(False))
1141+
1142+
def test_crosstab_errors(self):
    # Issue 12578: each invalid argument combination must raise a
    # ValueError carrying the matching message.
    frame = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
                          'c': [1, 1, np.nan, 1, 1]})

    invalid_calls = [
        ('values cannot be used without an aggfunc.',
         dict(values=frame.c)),
        ('aggfunc cannot be used without values',
         dict(aggfunc=np.mean)),
        ('Not a valid normalize argument',
         dict(normalize='42')),
        ('Not a valid normalize argument',
         dict(normalize=42)),
        ('Not a valid margins argument',
         dict(normalize='all', margins=42)),
    ]

    for message, options in invalid_calls:
        with tm.assertRaisesRegexp(ValueError, message):
            pd.crosstab(frame.a, frame.b, **options)
1166+
1167+
10241168
if __name__ == '__main__':
10251169
import nose
10261170
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)