From e5015f8d4e84ce5256ae175aa4a27a2950a8ada0 Mon Sep 17 00:00:00 2001 From: Nick Eubank Date: Wed, 9 Mar 2016 13:34:52 -0800 Subject: [PATCH] add normalization to crosstab --- doc/source/reshaping.rst | 47 ++++++++++ doc/source/whatsnew/v0.18.1.txt | 4 +- pandas/tools/pivot.py | 100 +++++++++++++++++++-- pandas/tools/tests/test_pivot.py | 144 +++++++++++++++++++++++++++++++ 4 files changed, 289 insertions(+), 6 deletions(-) diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 611771a47c233..f21b936df93f6 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -383,9 +383,12 @@ calling ``to_string`` if you wish: Note that ``pivot_table`` is also available as an instance method on DataFrame. +.. _reshaping.crosstabulations: + Cross tabulations ~~~~~~~~~~~~~~~~~ + Use the ``crosstab`` function to compute a cross-tabulation of two (or more) factors. By default ``crosstab`` computes a frequency table of the factors unless an array of values and an aggregation function are passed. @@ -402,6 +405,9 @@ It takes a number of arguments - ``colnames``: sequence, default None, if passed, must match number of column arrays passed - ``margins``: boolean, default False, Add row/column margins (subtotals) +- ``normalize``: boolean, {'all', 'index', 'columns'}, or {0,1}, default False. + Normalize by dividing all values by the sum of values. + Any Series passed will have their name attributes used unless row or column names for the cross-tabulation are specified @@ -416,6 +422,47 @@ For example: c = np.array([dull, dull, shiny, dull, dull, shiny], dtype=object) pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c']) + +If ``crosstab`` receives only two Series, it will provide a frequency table. + +.. ipython:: python + + df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + 'c': [1, 1, np.nan, 1, 1]}) + df + + pd.crosstab(df.a, df.b) + +.. 
versionadded:: 0.18.1 + +Frequency tables can also be normalized to show percentages rather than counts +using the ``normalize`` argument: + +.. ipython:: python + + pd.crosstab(df.a, df.b, normalize=True) + +``normalize`` can also normalize values within each row or within each column: + +.. ipython:: python + + pd.crosstab(df.a, df.b, normalize='columns') + +``crosstab`` can also be passed a third Series and an aggregation function +(``aggfunc``) that will be applied to the values of the third Series within each +group defined by the first two Series: + +.. ipython:: python + + pd.crosstab(df.a, df.b, values=df.c, aggfunc=np.sum) + +And finally, one can also add margins or normalize this output. + +.. ipython:: python + + pd.crosstab(df.a, df.b, values=df.c, aggfunc=np.sum, normalize=True, + margins=True) + .. _reshaping.pivot.margins: Adding margins (partial aggregates) diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt index 821f093083026..1455283a0e961 100644 --- a/doc/source/whatsnew/v0.18.1.txt +++ b/doc/source/whatsnew/v0.18.1.txt @@ -93,6 +93,8 @@ Other Enhancements idx = pd.Index(['a|b', 'a|c', 'b|c']) idx.str.get_dummies('|') +- ``pd.crosstab()`` has gained a ``normalize`` argument for normalizing frequency tables (:issue:`12569`). Examples in updated docs :ref:`here <reshaping.crosstabulations>`. + .. _whatsnew_0181.sparse: @@ -277,9 +279,9 @@ Bug Fixes - Bug in ``__name__`` of ``.cum*`` functions (:issue:`12021`) - Bug in ``.astype()`` of a ``Float64Inde/Int64Index`` to an ``Int64Index`` (:issue:`12881`) - Bug in roundtripping an integer based index in ``.to_json()/.read_json()`` when ``orient='index'`` (the default) (:issue:`12866`) - - Bug in ``.drop()`` with a non-unique ``MultiIndex``. (:issue:`12701`) - Bug in ``.concat`` of datetime tz-aware and naive DataFrames (:issue:`12467`) +- Bug in ``pd.crosstab()`` where it would silently ignore ``aggfunc`` if ``values=None`` (:issue:`12569`). 
- Bug in ``Timestamp.__repr__`` that caused ``pprint`` to fail in nested structures (:issue:`12622`) diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py index d7798bf1e7982..de79e54e22270 100644 --- a/pandas/tools/pivot.py +++ b/pandas/tools/pivot.py @@ -371,7 +371,7 @@ def _convert_by(by): def crosstab(index, columns, values=None, rownames=None, colnames=None, - aggfunc=None, margins=False, dropna=True): + aggfunc=None, margins=False, dropna=True, normalize=False): """ Compute a simple cross-tabulation of two (or more) factors. By default computes a frequency table of the factors unless an array of values and an @@ -384,9 +384,10 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, columns : array-like, Series, or list of arrays/Series Values to group by in the columns values : array-like, optional - Array of values to aggregate according to the factors + Array of values to aggregate according to the factors. + Requires `aggfunc` be specified. aggfunc : function, optional - If no values array is passed, computes a frequency table + If specified, requires `values` be specified as well rownames : sequence, default None If passed, must match number of row arrays passed colnames : sequence, default None @@ -395,6 +396,16 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, Add row/column margins (subtotals) dropna : boolean, default True Do not include columns whose entries are all NaN + normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False + Normalize by dividing all values by the sum of values. + + - If passed 'all' or `True`, will normalize over all values. + - If passed 'index' will normalize over each row. + - If passed 'columns' will normalize over each column. + - If margins is `True`, will also normalize margin values. + + .. 
versionadded:: 0.18.1 + Notes ----- @@ -438,18 +449,97 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None, data.update(zip(rownames, index)) data.update(zip(colnames, columns)) + if values is None and aggfunc is not None: + raise ValueError("aggfunc cannot be used without values.") + + if values is not None and aggfunc is None: + raise ValueError("values cannot be used without an aggfunc.") + if values is None: df = DataFrame(data) df['__dummy__'] = 0 table = df.pivot_table('__dummy__', index=rownames, columns=colnames, aggfunc=len, margins=margins, dropna=dropna) - return table.fillna(0).astype(np.int64) + table = table.fillna(0).astype(np.int64) + else: data['__dummy__'] = values df = DataFrame(data) table = df.pivot_table('__dummy__', index=rownames, columns=colnames, aggfunc=aggfunc, margins=margins, dropna=dropna) - return table + + # Post-process + if normalize is not False: + table = _normalize(table, normalize=normalize, margins=margins) + + return table + + +def _normalize(table, normalize, margins): + + if not isinstance(normalize, bool) and not isinstance(normalize, + compat.string_types): + axis_subs = {0: 'index', 1: 'columns'} + try: + normalize = axis_subs[normalize] + except KeyError: + raise ValueError("Not a valid normalize argument") + + if margins is False: + + # Actual Normalizations + normalizers = { + 'all': lambda x: x / x.sum(axis=1).sum(axis=0), + 'columns': lambda x: x / x.sum(), + 'index': lambda x: x.div(x.sum(axis=1), axis=0) + } + + normalizers[True] = normalizers['all'] + + try: + f = normalizers[normalize] + except KeyError: + raise ValueError("Not a valid normalize argument") + + table = f(table) + table = table.fillna(0) + + elif margins is True: + + column_margin = table.loc[:, 'All'].drop('All') + index_margin = table.loc['All', :].drop('All') + table = table.drop('All', axis=1).drop('All') + + # Normalize core + table = _normalize(table, normalize=normalize, margins=False) + + # Fix Margins + if normalize 
== 'columns': + column_margin = column_margin / column_margin.sum() + table = concat([table, column_margin], axis=1) + table = table.fillna(0) + + elif normalize == 'index': + index_margin = index_margin / index_margin.sum() + table = table.append(index_margin) + table = table.fillna(0) + + elif normalize == "all" or normalize is True: + column_margin = column_margin / column_margin.sum() + index_margin = index_margin / index_margin.sum() + index_margin.loc['All'] = 1 + table = concat([table, column_margin], axis=1) + table = table.append(index_margin) + + table = table.fillna(0) + + else: + raise ValueError("Not a valid normalize argument") + + else: + raise ValueError("Not a valid margins argument") + + return table def _get_names(arrs, names, prefix='row'): diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py index bff82e32dccc0..5ebd2e4f693cf 100644 --- a/pandas/tools/tests/test_pivot.py +++ b/pandas/tools/tests/test_pivot.py @@ -1021,6 +1021,150 @@ def test_margin_dropna(self): expected.columns = Index(['dull', 'shiny', 'All'], name='c') tm.assert_frame_equal(actual, expected) + def test_crosstab_normalize(self): + # Issue 12578 + df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + 'c': [1, 1, np.nan, 1, 1]}) + + rindex = pd.Index([1, 2], name='a') + cindex = pd.Index([3, 4], name='b') + full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]], + index=rindex, columns=cindex) + row_normal = pd.DataFrame([[1.0, 0], [0.25, 0.75]], + index=rindex, columns=cindex) + col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]], + index=rindex, columns=cindex) + + # Check all normalize args + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='all'), + full_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True), + full_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index'), + row_normal) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns'), + col_normal) + 
tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=1), + pd.crosstab(df.a, df.b, normalize='columns')) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=0), + pd.crosstab(df.a, df.b, normalize='index')) + + row_normal_margins = pd.DataFrame([[1.0, 0], + [0.25, 0.75], + [0.4, 0.6]], + index=pd.Index([1, 2, 'All'], + name='a', + dtype='object'), + columns=pd.Index([3, 4], name='b')) + col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]], + index=pd.Index([1, 2], name='a', + dtype='object'), + columns=pd.Index([3, 4, 'All'], + name='b')) + + all_normal_margins = pd.DataFrame([[0.2, 0, 0.2], + [0.2, 0.6, 0.8], + [0.4, 0.6, 1]], + index=pd.Index([1, 2, 'All'], + name='a', + dtype='object'), + columns=pd.Index([3, 4, 'All'], + name='b')) + + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index', + margins=True), row_normal_margins) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns', + margins=True), col_normal_margins) + tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True, + margins=True), all_normal_margins) + + # Test arrays + pd.crosstab([np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])], + np.array([1, 2, 1, 2])) + + # Test with aggfunc + norm_counts = pd.DataFrame([[0.25, 0, 0.25], + [0.25, 0.5, 0.75], + [0.5, 0.5, 1]], + index=pd.Index([1, 2, 'All'], + name='a', + dtype='object'), + columns=pd.Index([3, 4, 'All'], + name='b')) + test_case = pd.crosstab(df.a, df.b, df.c, aggfunc='count', + normalize='all', + margins=True) + tm.assert_frame_equal(test_case, norm_counts) + + df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + 'c': [0, 4, np.nan, 3, 3]}) + + norm_sum = pd.DataFrame([[0, 0, 0.], + [0.4, 0.6, 1], + [0.4, 0.6, 1]], + index=pd.Index([1, 2, 'All'], + name='a', + dtype='object'), + columns=pd.Index([3, 4, 'All'], + name='b', + dtype='object')) + test_case = pd.crosstab(df.a, df.b, df.c, aggfunc=np.sum, + normalize='all', + margins=True) + tm.assert_frame_equal(test_case, norm_sum) + + 
def test_crosstab_with_empties(self): + # Check handling of empties + df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + 'c': [np.nan, np.nan, np.nan, np.nan, np.nan]}) + + empty = pd.DataFrame([[0.0, 0.0], [0.0, 0.0]], + index=pd.Index([1, 2], + name='a', + dtype='int64'), + columns=pd.Index([3, 4], name='b')) + + for i in [True, 'index', 'columns']: + calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', + normalize=i) + tm.assert_frame_equal(empty, calculated) + + nans = pd.DataFrame([[0.0, np.nan], [0.0, 0.0]], + index=pd.Index([1, 2], + name='a', + dtype='int64'), + columns=pd.Index([3, 4], name='b')) + + calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count', + normalize=False) + tm.assert_frame_equal(nans, calculated) + + def test_crosstab_errors(self): + # Issue 12578 + + df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4], + 'c': [1, 1, np.nan, 1, 1]}) + + error = 'values cannot be used without an aggfunc.' + with tm.assertRaisesRegexp(ValueError, error): + pd.crosstab(df.a, df.b, values=df.c) + + error = 'aggfunc cannot be used without values' + with tm.assertRaisesRegexp(ValueError, error): + pd.crosstab(df.a, df.b, aggfunc=np.mean) + + error = 'Not a valid normalize argument' + with tm.assertRaisesRegexp(ValueError, error): + pd.crosstab(df.a, df.b, normalize='42') + + with tm.assertRaisesRegexp(ValueError, error): + pd.crosstab(df.a, df.b, normalize=42) + + error = 'Not a valid margins argument' + with tm.assertRaisesRegexp(ValueError, error): + pd.crosstab(df.a, df.b, normalize='all', margins=42) + + if __name__ == '__main__': import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],