ENH: add normalization to crosstab

Nick Eubank · jreback · commit bb494b77ce1e · 2016-04-25T10:33:50.000-04:00
closes #12578 closes #12569
diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst
@@ -383,9 +383,12 @@ calling ``to_string`` if you wish:
 
 Note that ``pivot_table`` is also available as an instance method on DataFrame.
 
+.. _reshaping.crosstabulations:
+
 Cross tabulations
 ~~~~~~~~~~~~~~~~~
 
+
 Use the ``crosstab`` function to compute a cross-tabulation of two (or more)
 factors. By default ``crosstab`` computes a frequency table of the factors
 unless an array of values and an aggregation function are passed.
@@ -402,6 +405,9 @@ It takes a number of arguments
 - ``colnames``: sequence, default None, if passed, must match number of column
   arrays passed
 - ``margins``: boolean, default False, Add row/column margins (subtotals)
+- ``normalize``: boolean, {'all', 'index', 'columns'}, or {0,1}, default False.
+  Normalize by dividing all values by the sum of values.
+
 
 Any Series passed will have their name attributes used unless row or column
 names for the cross-tabulation are specified
@@ -416,6 +422,47 @@ For example:
     c = np.array([dull, dull, shiny, dull, dull, shiny], dtype=object)
     pd.crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'])
 
+
+If ``crosstab`` receives only two Series, it will provide a frequency table.
+
+.. ipython:: python
+
+    df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
+                       'c': [1, 1, np.nan, 1, 1]})
+    df
+
+    pd.crosstab(df.a, df.b)
+
+.. versionadded:: 0.18.1
+
+Frequency tables can also be normalized to show proportions rather than counts
+using the ``normalize`` argument:
+
+.. ipython:: python
+
+   pd.crosstab(df.a, df.b, normalize=True)
+
+``normalize`` can also normalize values within each row or within each column:
+
+.. ipython:: python
+
+   pd.crosstab(df.a, df.b, normalize='columns')
+
+``crosstab`` can also be passed a third Series and an aggregation function
+(``aggfunc``) that will be applied to the values of the third Series within each
+group defined by the first two Series:
+
+.. ipython:: python
+
+   pd.crosstab(df.a, df.b, values=df.c, aggfunc=np.sum)
+
+And finally, one can also add margins or normalize this output.
+
+.. ipython:: python
+
+   pd.crosstab(df.a, df.b, values=df.c, aggfunc=np.sum, normalize=True,
+               margins=True)
+
 .. _reshaping.pivot.margins:
 
 Adding margins (partial aggregates)
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -94,6 +94,8 @@ Other Enhancements
    idx = pd.Index(['a|b', 'a|c', 'b|c'])
    idx.str.get_dummies('|')
 
+- ``pd.crosstab()`` has gained a ``normalize`` argument for normalizing frequency tables (:issue:`12569`). Examples in the updated docs :ref:`here <reshaping.crosstabulations>`.
+
 
 .. _whatsnew_0181.sparse:
 
@@ -364,6 +366,8 @@ Bug Fixes
 - Bug in ``.concat`` of datetime tz-aware and naive DataFrames (:issue:`12467`)
 - Bug in correctly raising a ``ValueError`` in ``.resample(..).fillna(..)`` when passing a non-string (:issue:`12952`)
 - Bug fixes in various encoding and header processing issues in ``pd.read_sas()`` (:issue:`12659`, :issue:`12654`, :issue:`12647`, :issue:`12809`)
+- Bug in ``pd.crosstab()`` where it would silently ignore ``aggfunc`` if ``values=None`` (:issue:`12569`).
+
 
 - Bug in ``Timestamp.__repr__`` that caused ``pprint`` to fail in nested structures (:issue:`12622`)
 - Bug in ``Timedelta.min`` and ``Timedelta.max``, the properties now report the true minimum/maximum ``timedeltas`` as recognized by Pandas. See :ref:`documentation <timedeltas.limitations>`. (:issue:`12727`)
diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py
@@ -371,7 +371,7 @@ def _convert_by(by):
 
 
 def crosstab(index, columns, values=None, rownames=None, colnames=None,
-             aggfunc=None, margins=False, dropna=True):
+             aggfunc=None, margins=False, dropna=True, normalize=False):
     """
     Compute a simple cross-tabulation of two (or more) factors. By default
     computes a frequency table of the factors unless an array of values and an
@@ -384,9 +384,10 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
     columns : array-like, Series, or list of arrays/Series
         Values to group by in the columns
     values : array-like, optional
-        Array of values to aggregate according to the factors
+        Array of values to aggregate according to the factors.
+        Requires `aggfunc` be specified.
     aggfunc : function, optional
-        If no values array is passed, computes a frequency table
+        If specified, requires `values` be specified as well
     rownames : sequence, default None
         If passed, must match number of row arrays passed
     colnames : sequence, default None
@@ -395,6 +396,16 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
         Add row/column margins (subtotals)
     dropna : boolean, default True
         Do not include columns whose entries are all NaN
+    normalize : boolean, {'all', 'index', 'columns'}, or {0,1}, default False
+        Normalize by dividing all values by the sum of values.
+
+        - If passed 'all' or `True`, will normalize over all values.
+        - If passed 'index' or 0, will normalize over each row.
+        - If passed 'columns' or 1, will normalize over each column.
+        - If margins is `True`, will also normalize margin values.
+
+        .. versionadded:: 0.18.1
+
 
     Notes
     -----
@@ -438,18 +449,97 @@ def crosstab(index, columns, values=None, rownames=None, colnames=None,
     data.update(zip(rownames, index))
     data.update(zip(colnames, columns))
 
+    if values is None and aggfunc is not None:
+        raise ValueError("aggfunc cannot be used without values.")
+
+    if values is not None and aggfunc is None:
+        raise ValueError("values cannot be used without an aggfunc.")
+
     if values is None:
         df = DataFrame(data)
         df['__dummy__'] = 0
         table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                                aggfunc=len, margins=margins, dropna=dropna)
-        return table.fillna(0).astype(np.int64)
+        table = table.fillna(0).astype(np.int64)
+
     else:
         data['__dummy__'] = values
         df = DataFrame(data)
         table = df.pivot_table('__dummy__', index=rownames, columns=colnames,
                                aggfunc=aggfunc, margins=margins, dropna=dropna)
-        return table
+
+    # Post-process
+    if normalize is not False:
+        table = _normalize(table, normalize=normalize, margins=margins)
+
+    return table
+
+
+def _normalize(table, normalize, margins):
+
+    if not isinstance(normalize, bool) and not isinstance(normalize,
+                                                          compat.string_types):
+        axis_subs = {0: 'index', 1: 'columns'}
+        try:
+            normalize = axis_subs[normalize]
+        except KeyError:
+            raise ValueError("Not a valid normalize argument")
+
+    if margins is False:
+
+        # Actual Normalizations
+        normalizers = {
+            'all': lambda x: x / x.sum(axis=1).sum(axis=0),
+            'columns': lambda x: x / x.sum(),
+            'index': lambda x: x.div(x.sum(axis=1), axis=0)
+        }
+
+        normalizers[True] = normalizers['all']
+
+        try:
+            f = normalizers[normalize]
+        except KeyError:
+            raise ValueError("Not a valid normalize argument")
+
+        table = f(table)
+        table = table.fillna(0)
+
+    elif margins is True:
+
+        column_margin = table.loc[:, 'All'].drop('All')
+        index_margin = table.loc['All', :].drop('All')
+        table = table.drop('All', axis=1).drop('All')
+
+        # Normalize core
+        table = _normalize(table, normalize=normalize, margins=False)
+
+        # Fix Margins
+        if normalize == 'columns':
+            column_margin = column_margin / column_margin.sum()
+            table = concat([table, column_margin], axis=1)
+            table = table.fillna(0)
+
+        elif normalize == 'index':
+            index_margin = index_margin / index_margin.sum()
+            table = table.append(index_margin)
+            table = table.fillna(0)
+
+        elif normalize == "all" or normalize is True:
+            column_margin = column_margin / column_margin.sum()
+            index_margin = index_margin / index_margin.sum()
+            index_margin.loc['All'] = 1
+            table = concat([table, column_margin], axis=1)
+            table = table.append(index_margin)
+
+            table = table.fillna(0)
+
+        else:
+            raise ValueError("Not a valid normalize argument")
+
+    else:
+        raise ValueError("Not a valid margins argument")
+
+    return table
 
 
 def _get_names(arrs, names, prefix='row'):
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
@@ -1021,6 +1021,150 @@ def test_margin_dropna(self):
         expected.columns = Index(['dull', 'shiny', 'All'], name='c')
         tm.assert_frame_equal(actual, expected)
 
+    def test_crosstab_normalize(self):
+        # Issue 12578
+        df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
+                           'c': [1, 1, np.nan, 1, 1]})
+
+        rindex = pd.Index([1, 2], name='a')
+        cindex = pd.Index([3, 4], name='b')
+        full_normal = pd.DataFrame([[0.2, 0], [0.2, 0.6]],
+                                   index=rindex, columns=cindex)
+        row_normal = pd.DataFrame([[1.0, 0], [0.25, 0.75]],
+                                  index=rindex, columns=cindex)
+        col_normal = pd.DataFrame([[0.5, 0], [0.5, 1.0]],
+                                  index=rindex, columns=cindex)
+
+        # Check all normalize args
+        tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='all'),
+                              full_normal)
+        tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True),
+                              full_normal)
+        tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index'),
+                              row_normal)
+        tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns'),
+                              col_normal)
+        tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=1),
+                              pd.crosstab(df.a, df.b, normalize='columns'))
+        tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=0),
+                              pd.crosstab(df.a, df.b, normalize='index'))
+
+        row_normal_margins = pd.DataFrame([[1.0, 0],
+                                          [0.25, 0.75],
+                                          [0.4, 0.6]],
+                                          index=pd.Index([1, 2, 'All'],
+                                                         name='a',
+                                                         dtype='object'),
+                                          columns=pd.Index([3, 4], name='b'))
+        col_normal_margins = pd.DataFrame([[0.5, 0, 0.2], [0.5, 1.0, 0.8]],
+                                          index=pd.Index([1, 2], name='a',
+                                                         dtype='object'),
+                                          columns=pd.Index([3, 4, 'All'],
+                                                           name='b'))
+
+        all_normal_margins = pd.DataFrame([[0.2, 0, 0.2],
+                                          [0.2, 0.6, 0.8],
+                                          [0.4, 0.6, 1]],
+                                          index=pd.Index([1, 2, 'All'],
+                                                         name='a',
+                                                         dtype='object'),
+                                          columns=pd.Index([3, 4, 'All'],
+                                                           name='b'))
+
+        tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='index',
+                                          margins=True), row_normal_margins)
+        tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize='columns',
+                                          margins=True), col_normal_margins)
+        tm.assert_frame_equal(pd.crosstab(df.a, df.b, normalize=True,
+                                          margins=True), all_normal_margins)
+
+        # Test arrays
+        pd.crosstab([np.array([1, 1, 2, 2]), np.array([1, 2, 1, 2])],
+                    np.array([1, 2, 1, 2]))
+
+        # Test with aggfunc
+        norm_counts = pd.DataFrame([[0.25, 0, 0.25],
+                                    [0.25, 0.5, 0.75],
+                                    [0.5, 0.5, 1]],
+                                   index=pd.Index([1, 2, 'All'],
+                                                  name='a',
+                                                  dtype='object'),
+                                   columns=pd.Index([3, 4, 'All'],
+                                                    name='b'))
+        test_case = pd.crosstab(df.a, df.b, df.c, aggfunc='count',
+                                normalize='all',
+                                margins=True)
+        tm.assert_frame_equal(test_case, norm_counts)
+
+        df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
+                           'c': [0, 4, np.nan, 3, 3]})
+
+        norm_sum = pd.DataFrame([[0, 0, 0.],
+                                 [0.4, 0.6, 1],
+                                 [0.4, 0.6, 1]],
+                                index=pd.Index([1, 2, 'All'],
+                                               name='a',
+                                               dtype='object'),
+                                columns=pd.Index([3, 4, 'All'],
+                                                 name='b',
+                                                 dtype='object'))
+        test_case = pd.crosstab(df.a, df.b, df.c, aggfunc=np.sum,
+                                normalize='all',
+                                margins=True)
+        tm.assert_frame_equal(test_case, norm_sum)
+
+    def test_crosstab_with_empties(self):
+        # Check handling of empties
+        df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
+                           'c': [np.nan, np.nan, np.nan, np.nan, np.nan]})
+
+        empty = pd.DataFrame([[0.0, 0.0], [0.0, 0.0]],
+                             index=pd.Index([1, 2],
+                                            name='a',
+                                            dtype='int64'),
+                             columns=pd.Index([3, 4], name='b'))
+
+        for i in [True, 'index', 'columns']:
+            calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count',
+                                     normalize=i)
+            tm.assert_frame_equal(empty, calculated)
+
+        nans = pd.DataFrame([[0.0, np.nan], [0.0, 0.0]],
+                            index=pd.Index([1, 2],
+                                           name='a',
+                                           dtype='int64'),
+                            columns=pd.Index([3, 4], name='b'))
+
+        calculated = pd.crosstab(df.a, df.b, values=df.c, aggfunc='count',
+                                 normalize=False)
+        tm.assert_frame_equal(nans, calculated)
+
+    def test_crosstab_errors(self):
+        # Issue 12578
+
+        df = pd.DataFrame({'a': [1, 2, 2, 2, 2], 'b': [3, 3, 4, 4, 4],
+                           'c': [1, 1, np.nan, 1, 1]})
+
+        error = 'values cannot be used without an aggfunc.'
+        with tm.assertRaisesRegexp(ValueError, error):
+            pd.crosstab(df.a, df.b, values=df.c)
+
+        error = 'aggfunc cannot be used without values'
+        with tm.assertRaisesRegexp(ValueError, error):
+            pd.crosstab(df.a, df.b, aggfunc=np.mean)
+
+        error = 'Not a valid normalize argument'
+        with tm.assertRaisesRegexp(ValueError, error):
+            pd.crosstab(df.a, df.b, normalize='42')
+
+        with tm.assertRaisesRegexp(ValueError, error):
+            pd.crosstab(df.a, df.b, normalize=42)
+
+        error = 'Not a valid margins argument'
+        with tm.assertRaisesRegexp(ValueError, error):
+            pd.crosstab(df.a, df.b, normalize='all', margins=42)
+
+
 if __name__ == '__main__':
     import nose
     nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],