-
-
Notifications
You must be signed in to change notification settings - Fork 18.4k
ENH: #3335 Pivot table support for setting name of margins column. #10296
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,20 +1,22 @@ | ||
# pylint: disable=E1103 | ||
|
||
import warnings | ||
|
||
from pandas import Series, DataFrame | ||
from pandas.core.index import MultiIndex, Index | ||
from pandas.core.groupby import Grouper | ||
from pandas.tools.merge import concat | ||
from pandas.tools.util import cartesian_product | ||
from pandas.compat import range, lrange, zip | ||
from pandas.util.decorators import deprecate_kwarg | ||
from pandas import compat | ||
import pandas.core.common as com | ||
import numpy as np | ||
|
||
DEFAULT_MARGIN_COLUMN_NAME = 'All' | ||
|
||
|
||
def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', | ||
fill_value=None, margins=False, dropna=True): | ||
fill_value=None, margins=False, dropna=True, | ||
margins_column=DEFAULT_MARGIN_COLUMN_NAME): | ||
""" | ||
Create a spreadsheet-style pivot table as a DataFrame. The levels in the | ||
pivot table will be stored in MultiIndex objects (hierarchical indexes) on | ||
|
@@ -40,6 +42,9 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', | |
Add all row / columns (e.g. for subtotal / grand totals) | ||
dropna : boolean, default True | ||
Do not include columns whose entries are all NaN | ||
margins_column : string, default 'All' | ||
Name of the row / column that will contain the totals | ||
when margins is True. | ||
|
||
Examples | ||
-------- | ||
|
@@ -127,7 +132,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', | |
m = MultiIndex.from_arrays(cartesian_product(table.columns.levels)) | ||
table = table.reindex_axis(m, axis=1) | ||
except AttributeError: | ||
pass # it's a single level or a series | ||
pass # it's a single level or a series | ||
|
||
if isinstance(table, DataFrame): | ||
if isinstance(table.columns, MultiIndex): | ||
|
@@ -140,7 +145,8 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', | |
|
||
if margins: | ||
table = _add_margins(table, data, values, rows=index, | ||
cols=columns, aggfunc=aggfunc) | ||
cols=columns, aggfunc=aggfunc, | ||
margins_column=margins_column) | ||
|
||
# discard the top level | ||
if values_passed and not values_multi: | ||
|
@@ -155,28 +161,50 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', | |
DataFrame.pivot_table = pivot_table | ||
|
||
|
||
def _add_margins(table, data, values, rows, cols, aggfunc): | ||
def _add_margins(table, data, values, rows, cols, aggfunc, | ||
margins_column=DEFAULT_MARGIN_COLUMN_NAME): | ||
exception_message = 'Must choose different value for margins_column' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. maybe |
||
for level in table.index.names: | ||
if margins_column in table.index.get_level_values(level): | ||
raise ValueError(exception_message) | ||
# could be passed a Series object with no 'columns' | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. You can move this a bit lower (e.g. past line 176). |
||
if hasattr(table, 'columns'): | ||
for level in table.columns.names[1:]: | ||
if margins_column in table.columns.get_level_values(level): | ||
raise ValueError(exception_message) | ||
|
||
grand_margin = _compute_grand_margin(data, values, aggfunc) | ||
grand_margin = _compute_grand_margin(data, values, aggfunc, margins_column) | ||
|
||
if not values and isinstance(table, Series): | ||
# If there are no values and the table is a series, then there is only | ||
# one column in the data. Compute grand margin and return it. | ||
row_key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All' | ||
return table.append(Series({row_key: grand_margin['All']})) | ||
|
||
if len(rows) > 1: | ||
row_key = (margins_column,) + ('',) * (len(rows) - 1) | ||
else: | ||
row_key = margins_column | ||
|
||
return table.append(Series({row_key: grand_margin[margins_column]})) | ||
|
||
if values: | ||
marginal_result_set = _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin) | ||
marginal_result_set = _generate_marginal_results(table, data, values, | ||
rows, cols, aggfunc, | ||
grand_margin, | ||
margins_column) | ||
if not isinstance(marginal_result_set, tuple): | ||
return marginal_result_set | ||
result, margin_keys, row_margin = marginal_result_set | ||
else: | ||
marginal_result_set = _generate_marginal_results_without_values(table, data, rows, cols, aggfunc) | ||
marginal_result_set = _generate_marginal_results_without_values( | ||
table, data, rows, cols, aggfunc, margins_column) | ||
if not isinstance(marginal_result_set, tuple): | ||
return marginal_result_set | ||
result, margin_keys, row_margin = marginal_result_set | ||
|
||
key = ('All',) + ('',) * (len(rows) - 1) if len(rows) > 1 else 'All' | ||
if len(rows) > 1: | ||
key = (margins_column,) + ('',) * (len(rows) - 1) | ||
else: | ||
key = margins_column | ||
|
||
row_margin = row_margin.reindex(result.columns) | ||
# populate grand margin | ||
|
@@ -195,7 +223,8 @@ def _add_margins(table, data, values, rows, cols, aggfunc): | |
return result | ||
|
||
|
||
def _compute_grand_margin(data, values, aggfunc): | ||
def _compute_grand_margin(data, values, aggfunc, | ||
margins_column=DEFAULT_MARGIN_COLUMN_NAME): | ||
|
||
if values: | ||
grand_margin = {} | ||
|
@@ -214,17 +243,19 @@ def _compute_grand_margin(data, values, aggfunc): | |
pass | ||
return grand_margin | ||
else: | ||
return {'All': aggfunc(data.index)} | ||
return {margins_column: aggfunc(data.index)} | ||
|
||
|
||
def _generate_marginal_results(table, data, values, rows, cols, aggfunc, grand_margin): | ||
def _generate_marginal_results(table, data, values, rows, cols, aggfunc, | ||
grand_margin, | ||
margins_column=DEFAULT_MARGIN_COLUMN_NAME): | ||
if len(cols) > 0: | ||
# need to "interleave" the margins | ||
table_pieces = [] | ||
margin_keys = [] | ||
|
||
def _all_key(key): | ||
return (key, 'All') + ('',) * (len(cols) - 1) | ||
return (key, margins_column) + ('',) * (len(cols) - 1) | ||
|
||
if len(rows) > 0: | ||
margin = data[rows + values].groupby(rows).agg(aggfunc) | ||
|
@@ -269,15 +300,17 @@ def _all_key(key): | |
return result, margin_keys, row_margin | ||
|
||
|
||
def _generate_marginal_results_without_values(table, data, rows, cols, aggfunc): | ||
def _generate_marginal_results_without_values( | ||
table, data, rows, cols, aggfunc, | ||
margins_column=DEFAULT_MARGIN_COLUMN_NAME): | ||
if len(cols) > 0: | ||
# need to "interleave" the margins | ||
margin_keys = [] | ||
|
||
def _all_key(): | ||
if len(cols) == 1: | ||
return 'All' | ||
return ('All', ) + ('', ) * (len(cols) - 1) | ||
return margins_column | ||
return (margins_column, ) + ('', ) * (len(cols) - 1) | ||
|
||
if len(rows) > 0: | ||
margin = data[rows].groupby(rows).apply(aggfunc) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,6 +7,7 @@ | |
from pandas import DataFrame, Series, Index, MultiIndex, Grouper | ||
from pandas.tools.merge import concat | ||
from pandas.tools.pivot import pivot_table, crosstab | ||
from pandas.tools.pivot import DEFAULT_MARGIN_COLUMN_NAME | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. you know it's defaulted to |
||
from pandas.compat import range, u, product | ||
import pandas.util.testing as tm | ||
|
||
|
@@ -224,82 +225,106 @@ def test_pivot_with_tz(self): | |
tm.assert_frame_equal(pv, expected) | ||
|
||
def test_margins(self): | ||
def _check_output(res, col, index=['A', 'B'], columns=['C']): | ||
cmarg = res['All'][:-1] | ||
exp = self.data.groupby(index)[col].mean() | ||
tm.assert_series_equal(cmarg, exp, check_names=False) | ||
self.assertEqual(cmarg.name, 'All') | ||
|
||
res = res.sortlevel() | ||
rmarg = res.xs(('All', ''))[:-1] | ||
exp = self.data.groupby(columns)[col].mean() | ||
tm.assert_series_equal(rmarg, exp, check_names=False) | ||
self.assertEqual(rmarg.name, ('All', '')) | ||
|
||
gmarg = res['All']['All', ''] | ||
exp = self.data[col].mean() | ||
self.assertEqual(gmarg, exp) | ||
def _check_output(result, values_col, index=['A', 'B'], | ||
columns=['C'], | ||
margins_col=DEFAULT_MARGIN_COLUMN_NAME): | ||
col_margins = result.ix[:-1, margins_col] | ||
expected_col_margins = self.data.groupby(index)[values_col].mean() | ||
tm.assert_series_equal(col_margins, expected_col_margins, | ||
check_names=False) | ||
self.assertEqual(col_margins.name, margins_col) | ||
|
||
result = result.sortlevel() | ||
index_margins = result.ix[(margins_col, '')].iloc[:-1] | ||
expected_ix_margins = self.data.groupby(columns)[values_col].mean() | ||
tm.assert_series_equal(index_margins, expected_ix_margins, | ||
check_names=False) | ||
self.assertEqual(index_margins.name, (margins_col, '')) | ||
|
||
grand_total_margins = result.loc[(margins_col, ''), margins_col] | ||
expected_total_margins = self.data[values_col].mean() | ||
self.assertEqual(grand_total_margins, expected_total_margins) | ||
|
||
# column specified | ||
table = self.data.pivot_table('D', index=['A', 'B'], columns='C', | ||
margins=True, aggfunc=np.mean) | ||
_check_output(table, 'D') | ||
result = self.data.pivot_table(values='D', index=['A', 'B'], | ||
columns='C', | ||
margins=True, aggfunc=np.mean) | ||
_check_output(result, 'D') | ||
|
||
# Set a different margins_column (not 'All') | ||
result = self.data.pivot_table(values='D', index=['A', 'B'], | ||
columns='C', | ||
margins=True, aggfunc=np.mean, | ||
margins_column='Totals') | ||
_check_output(result, 'D', margins_col='Totals') | ||
|
||
# no column specified | ||
table = self.data.pivot_table(index=['A', 'B'], columns='C', | ||
margins=True, aggfunc=np.mean) | ||
for valcol in table.columns.levels[0]: | ||
_check_output(table[valcol], valcol) | ||
for value_col in table.columns.levels[0]: | ||
_check_output(table[value_col], value_col) | ||
|
||
# no col | ||
|
||
# to help with a buglet | ||
self.data.columns = [k * 2 for k in self.data.columns] | ||
table = self.data.pivot_table(index=['AA', 'BB'], margins=True, | ||
aggfunc=np.mean) | ||
for valcol in table.columns: | ||
gmarg = table[valcol]['All', ''] | ||
self.assertEqual(gmarg, self.data[valcol].mean()) | ||
|
||
# this is OK | ||
table = self.data.pivot_table(index=['AA', 'BB'], margins=True, | ||
aggfunc='mean') | ||
for value_col in table.columns: | ||
totals = table.loc[(DEFAULT_MARGIN_COLUMN_NAME, ''), value_col] | ||
self.assertEqual(totals, self.data[value_col].mean()) | ||
|
||
# no rows | ||
rtable = self.data.pivot_table(columns=['AA', 'BB'], margins=True, | ||
aggfunc=np.mean) | ||
tm.assert_isinstance(rtable, Series) | ||
|
||
table = self.data.pivot_table(index=['AA', 'BB'], margins=True, | ||
aggfunc='mean') | ||
for item in ['DD', 'EE', 'FF']: | ||
gmarg = table[item]['All', ''] | ||
self.assertEqual(gmarg, self.data[item].mean()) | ||
totals = table.loc[(DEFAULT_MARGIN_COLUMN_NAME, ''), item] | ||
self.assertEqual(totals, self.data[item].mean()) | ||
|
||
# issue number #8349: pivot_table with margins and dictionary aggfunc | ||
data = [ | ||
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2013, | ||
'MONTH': 12, 'DAYS': 3, 'SALARY': 17}, | ||
{'JOB': 'Employ', 'NAME': | ||
'Mary', 'YEAR': 2013, 'MONTH': 12, 'DAYS': 5, 'SALARY': 23}, | ||
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, | ||
'MONTH': 1, 'DAYS': 10, 'SALARY': 100}, | ||
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, | ||
'MONTH': 1, 'DAYS': 11, 'SALARY': 110}, | ||
{'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014, | ||
'MONTH': 1, 'DAYS': 15, 'SALARY': 200}, | ||
{'JOB': 'Worker', 'NAME': 'Bob', 'YEAR': 2014, | ||
'MONTH': 2, 'DAYS': 8, 'SALARY': 80}, | ||
{'JOB': 'Employ', 'NAME': 'Mary', 'YEAR': 2014, | ||
'MONTH': 2, 'DAYS': 5, 'SALARY': 190}, | ||
] | ||
|
||
df=DataFrame([ {'JOB':'Worker','NAME':'Bob' ,'YEAR':2013,'MONTH':12,'DAYS': 3,'SALARY': 17}, | ||
{'JOB':'Employ','NAME':'Mary','YEAR':2013,'MONTH':12,'DAYS': 5,'SALARY': 23}, | ||
{'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':10,'SALARY':100}, | ||
{'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 1,'DAYS':11,'SALARY':110}, | ||
{'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 1,'DAYS':15,'SALARY':200}, | ||
{'JOB':'Worker','NAME':'Bob' ,'YEAR':2014,'MONTH': 2,'DAYS': 8,'SALARY': 80}, | ||
{'JOB':'Employ','NAME':'Mary','YEAR':2014,'MONTH': 2,'DAYS': 5,'SALARY':190} ]) | ||
|
||
df=df.set_index(['JOB','NAME','YEAR','MONTH'],drop=False,append=False) | ||
|
||
rs=df.pivot_table( index=['JOB','NAME'], | ||
columns=['YEAR','MONTH'], | ||
values=['DAYS','SALARY'], | ||
aggfunc={'DAYS':'mean','SALARY':'sum'}, | ||
margins=True) | ||
df = DataFrame(data) | ||
|
||
ex=df.pivot_table(index=['JOB','NAME'],columns=['YEAR','MONTH'],values=['DAYS'],aggfunc='mean',margins=True) | ||
df = df.set_index(['JOB', 'NAME', 'YEAR', 'MONTH'], drop=False, | ||
append=False) | ||
|
||
tm.assert_frame_equal(rs['DAYS'], ex['DAYS']) | ||
result = df.pivot_table(index=['JOB', 'NAME'], | ||
columns=['YEAR', 'MONTH'], | ||
values=['DAYS', 'SALARY'], | ||
aggfunc={'DAYS': 'mean', 'SALARY': 'sum'}, | ||
margins=True) | ||
|
||
ex=df.pivot_table(index=['JOB','NAME'],columns=['YEAR','MONTH'],values=['SALARY'],aggfunc='sum',margins=True) | ||
expected = df.pivot_table(index=['JOB', 'NAME'], | ||
columns=['YEAR', 'MONTH'], values=['DAYS'], | ||
aggfunc='mean', margins=True) | ||
|
||
tm.assert_frame_equal(rs['SALARY'], ex['SALARY']) | ||
tm.assert_frame_equal(result['DAYS'], expected['DAYS']) | ||
|
||
expected = df.pivot_table(index=['JOB', 'NAME'], | ||
columns=['YEAR', 'MONTH'], values=['SALARY'], | ||
aggfunc='sum', margins=True) | ||
|
||
tm.assert_frame_equal(result['SALARY'], expected['SALARY']) | ||
|
||
def test_pivot_integer_columns(self): | ||
# caused by upstream bug in unstack | ||
|
@@ -402,6 +427,24 @@ def test_margins_no_values_two_row_two_cols(self): | |
result = self.data[['A', 'B', 'C', 'D']].pivot_table(index=['A', 'B'], columns=['C', 'D'], aggfunc=len, margins=True) | ||
self.assertEqual(result.All.tolist(), [3.0, 1.0, 4.0, 3.0, 11.0]) | ||
|
||
def test_pivot_table_with_margins_set_margin_column(self): | ||
for margin_column in ['foo', 'one']: | ||
with self.assertRaises(ValueError): | ||
# multi-index index | ||
pivot_table(self.data, values='D', index=['A', 'B'], | ||
columns=['C'], margins=True, | ||
margins_column=margin_column) | ||
with self.assertRaises(ValueError): | ||
# multi-index column | ||
pivot_table(self.data, values='D', index=['C'], | ||
columns=['A', 'B'], margins=True, | ||
margins_column=margin_column) | ||
with self.assertRaises(ValueError): | ||
# non-multi-index index/column | ||
pivot_table(self.data, values='D', index=['A'], | ||
columns=['B'], margins=True, | ||
margins_column=margin_column) | ||
|
||
def test_pivot_timegrouper(self): | ||
df = DataFrame({ | ||
'Branch' : 'A A A A A A A B'.split(), | ||
|
@@ -678,17 +721,17 @@ def test_crosstab_margins(self): | |
self.assertEqual(result.index.names, ('a',)) | ||
self.assertEqual(result.columns.names, ['b', 'c']) | ||
|
||
all_cols = result['All', ''] | ||
all_cols = result[DEFAULT_MARGIN_COLUMN_NAME, ''] | ||
exp_cols = df.groupby(['a']).size().astype('i8') | ||
exp_cols = exp_cols.append(Series([len(df)], index=['All'])) | ||
exp_cols.name = ('All', '') | ||
exp_cols = exp_cols.append(Series([len(df)], index=[DEFAULT_MARGIN_COLUMN_NAME])) | ||
exp_cols.name = (DEFAULT_MARGIN_COLUMN_NAME, '') | ||
|
||
tm.assert_series_equal(all_cols, exp_cols) | ||
|
||
all_rows = result.ix['All'] | ||
all_rows = result.ix[DEFAULT_MARGIN_COLUMN_NAME] | ||
exp_rows = df.groupby(['b', 'c']).size().astype('i8') | ||
exp_rows = exp_rows.append(Series([len(df)], index=[('All', '')])) | ||
exp_rows.name = 'All' | ||
exp_rows = exp_rows.append(Series([len(df)], index=[(DEFAULT_MARGIN_COLUMN_NAME, '')])) | ||
exp_rows.name = DEFAULT_MARGIN_COLUMN_NAME | ||
|
||
exp_rows = exp_rows.reindex(all_rows.index) | ||
exp_rows = exp_rows.fillna(0).astype(np.int64) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
No need to have another variable for this; just set it here in the constructor.
How about calling this
`margins_name`?