Skip to content

ENH: add dropna argument to pivot_table #4106

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 10, 2013
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/release.rst
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ pandas 0.12
- support python3 (via ``PyTables 3.0.0``) (:issue:`3750`)
- Add modulo operator to Series, DataFrame
- Add ``date`` method to DatetimeIndex
- Add ``dropna`` argument to pivot_table (:issue:`3820`)
- Simplified the API and added a describe method to Categorical
- ``melt`` now accepts the optional parameters ``var_name`` and ``value_name``
to specify custom column names of the returned DataFrame (:issue:`3649`),
Expand Down
26 changes: 22 additions & 4 deletions pandas/tools/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
from pandas.core.index import MultiIndex
from pandas.core.reshape import _unstack_multiple
from pandas.tools.merge import concat
from pandas.tools.util import cartesian_product
import pandas.core.common as com
import numpy as np


def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
fill_value=None, margins=False):
fill_value=None, margins=False, dropna=True):
"""
Create a spreadsheet-style pivot table as a DataFrame. The levels in the
pivot table will be stored in MultiIndex objects (hierarchical indexes) on
Expand All @@ -31,6 +32,8 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
Value to replace missing values with
margins : boolean, default False
Add all row / columns (e.g. for subtotal / grand totals)
dropna : boolean, default True
Do not include columns whose entries are all NaN

Examples
--------
Expand Down Expand Up @@ -105,6 +108,19 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
for i in range(len(rows), len(keys))]
table = agged.unstack(to_unstack)

if not dropna:
try:
m = MultiIndex.from_arrays(cartesian_product(table.index.levels))
table = table.reindex_axis(m, axis=0)
except AttributeError:
pass # it's a single level

try:
m = MultiIndex.from_arrays(cartesian_product(table.columns.levels))
table = table.reindex_axis(m, axis=1)
except AttributeError:
pass # it's a single level or a series

if isinstance(table, DataFrame):
if isinstance(table.columns, MultiIndex):
table = table.sortlevel(axis=1)
Expand Down Expand Up @@ -216,7 +232,7 @@ def _convert_by(by):


def crosstab(rows, cols, values=None, rownames=None, colnames=None,
aggfunc=None, margins=False):
aggfunc=None, margins=False, dropna=True):
"""
Compute a simple cross-tabulation of two (or more) factors. By default
computes a frequency table of the factors unless an array of values and an
Expand All @@ -238,6 +254,8 @@ def crosstab(rows, cols, values=None, rownames=None, colnames=None,
If passed, must match number of column arrays passed
margins : boolean, default False
Add row/column margins (subtotals)
dropna : boolean, default True
Do not include columns whose entries are all NaN

Notes
-----
Expand Down Expand Up @@ -281,13 +299,13 @@ def crosstab(rows, cols, values=None, rownames=None, colnames=None,
df = DataFrame(data)
df['__dummy__'] = 0
table = df.pivot_table('__dummy__', rows=rownames, cols=colnames,
aggfunc=len, margins=margins)
aggfunc=len, margins=margins, dropna=dropna)
return table.fillna(0).astype(np.int64)
else:
data['__dummy__'] = values
df = DataFrame(data)
table = df.pivot_table('__dummy__', rows=rownames, cols=colnames,
aggfunc=aggfunc, margins=margins)
aggfunc=aggfunc, margins=margins, dropna=dropna)
return table


Expand Down
29 changes: 28 additions & 1 deletion pandas/tools/tests/test_pivot.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
import unittest

import numpy as np
from numpy.testing import assert_equal

from pandas import DataFrame, Series, Index
from pandas import DataFrame, Series, Index, MultiIndex
from pandas.tools.merge import concat
from pandas.tools.pivot import pivot_table, crosstab
import pandas.util.testing as tm
Expand Down Expand Up @@ -62,6 +63,22 @@ def test_pivot_table_nocols(self):
xp = df.pivot_table(rows='cols', aggfunc={'values': 'mean'}).T
tm.assert_frame_equal(rs, xp)

def test_pivot_table_dropna(self):
df = DataFrame({'amount': {0: 60000, 1: 100000, 2: 50000, 3: 30000},
'customer': {0: 'A', 1: 'A', 2: 'B', 3: 'C'},
'month': {0: 201307, 1: 201309, 2: 201308, 3: 201310},
'product': {0: 'a', 1: 'b', 2: 'c', 3: 'd'},
'quantity': {0: 2000000, 1: 500000, 2: 1000000, 3: 1000000}})
pv_col = df.pivot_table('quantity', 'month', ['customer', 'product'], dropna=False)
pv_ind = df.pivot_table('quantity', ['customer', 'product'], 'month', dropna=False)

m = MultiIndex.from_tuples([(u'A', u'a'), (u'A', u'b'), (u'A', u'c'), (u'A', u'd'),
(u'B', u'a'), (u'B', u'b'), (u'B', u'c'), (u'B', u'd'),
(u'C', u'a'), (u'C', u'b'), (u'C', u'c'), (u'C', u'd')])

assert_equal(pv_col.columns.values, m.values)
assert_equal(pv_ind.index.values, m.values)


def test_pass_array(self):
result = self.data.pivot_table('D', rows=self.data.A, cols=self.data.C)
Expand Down Expand Up @@ -374,6 +391,16 @@ def test_crosstab_pass_values(self):
aggfunc=np.sum)
tm.assert_frame_equal(table, expected)

def test_crosstab_dropna(self):
# GH 3820
a = np.array(['foo', 'foo', 'foo', 'bar', 'bar', 'foo', 'foo'], dtype=object)
b = np.array(['one', 'one', 'two', 'one', 'two', 'two', 'two'], dtype=object)
c = np.array(['dull', 'dull', 'dull', 'dull', 'dull', 'shiny', 'shiny'], dtype=object)
res = crosstab(a, [b, c], rownames=['a'], colnames=['b', 'c'], dropna=False)
m = MultiIndex.from_tuples([('one', 'dull'), ('one', 'shiny'),
('two', 'dull'), ('two', 'shiny')])
assert_equal(res.columns.values, m.values)

if __name__ == '__main__':
import nose
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
Expand Down
21 changes: 21 additions & 0 deletions pandas/tools/tests/test_util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os
import nose
import unittest

import numpy as np
from numpy.testing import assert_equal

from pandas.tools.util import cartesian_product

class TestCartesianProduct(unittest.TestCase):
    """Unit tests for ``cartesian_product``."""

    def test_simple(self):
        # Two factors of sizes 3 and 2 expand to two arrays of length 6;
        # the first factor varies slowest.
        letters = list('ABC')
        numbers = [1, 22]

        expected = [np.array(['A', 'A', 'B', 'B', 'C', 'C']),
                    np.array([1, 22, 1, 22, 1, 22])]

        result = cartesian_product([letters, numbers])
        assert_equal(result, expected)

# When executed as a script, run this module's tests through nose with the
# command-line flags listed in argv; exit=False is passed to the runner.
if __name__ == '__main__':
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)
28 changes: 27 additions & 1 deletion pandas/tools/util.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,32 @@
from pandas.core.index import Index
import numpy as np

def match(needles, haystack):
    """
    Find the position of each needle within haystack.

    Parameters
    ----------
    needles : list-like
        Values to look up.
    haystack : list-like
        Values to search in.

    Returns
    -------
    indexer : ndarray
        Result of ``Index(haystack).get_indexer(Index(needles))``: one
        integer position per needle (``get_indexer`` reports needles that
        are not present as -1).
    """
    haystack = Index(haystack)
    needles = Index(needles)
    return haystack.get_indexer(needles)

def cartesian_product(X):
    '''
    Numpy version of itertools.product or pandas.util.compat.product.
    Sometimes faster (for large inputs)...

    Parameters
    ----------
    X : list-like of list-likes
        The factors whose cartesian product is taken.

    Returns
    -------
    product : list of ndarrays
        One array per factor in ``X``. Each array has as many entries as
        there are combinations; reading position ``k`` across all arrays
        gives the k-th combination, with the first factor varying slowest.
        An empty ``X`` gives an empty list, and if any factor is empty
        every returned array is empty.

    Examples
    --------
    >>> cartesian_product([list('ABC'), [1, 2]])
    [array(['A', 'A', 'B', 'B', 'C', 'C'], dtype='|S1'),
    array([1, 2, 1, 2, 1, 2])]

    '''
    if len(X) == 0:
        return []

    lenX = np.fromiter((len(x) for x in X), dtype=int)
    cumprodX = np.cumprod(lenX)

    # a[i]: number of combinations of the factors before i, i.e. how many
    # times the expanded factor i has to be tiled.
    a = np.roll(cumprodX, 1)
    a[0] = 1

    # b[i]: number of combinations of the factors after i, i.e. how many
    # times each element of factor i is repeated in a row.  Floor division
    # keeps the counts integral, which np.repeat requires.
    if cumprodX[-1] != 0:
        b = cumprodX[-1] // cumprodX
    else:
        # an empty factor makes the whole product empty
        b = np.zeros_like(cumprodX)

    return [np.tile(np.repeat(x, b[i]), a[i])
            for i, x in enumerate(X)]