ENH: unstack multiple columns in one shot to eliminate empty columns in pivot table operations, close #1181

wesm · wesm · commit 773d86177092 · 2012-05-08T14:27:01.000-04:00
diff --git a/pandas/core/reshape.py b/pandas/core/reshape.py
@@ -9,7 +9,10 @@
 from pandas.core.frame import DataFrame
 
 from pandas.core.common import notnull, _ensure_platform_int
-from pandas.core.groupby import get_group_index
+from pandas.core.groupby import (get_group_index, _compress_group_index,
+                                 decons_group_index)
+
+
 from pandas.core.index import MultiIndex
 
 
@@ -198,6 +201,71 @@ def get_new_index(self):
         return new_index
 
 
+def _unstack_multiple(data, clocs):
+    if len(clocs) == 0:
+        return data
+
+    # NOTE: This doesn't deal with hierarchical columns yet
+
+    index = data.index
+
+    clevels, rlevels = _partition(index.levels, clocs)
+    clabels, rlabels = _partition(index.labels, clocs)
+    cnames, rnames = _partition(index.names, clocs)
+
+    shape = [len(x) for x in clevels]
+    group_index = get_group_index(clabels, shape)
+
+    comp_ids, obs_ids = _compress_group_index(group_index, sort=False)
+
+    dummy_index = MultiIndex(levels=rlevels + [obs_ids],
+                             labels=rlabels + [comp_ids],
+                             names=rnames + ['__placeholder__'])
+
+    dummy = DataFrame(data.values, index=dummy_index,
+                      columns=data.columns)
+
+    unstacked = dummy.unstack('__placeholder__')
+
+    if isinstance(unstacked, Series):
+        unstcols = unstacked.index
+    else:
+        unstcols = unstacked.columns
+
+    new_levels = [unstcols.levels[0]] + clevels
+    new_names = [data.columns.name] + cnames
+
+    recons_labels = decons_group_index(obs_ids, shape)
+
+    new_labels = [unstcols.labels[0]]
+    for rec in recons_labels:
+        new_labels.append(rec.take(unstcols.labels[-1]))
+
+    new_columns = MultiIndex(levels=new_levels, labels=new_labels,
+                             names=new_names)
+
+    if isinstance(unstacked, Series):
+        unstacked.index = new_columns
+    else:
+        unstacked.columns = new_columns
+
+    return unstacked
+
+
+def _partition(values, inds):
+    left = []
+    right = []
+
+    set_inds = set(inds)
+
+    for i, val in enumerate(values):
+        if i in set_inds:
+            left.append(val)
+        else:
+            right.append(val)
+
+    return left, right
+
 
 def pivot(self, index=None, columns=None, values=None):
     """
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
@@ -607,6 +607,16 @@ def test_unstack(self):
         # test that ints work
         unstacked = self.ymd.astype(int).unstack()
 
+    # def test_unstack_multiple_no_empty_columns(self):
+    #     index = MultiIndex.from_tuples([(0, 'foo', 0), (0, 'bar', 0),
+    #                                     (1, 'baz', 1), (1, 'qux', 1)])
+
+    #     s = Series(np.random.randn(4), index=index)
+
+    #     unstacked = s.unstack([1, 2])
+    #     expected = unstacked.dropna(axis=1, how='all')
+    #     assert_frame_equal(unstacked, expected)
+
     def test_stack(self):
         # regular roundtrip
         unstacked = self.ymd.unstack()
diff --git a/pandas/tools/pivot.py b/pandas/tools/pivot.py
@@ -1,10 +1,11 @@
 # pylint: disable=E1103
 
 from pandas import Series, DataFrame
+from pandas.core.reshape import _unstack_multiple
 from pandas.tools.merge import concat
 import pandas.core.common as com
 import numpy as np
-import types
+
 
 def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
                 fill_value=None, margins=False):
@@ -97,10 +98,12 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
     grouped = data.groupby(keys)
     agged = grouped.agg(aggfunc)
 
-    table = agged
-    for i in range(len(cols)):
-        name = table.index.names[len(rows)]
-        table = table.unstack(name)
+    table = _unstack_multiple(agged, range(len(rows), len(keys)))
+
+    # table = agged
+    # for i in range(len(cols)):
+    #     name = table.index.names[len(rows)]
+    #     table = table.unstack(name)
 
     if fill_value is not None:
         table = table.fillna(value=fill_value)
@@ -115,6 +118,7 @@ def pivot_table(data, values=None, rows=None, cols=None, aggfunc='mean',
 
     return table
 
+
 DataFrame.pivot_table = pivot_table
 
 def _add_margins(table, data, values, rows=None, cols=None, aggfunc=np.mean):
diff --git a/pandas/tools/tests/test_pivot.py b/pandas/tools/tests/test_pivot.py
@@ -157,6 +157,21 @@ def test_pivot_integer_columns(self):
 
         tm.assert_frame_equal(table, table2)
 
+    def test_pivot_no_level_overlap(self):
+        # GH #1181
+
+        data = DataFrame({'a': ['a', 'a', 'a', 'a', 'b', 'b', 'b', 'b'] * 2,
+                          'b': [0, 0, 0, 0, 1, 1, 1, 1] * 2,
+                          'c': (['foo'] * 4 + ['bar'] * 4) * 2,
+                          'value': np.random.randn(16)})
+
+        table = data.pivot_table('value', rows='a', cols=['b', 'c'])
+
+        grouped = data.groupby(['a', 'b', 'c'])['value'].mean()
+        expected = grouped.unstack('b').unstack('c').dropna(axis=1, how='all')
+        tm.assert_frame_equal(table, expected)
+
+
 class TestCrosstab(unittest.TestCase):
 
     def setUp(self):
diff --git a/vb_suite/groupby.py b/vb_suite/groupby.py
@@ -121,3 +121,26 @@ def f():
 
 series_value_counts_int64 = Benchmark('s.value_counts()', setup,
                                       start_date=datetime(2011, 10, 21))
+
+#----------------------------------------------------------------------
+# pivot_table
+# key2 and key3 both use ind2, so only matching (key2, key3) pairs occur
+setup = common_setup + """
+fac1 = np.array(['A', 'B', 'C'], dtype='O')
+fac2 = np.array(['one', 'two'], dtype='O')
+
+ind1 = np.random.randint(0, 3, size=100000)
+ind2 = np.random.randint(0, 2, size=100000)
+
+df = DataFrame({'key1': fac1.take(ind1),
+                'key2': fac2.take(ind2),
+                'key3': fac2.take(ind2),
+                'value1' : np.random.randn(100000),
+                'value2' : np.random.randn(100000),
+                'value3' : np.random.randn(100000)})
+"""
+
+stmt = "df.pivot_table(rows='key1', cols=['key2', 'key3'])"
+groupby_pivot_table = Benchmark(stmt, setup, start_date=datetime(2011, 12, 15))
+
+
diff --git a/vb_suite/reshape.py b/vb_suite/reshape.py
@@ -0,0 +1,18 @@
+from vbench.api import Benchmark
+from datetime import datetime
+
+common_setup = """from pandas_vb_common import *
+index = MultiIndex.from_arrays([np.arange(100).repeat(100),
+                               np.roll(np.tile(np.arange(100), 100), 25)])
+df = DataFrame(np.random.randn(10000, 4), index=index)
+"""
+# Benchmark unstacking index level 1 of the 10000 x 4 frame built above
+reshape_unstack_simple = Benchmark('df.unstack(1)', common_setup,
+                                   start_date=datetime(2011, 10, 1))
+# udf is created in the setup string, so the statement is only udf.stack()
+setup = common_setup + """
+udf = df.unstack(1)
+"""
+
+reshape_stack_simple = Benchmark('udf.stack()', setup,
+                                 start_date=datetime(2011, 10, 1))