pivot_table very slow on Categorical data; how about an observed keyword argument? #24923 (#24953)

benjaminr · jreback · commit 65466f040254 · 2019-04-25T21:16:09.000-04:00
diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py
@@ -127,6 +127,10 @@ def setup(self):
                              'value1': np.random.randn(N),
                              'value2': np.random.randn(N),
                              'value3': np.random.randn(N)})
+        self.df2 = DataFrame({'col1': list('abcde'), 'col2': list('fghij'),
+                              'col3': [1, 2, 3, 4, 5]})
+        self.df2.col1 = self.df2.col1.astype('category')
+        self.df2.col2 = self.df2.col2.astype('category')
 
     def time_pivot_table(self):
         self.df.pivot_table(index='key1', columns=['key2', 'key3'])
@@ -139,6 +143,14 @@ def time_pivot_table_margins(self):
         self.df.pivot_table(index='key1', columns=['key2', 'key3'],
                             margins=True)
 
+    def time_pivot_table_categorical(self):
+        self.df2.pivot_table(index='col1', values='col3', columns='col2',
+                             aggfunc=np.sum, fill_value=0)
+
+    def time_pivot_table_categorical_observed(self):
+        self.df2.pivot_table(index='col1', values='col3', columns='col2',
+                             aggfunc=np.sum, fill_value=0, observed=True)
+
 
 class Crosstab:
 
diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst
@@ -28,6 +28,7 @@ Other Enhancements
 - Indexing of ``DataFrame`` and ``Series`` now accepts zerodim ``np.ndarray`` (:issue:`24919`)
 - :meth:`Timestamp.replace` now supports the ``fold`` argument to disambiguate DST transition times (:issue:`25017`)
 - :meth:`DataFrame.at_time` and :meth:`Series.at_time` now support :meth:`datetime.time` objects with timezones (:issue:`24043`)
+- :meth:`DataFrame.pivot_table` now accepts an ``observed`` parameter, which is passed to the underlying calls to :meth:`DataFrame.groupby` to speed up grouping of categorical data (:issue:`24923`)
 - ``Series.str`` has gained :meth:`Series.str.casefold` method to removes all case distinctions present in a string (:issue:`25405`)
 - :meth:`DataFrame.set_index` now works for instances of ``abc.Iterator``, provided their output is of the same length as the calling frame (:issue:`22484`, :issue:`24984`)
 - :meth:`DatetimeIndex.union` now supports the ``sort`` argument. The behaviour of the sort parameter matches that of :meth:`Index.union` (:issue:`24994`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -5695,6 +5695,12 @@ def pivot(self, index=None, columns=None, values=None):
         margins_name : string, default 'All'
             Name of the row / column that will contain the totals
             when margins is True.
+        observed : boolean, default False
+            This only applies if any of the groupers are Categoricals.
+            If True: only show observed values for categorical groupers.
+            If False: show all values for categorical groupers.
+
+            .. versionadded:: 0.25.0
 
         Returns
         -------
@@ -5785,12 +5791,12 @@ def pivot(self, index=None, columns=None, values=None):
     @Appender(_shared_docs['pivot_table'])
     def pivot_table(self, values=None, index=None, columns=None,
                     aggfunc='mean', fill_value=None, margins=False,
-                    dropna=True, margins_name='All'):
+                    dropna=True, margins_name='All', observed=False):
         from pandas.core.reshape.pivot import pivot_table
         return pivot_table(self, values=values, index=index, columns=columns,
                            aggfunc=aggfunc, fill_value=fill_value,
                            margins=margins, dropna=dropna,
-                           margins_name=margins_name)
+                           margins_name=margins_name, observed=observed)
 
     def stack(self, level=-1, dropna=True):
         """
diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -22,7 +22,7 @@
 @Appender(_shared_docs['pivot_table'], indents=1)
 def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
                 fill_value=None, margins=False, dropna=True,
-                margins_name='All'):
+                margins_name='All', observed=False):
     index = _convert_by(index)
     columns = _convert_by(columns)
 
@@ -34,7 +34,8 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
                                 columns=columns,
                                 fill_value=fill_value, aggfunc=func,
                                 margins=margins, dropna=dropna,
-                                margins_name=margins_name)
+                                margins_name=margins_name,
+                                observed=observed)
             pieces.append(table)
             keys.append(getattr(func, '__name__', func))
 
@@ -77,7 +78,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
                 pass
         values = list(values)
 
-    grouped = data.groupby(keys, observed=False)
+    grouped = data.groupby(keys, observed=observed)
     agged = grouped.agg(aggfunc)
     if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
         agged = agged.dropna(how='all')
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -37,18 +37,18 @@ def setup_method(self, method):
                                'E': np.random.randn(11),
                                'F': np.random.randn(11)})
 
-    def test_pivot_table(self):
+    def test_pivot_table(self, observed):
         index = ['A', 'B']
         columns = 'C'
         table = pivot_table(self.data, values='D',
-                            index=index, columns=columns)
+                            index=index, columns=columns, observed=observed)
 
         table2 = self.data.pivot_table(
-            values='D', index=index, columns=columns)
+            values='D', index=index, columns=columns, observed=observed)
         tm.assert_frame_equal(table, table2)
 
         # this works
-        pivot_table(self.data, values='D', index=index)
+        pivot_table(self.data, values='D', index=index, observed=observed)
 
         if len(index) > 1:
             assert table.index.names == tuple(index)
@@ -64,6 +64,28 @@ def test_pivot_table(self):
             index + [columns])['D'].agg(np.mean).unstack()
         tm.assert_frame_equal(table, expected)
 
+    def test_pivot_table_categorical_observed_equal(self, observed):
+        # issue #24923
+        df = pd.DataFrame({'col1': list('abcde'),
+                           'col2': list('fghij'),
+                           'col3': [1, 2, 3, 4, 5]})
+
+        expected = df.pivot_table(index='col1', values='col3',
+                                  columns='col2', aggfunc=np.sum,
+                                  fill_value=0)
+
+        expected.index = expected.index.astype('category')
+        expected.columns = expected.columns.astype('category')
+
+        df.col1 = df.col1.astype('category')
+        df.col2 = df.col2.astype('category')
+
+        result = df.pivot_table(index='col1', values='col3',
+                                columns='col2', aggfunc=np.sum,
+                                fill_value=0, observed=observed)
+
+        tm.assert_frame_equal(result, expected)
+
     def test_pivot_table_nocols(self):
         df = DataFrame({'rows': ['a', 'b', 'c'],
                         'cols': ['x', 'y', 'z'],