pandas-dev · anhqle · Apr 16, 2018 · Apr 16, 2018 · Apr 16, 2018 · Apr 22, 2018
diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
@@ -615,7 +615,7 @@ Reshaping
 - Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`)
 - Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`)
 - Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`)
--
+- Bug in :func:`pandas.pivot_table` when the number of unique index combinations exceeds the ``int32`` limit; a ``ValueError`` is now raised (:issue:`20601`)
 -
 
 Build Changes

diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
@@ -81,9 +81,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
                 pass
         values = list(values)
 
-    # group by the cartesian product of the grouper
-    # if we have a categorical
-    grouped = data.groupby(keys, observed=False)
+    grouped = data.groupby(keys)
     agged = grouped.agg(aggfunc)
     if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
         agged = agged.dropna(how='all')

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
@@ -126,6 +126,17 @@ def __init__(self, values, index, level=-1, value_columns=None,
         self.removed_level = self.new_index_levels.pop(self.level)
         self.removed_level_full = index.levels[self.level]
 
+        # GH 20601
+        # If the DataFrame is too big, the number of unique index
+        # combinations (rows * columns) overflows int32.
+        # Check for this and raise an error before it happens.
+        num_rows = np.max([index_level.size for index_level
+                           in self.new_index_levels])
+        num_columns = self.removed_level.size
+        if num_rows * num_columns > (2 ** 31 - 1):
+            raise ValueError('Unstacked DataFrame is too big, '
+                             'causing int32 overflow')
+
         self._make_sorted_values_labels()
         self._make_selectors()
 

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
@@ -1275,6 +1275,16 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
                                aggfunc=f_numpy)
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.slow
+    def test_pivot_number_of_levels_larger_than_int32(self):
+        # GH 20601
+        df = DataFrame({'ind1': np.arange(2 ** 16),
+                        'ind2': np.arange(2 ** 16),
+                        'count': np.arange(2 ** 16)})
+        with tm.assert_raises_regex(ValueError, 'int32 overflow'):
+            df.pivot_table(index='ind1', columns='ind2',
+                           values='count', aggfunc='count')
+
 
 class TestCrosstab(object):
 

diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
@@ -1195,6 +1195,14 @@ def test_unstack_unobserved_keys(self):
         recons = result.stack()
         tm.assert_frame_equal(recons, df)
 
+    @pytest.mark.slow
+    def test_unstack_number_of_levels_larger_than_int32(self):
+        # GH 20601
+        df = DataFrame(np.random.randn(2 ** 16, 2),
+                       index=[np.arange(2 ** 16), np.arange(2 ** 16)])
+        with tm.assert_raises_regex(ValueError, 'int32 overflow'):
+            df.unstack()
+
     def test_stack_order_with_unsorted_levels(self):
         # GH 16323