From 1f5ed032de3de12eb10ee1f64ff11016388eaab1 Mon Sep 17 00:00:00 2001
From: Anh Le <anh.le91@gmail.com>
Date: Mon, 16 Apr 2018 01:14:23 -0400
Subject: [PATCH 1/8] ENH GH20601 raise an error when the number of levels in a
 pivot table exceeds the int32 limit

---
 pandas/core/reshape/reshape.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index f9ab813855f47..e1d1b2fd72770 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -161,6 +161,8 @@ def _make_selectors(self):
         self.full_shape = ngroups, stride
 
         selector = self.sorted_labels[-1] + stride * comp_index + self.lift
+        if np.prod(self.full_shape) > (2 ** 31 - 1):
+            raise ValueError('Pivot table is too big, causing int32 overflow')
         mask = np.zeros(np.prod(self.full_shape), dtype=bool)
         mask.put(selector, True)
 

From e78e82a6e16d88ee8a8da82ba6f57a7f36175f29 Mon Sep 17 00:00:00 2001
From: Anh Le <anh.le91@gmail.com>
Date: Mon, 16 Apr 2018 01:53:06 -0400
Subject: [PATCH 2/8] TST add a test for a pivot table with a large number of
 levels causing int32 overflow

---
 pandas/tests/reshape/test_pivot.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 7e7e081408534..e44a32ce8870a 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1275,6 +1275,14 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
                                aggfunc=f_numpy)
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.slow
+    def test_pivot_number_of_levels_larger_than_int32(self):
+        # GH 20601
+        data = DataFrame({'ind1': list(range(1337600)) * 2,
+                          'ind2': list(range(3040)) * 2 * 440, 'count': [1] * 2 * 1337600})
+        with tm.assert_raises_regex(ValueError, 'int32 overflow'):
+            data.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count')
+
 
 class TestCrosstab(object):
 

From 5d773efcb4977e1d09d3d4fed8373010fe3320df Mon Sep 17 00:00:00 2001
From: Anh Le <anh.le91@gmail.com>
Date: Mon, 16 Apr 2018 01:55:56 -0400
Subject: [PATCH 3/8] CLN PEP8 compliance

---
 pandas/tests/reshape/test_pivot.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index e44a32ce8870a..941f5db9e5138 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1279,9 +1279,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
     def test_pivot_number_of_levels_larger_than_int32(self):
         # GH 20601
         data = DataFrame({'ind1': list(range(1337600)) * 2,
-                          'ind2': list(range(3040)) * 2 * 440, 'count': [1] * 2 * 1337600})
+                          'ind2': list(range(3040)) * 2 * 440,
+                          'count': [1] * 2 * 1337600})
         with tm.assert_raises_regex(ValueError, 'int32 overflow'):
-            data.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count')
+            data.pivot_table(index='ind1', columns='ind2',
+                             values='count', aggfunc='count')
 
 
 class TestCrosstab(object):

From 01a79439e4ede04d4e03aeb812e7c72035a17e1d Mon Sep 17 00:00:00 2001
From: Anh Le <anh.le91@gmail.com>
Date: Sun, 22 Apr 2018 02:20:18 -0400
Subject: [PATCH 4/8] ENH catch the int32 overflow error earlier and in two
 separate places: in pivot_table and unstack

---
 pandas/core/reshape/pivot.py       | 5 +++++
 pandas/core/reshape/reshape.py     | 7 +++++--
 pandas/tests/reshape/test_pivot.py | 8 ++++----
 pandas/tests/test_multilevel.py    | 7 +++++++
 4 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 0d1caa3d57d73..962558de562c7 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -31,6 +31,11 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
     index = _convert_by(index)
     columns = _convert_by(columns)
 
+    num_rows = data.reindex(index, axis='columns').shape[0]
+    num_columns = data.reindex(columns, axis='columns').shape[0]
+    if num_rows * num_columns > (2 ** 31 - 1):
+        raise ValueError('Pivot table is too big, causing int32 overflow')
+
     if isinstance(aggfunc, list):
         pieces = []
         keys = []
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index e1d1b2fd72770..c310710a4a40d 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -126,6 +126,11 @@ def __init__(self, values, index, level=-1, value_columns=None,
         self.removed_level = self.new_index_levels.pop(self.level)
         self.removed_level_full = index.levels[self.level]
 
+        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
+        num_columns = self.removed_level.size
+        if num_rows * num_columns > (2 ** 31 - 1):
+            raise ValueError('Unstacked data frame is too big, causing int32 overflow')
+
         self._make_sorted_values_labels()
         self._make_selectors()
 
@@ -161,8 +166,6 @@ def _make_selectors(self):
         self.full_shape = ngroups, stride
 
         selector = self.sorted_labels[-1] + stride * comp_index + self.lift
-        if np.prod(self.full_shape) > (2 ** 31 - 1):
-            raise ValueError('Pivot table is too big, causing int32 overflow')
         mask = np.zeros(np.prod(self.full_shape), dtype=bool)
         mask.put(selector, True)
 
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 941f5db9e5138..a1ccae1718081 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1278,11 +1278,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
     @pytest.mark.slow
     def test_pivot_number_of_levels_larger_than_int32(self):
         # GH 20601
-        data = DataFrame({'ind1': list(range(1337600)) * 2,
-                          'ind2': list(range(3040)) * 2 * 440,
-                          'count': [1] * 2 * 1337600})
+        df = DataFrame({'ind1': np.arange(2 ** 16),
+                          'ind2': np.arange(2 ** 16),
+                          'count': np.arange(2 ** 16)})
         with tm.assert_raises_regex(ValueError, 'int32 overflow'):
-            data.pivot_table(index='ind1', columns='ind2',
+            df.pivot_table(index='ind1', columns='ind2',
                              values='count', aggfunc='count')
 
 
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 3caee2b44c579..10f2b6fca74a1 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -1195,6 +1195,13 @@ def test_unstack_unobserved_keys(self):
         recons = result.stack()
         tm.assert_frame_equal(recons, df)
 
+    @pytest.mark.slow
+    def test_unstack_number_of_levels_larger_than_int32(self):
+        # GH 20601
+        df = DataFrame(np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)])
+        with tm.assert_raises_regex(ValueError, 'int32 overflow'):
+            df.unstack()
+
     def test_stack_order_with_unsorted_levels(self):
         # GH 16323
 

From 0efaa8efc14433430a4f9df5c4108672f57a7a28 Mon Sep 17 00:00:00 2001
From: Anh Le <anh.le91@gmail.com>
Date: Sun, 22 Apr 2018 02:48:08 -0400
Subject: [PATCH 5/8] CLN PEP8 compliance

---
 pandas/core/reshape/reshape.py  | 6 ++++--
 pandas/tests/test_multilevel.py | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index c310710a4a40d..9428bcc1700ab 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -126,10 +126,12 @@ def __init__(self, values, index, level=-1, value_columns=None,
         self.removed_level = self.new_index_levels.pop(self.level)
         self.removed_level_full = index.levels[self.level]
 
-        num_rows = np.max([index_level.size for index_level in self.new_index_levels])
+        num_rows = np.max([index_level.size for index_level
+                           in self.new_index_levels])
         num_columns = self.removed_level.size
         if num_rows * num_columns > (2 ** 31 - 1):
-            raise ValueError('Unstacked data frame is too big, causing int32 overflow')
+            raise ValueError('Unstacked DataFrame is too big, '
+                             'causing int32 overflow')
 
         self._make_sorted_values_labels()
         self._make_selectors()
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 10f2b6fca74a1..bdac0d13b84a3 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -1198,7 +1198,8 @@ def test_unstack_unobserved_keys(self):
     @pytest.mark.slow
     def test_unstack_number_of_levels_larger_than_int32(self):
         # GH 20601
-        df = DataFrame(np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)])
+        df = DataFrame(np.random.randn(2 ** 16, 2),
+                       index=[np.arange(2 ** 16), np.arange(2 ** 16)])
         with tm.assert_raises_regex(ValueError, 'int32 overflow'):
             df.unstack()
 

From 8edc9a0e6bddb4bfb5ca63c45cbdb647aff4d81a Mon Sep 17 00:00:00 2001
From: Anh Le <anh.le91@gmail.com>
Date: Sun, 22 Apr 2018 13:15:48 -0400
Subject: [PATCH 6/8] ENH calculate size of the resulting pivot table and raise
 error if it's too big

---
 pandas/core/reshape/pivot.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 962558de562c7..329c0bca7deb8 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -31,11 +31,6 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
     index = _convert_by(index)
     columns = _convert_by(columns)
 
-    num_rows = data.reindex(index, axis='columns').shape[0]
-    num_columns = data.reindex(columns, axis='columns').shape[0]
-    if num_rows * num_columns > (2 ** 31 - 1):
-        raise ValueError('Pivot table is too big, causing int32 overflow')
-
     if isinstance(aggfunc, list):
         pieces = []
         keys = []
@@ -86,9 +81,14 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
                 pass
         values = list(values)
 
-    # group by the cartesian product of the grouper
-    # if we have a categorical
-    grouped = data.groupby(keys, observed=False)
+    num_rows = (data.reindex(columns=index).drop_duplicates().shape[0]
+                if index else 1)
+    num_cols = (data.reindex(columns=columns).drop_duplicates().shape[0]
+                if columns else 1)
+    if num_rows * num_cols * len(values) > (2 ** 31 - 1):
+        raise ValueError('Pivot table is too big, causing int32 overflow')
+
+    grouped = data.groupby(keys)
     agged = grouped.agg(aggfunc)
     if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
         agged = agged.dropna(how='all')

From f2021f10fc7bc5007ff2ec4b4d4a01fd15a147fe Mon Sep 17 00:00:00 2001
From: Anh Le <anh.le91@gmail.com>
Date: Mon, 30 Jul 2018 15:40:26 -0500
Subject: [PATCH 7/8] rebase onto upstream master

---
 pandas/core/reshape/pivot.py       | 7 -------
 pandas/tests/reshape/test_pivot.py | 6 +++---
 2 files changed, 3 insertions(+), 10 deletions(-)

diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py
index 329c0bca7deb8..99772f0fe36ad 100644
--- a/pandas/core/reshape/pivot.py
+++ b/pandas/core/reshape/pivot.py
@@ -81,13 +81,6 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
                 pass
         values = list(values)
 
-    num_rows = (data.reindex(columns=index).drop_duplicates().shape[0]
-                if index else 1)
-    num_cols = (data.reindex(columns=columns).drop_duplicates().shape[0]
-                if columns else 1)
-    if num_rows * num_cols * len(values) > (2 ** 31 - 1):
-        raise ValueError('Pivot table is too big, causing int32 overflow')
-
     grouped = data.groupby(keys)
     agged = grouped.agg(aggfunc)
     if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index a1ccae1718081..06a7c72969d37 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1279,11 +1279,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
     def test_pivot_number_of_levels_larger_than_int32(self):
         # GH 20601
         df = DataFrame({'ind1': np.arange(2 ** 16),
-                          'ind2': np.arange(2 ** 16),
-                          'count': np.arange(2 ** 16)})
+                        'ind2': np.arange(2 ** 16),
+                        'count': np.arange(2 ** 16)})
         with tm.assert_raises_regex(ValueError, 'int32 overflow'):
             df.pivot_table(index='ind1', columns='ind2',
-                             values='count', aggfunc='count')
+                           values='count', aggfunc='count')
 
 
 class TestCrosstab(object):

From 7e6246c91cf6e059ca4ff65d74d47217bcf1133b Mon Sep 17 00:00:00 2001
From: Anh Le <anh.le91@gmail.com>
Date: Tue, 31 Jul 2018 12:22:42 -0500
Subject: [PATCH 8/8] DOC add whatsnew and comments explaining the bug fix

---
 doc/source/whatsnew/v0.24.0.txt | 2 +-
 pandas/core/reshape/reshape.py  | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index d2d5d40393b62..a5a34811c4b83 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -615,7 +615,7 @@ Reshaping
 - Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`)
 - Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`)
 - Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`)
--
+- Bug in :func:`pandas.pivot_table` when the number of unique index combinations exceeds the int32 limit; a ``ValueError`` is now raised (:issue:`20601`)
 -
 
 Build Changes
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index 9428bcc1700ab..694a1d3336469 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -126,6 +126,10 @@ def __init__(self, values, index, level=-1, value_columns=None,
         self.removed_level = self.new_index_levels.pop(self.level)
         self.removed_level_full = index.levels[self.level]
 
+        # Bug fix GH 20601
+        # If the DataFrame is too big, the number of unique index
+        # combinations will cause an int32 overflow. Check for this and
+        # raise an error before that happens.
         num_rows = np.max([index_level.size for index_level
                            in self.new_index_levels])
         num_columns = self.removed_level.size