From 1f5ed032de3de12eb10ee1f64ff11016388eaab1 Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 16 Apr 2018 01:14:23 -0400 Subject: [PATCH 1/8] ENH GH20601 raise an error when the number of levels in a pivot table larger than int32 --- pandas/core/reshape/reshape.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index f9ab813855f47..e1d1b2fd72770 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -161,6 +161,8 @@ def _make_selectors(self): self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index + self.lift + if np.prod(self.full_shape) > (2 ** 31 - 1): + raise ValueError('Pivot table is too big, causing int32 overflow') mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) From e78e82a6e16d88ee8a8da82ba6f57a7f36175f29 Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 16 Apr 2018 01:53:06 -0400 Subject: [PATCH 2/8] TST add a test for pivot table large number of levels causing int32 overflow --- pandas/tests/reshape/test_pivot.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 7e7e081408534..e44a32ce8870a 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1275,6 +1275,14 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): aggfunc=f_numpy) tm.assert_frame_equal(result, expected) + @pytest.mark.slow + def test_pivot_number_of_levels_larger_than_int32(self): + # GH 20601 + data = DataFrame({'ind1': list(range(1337600)) * 2, + 'ind2': list(range(3040)) * 2 * 440, 'count': [1] * 2 * 1337600}) + with tm.assert_raises_regex(ValueError, 'int32 overflow'): + data.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count') + class TestCrosstab(object): From 5d773efcb4977e1d09d3d4fed8373010fe3320df Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 16 Apr 2018 01:55:56 -0400 
Subject: [PATCH 3/8] CLN PEP8 compliance --- pandas/tests/reshape/test_pivot.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index e44a32ce8870a..941f5db9e5138 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1279,9 +1279,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 data = DataFrame({'ind1': list(range(1337600)) * 2, - 'ind2': list(range(3040)) * 2 * 440, 'count': [1] * 2 * 1337600}) + 'ind2': list(range(3040)) * 2 * 440, + 'count': [1] * 2 * 1337600}) with tm.assert_raises_regex(ValueError, 'int32 overflow'): - data.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count') + data.pivot_table(index='ind1', columns='ind2', + values='count', aggfunc='count') class TestCrosstab(object): From 01a79439e4ede04d4e03aeb812e7c72035a17e1d Mon Sep 17 00:00:00 2001 From: Anh Le Date: Sun, 22 Apr 2018 02:20:18 -0400 Subject: [PATCH 4/8] ENH catch the int32 overflow error earlier and in two separate places: in pivot_table and unstack --- pandas/core/reshape/pivot.py | 5 +++++ pandas/core/reshape/reshape.py | 7 +++++-- pandas/tests/reshape/test_pivot.py | 8 ++++---- pandas/tests/test_multilevel.py | 7 +++++++ 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 0d1caa3d57d73..962558de562c7 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -31,6 +31,11 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', index = _convert_by(index) columns = _convert_by(columns) + num_rows = data.reindex(index, axis='columns').shape[0] + num_columns = data.reindex(columns, axis='columns').shape[0] + if num_rows * num_columns > (2 ** 31 - 1): + raise ValueError('Pivot table is too big, causing int32 overflow') + if 
isinstance(aggfunc, list): pieces = [] keys = [] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index e1d1b2fd72770..c310710a4a40d 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -126,6 +126,11 @@ def __init__(self, values, index, level=-1, value_columns=None, self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] + num_rows = np.max([index_level.size for index_level in self.new_index_levels]) + num_columns = self.removed_level.size + if num_rows * num_columns > (2 ** 31 - 1): + raise ValueError('Unstacked data frame is too big, causing int32 overflow') + self._make_sorted_values_labels() self._make_selectors() @@ -161,8 +166,6 @@ def _make_selectors(self): self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index + self.lift - if np.prod(self.full_shape) > (2 ** 31 - 1): - raise ValueError('Pivot table is too big, causing int32 overflow') mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 941f5db9e5138..a1ccae1718081 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1278,11 +1278,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): @pytest.mark.slow def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 - data = DataFrame({'ind1': list(range(1337600)) * 2, - 'ind2': list(range(3040)) * 2 * 440, - 'count': [1] * 2 * 1337600}) + df = DataFrame({'ind1': np.arange(2 ** 16), + 'ind2': np.arange(2 ** 16), + 'count': np.arange(2 ** 16)}) with tm.assert_raises_regex(ValueError, 'int32 overflow'): - data.pivot_table(index='ind1', columns='ind2', + df.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count') diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 3caee2b44c579..10f2b6fca74a1 100644 
--- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1195,6 +1195,13 @@ def test_unstack_unobserved_keys(self): recons = result.stack() tm.assert_frame_equal(recons, df) + @pytest.mark.slow + def test_unstack_number_of_levels_larger_than_int32(self): + # GH 20601 + df = DataFrame(np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)]) + with tm.assert_raises_regex(ValueError, 'int32 overflow'): + df.unstack() + def test_stack_order_with_unsorted_levels(self): # GH 16323 From 0efaa8efc14433430a4f9df5c4108672f57a7a28 Mon Sep 17 00:00:00 2001 From: Anh Le Date: Sun, 22 Apr 2018 02:48:08 -0400 Subject: [PATCH 5/8] CLN PEP8 compliance --- pandas/core/reshape/reshape.py | 6 ++++-- pandas/tests/test_multilevel.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c310710a4a40d..9428bcc1700ab 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -126,10 +126,12 @@ def __init__(self, values, index, level=-1, value_columns=None, self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] - num_rows = np.max([index_level.size for index_level in self.new_index_levels]) + num_rows = np.max([index_level.size for index_level + in self.new_index_levels]) num_columns = self.removed_level.size if num_rows * num_columns > (2 ** 31 - 1): - raise ValueError('Unstacked data frame is too big, causing int32 overflow') + raise ValueError('Unstacked DataFrame is too big, ' + 'causing int32 overflow') self._make_sorted_values_labels() self._make_selectors() diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 10f2b6fca74a1..bdac0d13b84a3 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1198,7 +1198,8 @@ def test_unstack_unobserved_keys(self): @pytest.mark.slow def 
test_unstack_number_of_levels_larger_than_int32(self): # GH 20601 - df = DataFrame(np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)]) + df = DataFrame(np.random.randn(2 ** 16, 2), + index=[np.arange(2 ** 16), np.arange(2 ** 16)]) with tm.assert_raises_regex(ValueError, 'int32 overflow'): df.unstack() From 8edc9a0e6bddb4bfb5ca63c45cbdb647aff4d81a Mon Sep 17 00:00:00 2001 From: Anh Le Date: Sun, 22 Apr 2018 13:15:48 -0400 Subject: [PATCH 6/8] ENH calculate size of the resulting pivot table and raise error if it's too big --- pandas/core/reshape/pivot.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 962558de562c7..329c0bca7deb8 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -31,11 +31,6 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', index = _convert_by(index) columns = _convert_by(columns) - num_rows = data.reindex(index, axis='columns').shape[0] - num_columns = data.reindex(columns, axis='columns').shape[0] - if num_rows * num_columns > (2 ** 31 - 1): - raise ValueError('Pivot table is too big, causing int32 overflow') - if isinstance(aggfunc, list): pieces = [] keys = [] @@ -86,9 +81,14 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - # group by the cartesian product of the grouper - # if we have a categorical - grouped = data.groupby(keys, observed=False) + num_rows = (data.reindex(columns=index).drop_duplicates().shape[0] + if index else 1) + num_cols = (data.reindex(columns=columns).drop_duplicates().shape[0] + if columns else 1) + if num_rows * num_cols * len(values) > (2 ** 31 - 1): + raise ValueError('Pivot table is too big, causing int32 overflow') + + grouped = data.groupby(keys) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how='all') 
From f2021f10fc7bc5007ff2ec4b4d4a01fd15a147fe Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 30 Jul 2018 15:40:26 -0500 Subject: [PATCH 7/8] rebase onto upstream master --- pandas/core/reshape/pivot.py | 7 ------- pandas/tests/reshape/test_pivot.py | 6 +++--- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 329c0bca7deb8..99772f0fe36ad 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -81,13 +81,6 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - num_rows = (data.reindex(columns=index).drop_duplicates().shape[0] - if index else 1) - num_cols = (data.reindex(columns=columns).drop_duplicates().shape[0] - if columns else 1) - if num_rows * num_cols * len(values) > (2 ** 31 - 1): - raise ValueError('Pivot table is too big, causing int32 overflow') - grouped = data.groupby(keys) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index a1ccae1718081..06a7c72969d37 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1279,11 +1279,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 df = DataFrame({'ind1': np.arange(2 ** 16), - 'ind2': np.arange(2 ** 16), - 'count': np.arange(2 ** 16)}) + 'ind2': np.arange(2 ** 16), + 'count': np.arange(2 ** 16)}) with tm.assert_raises_regex(ValueError, 'int32 overflow'): df.pivot_table(index='ind1', columns='ind2', - values='count', aggfunc='count') + values='count', aggfunc='count') class TestCrosstab(object): From 7e6246c91cf6e059ca4ff65d74d47217bcf1133b Mon Sep 17 00:00:00 2001 From: Anh Le Date: Tue, 31 Jul 2018 12:22:42 -0500 Subject: [PATCH 8/8] DOC add whatsnew and comments explaining the bug fix --- 
doc/source/whatsnew/v0.24.0.txt | 2 +- pandas/core/reshape/reshape.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index d2d5d40393b62..a5a34811c4b83 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -615,7 +615,7 @@ Reshaping - Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`) - Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`) - Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`) -- +- Bug in :func:`pandas.pivot_table` when the number of unique index combinations exceeds the int32 maximum (:issue:`20601`) - Build Changes diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 9428bcc1700ab..694a1d3336469 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -126,6 +126,10 @@ def __init__(self, values, index, level=-1, value_columns=None, self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] + # Bug fix GH 20601 + # If the data frame is too big, + # the number of unique index combinations will cause int32 overflow + # We want to check and raise an error before this happens num_rows = np.max([index_level.size for index_level in self.new_index_levels]) num_columns = self.removed_level.size