From da25b19189f6b43c3d2bac73f3ebded9c85ac4b2 Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 16 Apr 2018 01:14:23 -0400 Subject: [PATCH 01/13] ENH GH20601 raise an error when the number of levels in a pivot table larger than int32 --- pandas/core/reshape/reshape.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 03b77f0e787f0..31aa91fec9622 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -162,6 +162,8 @@ def _make_selectors(self): self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index + self.lift + if np.prod(self.full_shape) > (2 ** 31 - 1): + raise ValueError('Pivot table is too big, causing int32 overflow') mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) From e6c88c1e18becc79f02c61dbf3846659d61e135f Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 16 Apr 2018 01:53:06 -0400 Subject: [PATCH 02/13] TST add a test for pivot table large number of levels causing int32 overflow --- pandas/tests/reshape/test_pivot.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 1cb036dccf23c..f2fb8625f6d3f 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1276,6 +1276,14 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): aggfunc=f_numpy) tm.assert_frame_equal(result, expected) + @pytest.mark.slow + def test_pivot_number_of_levels_larger_than_int32(self): + # GH 20601 + data = DataFrame({'ind1': list(range(1337600)) * 2, + 'ind2': list(range(3040)) * 2 * 440, 'count': [1] * 2 * 1337600}) + with tm.assert_raises_regex(ValueError, 'int32 overflow'): + data.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count') + class TestCrosstab(object): From db2319eee68650ad8089eaf6dc3680badf32bb1a Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 16 Apr 2018 01:55:56 
-0400 Subject: [PATCH 03/13] CLN PEP8 compliance --- pandas/tests/reshape/test_pivot.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index f2fb8625f6d3f..5ccedf92391af 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1280,9 +1280,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 data = DataFrame({'ind1': list(range(1337600)) * 2, - 'ind2': list(range(3040)) * 2 * 440, 'count': [1] * 2 * 1337600}) + 'ind2': list(range(3040)) * 2 * 440, + 'count': [1] * 2 * 1337600}) with tm.assert_raises_regex(ValueError, 'int32 overflow'): - data.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count') + data.pivot_table(index='ind1', columns='ind2', + values='count', aggfunc='count') class TestCrosstab(object): From 6b7b03065fa859dd367589a2d3c65835669d7cd1 Mon Sep 17 00:00:00 2001 From: Anh Le Date: Sun, 22 Apr 2018 02:20:18 -0400 Subject: [PATCH 04/13] ENH catch the int32 overflow error earlier and in two separate places: in pivot_table and unstack --- pandas/core/reshape/pivot.py | 5 +++++ pandas/core/reshape/reshape.py | 7 +++++-- pandas/tests/reshape/test_pivot.py | 8 ++++---- pandas/tests/test_multilevel.py | 7 +++++++ 4 files changed, 21 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 22e591e776a22..bb6e849cc94dd 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -31,6 +31,11 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', index = _convert_by(index) columns = _convert_by(columns) + num_rows = data.reindex(index, axis='columns').shape[0] + num_columns = data.reindex(columns, axis='columns').shape[0] + if num_rows * num_columns > (2 ** 31 - 1): + raise ValueError('Pivot table is too big, causing int32 overflow') + if 
isinstance(aggfunc, list): pieces = [] keys = [] diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 31aa91fec9622..226a8391a5ca5 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -127,6 +127,11 @@ def __init__(self, values, index, level=-1, value_columns=None, self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] + num_rows = np.max([index_level.size for index_level in self.new_index_levels]) + num_columns = self.removed_level.size + if num_rows * num_columns > (2 ** 31 - 1): + raise ValueError('Unstacked data frame is too big, causing int32 overflow') + self._make_sorted_values_labels() self._make_selectors() @@ -162,8 +167,6 @@ def _make_selectors(self): self.full_shape = ngroups, stride selector = self.sorted_labels[-1] + stride * comp_index + self.lift - if np.prod(self.full_shape) > (2 ** 31 - 1): - raise ValueError('Pivot table is too big, causing int32 overflow') mask = np.zeros(np.prod(self.full_shape), dtype=bool) mask.put(selector, True) diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 5ccedf92391af..8935cb6274733 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1279,11 +1279,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): @pytest.mark.slow def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 - data = DataFrame({'ind1': list(range(1337600)) * 2, - 'ind2': list(range(3040)) * 2 * 440, - 'count': [1] * 2 * 1337600}) + df = DataFrame({'ind1': np.arange(2 ** 16), + 'ind2': np.arange(2 ** 16), + 'count': np.arange(2 ** 16)}) with tm.assert_raises_regex(ValueError, 'int32 overflow'): - data.pivot_table(index='ind1', columns='ind2', + df.pivot_table(index='ind1', columns='ind2', values='count', aggfunc='count') diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 2022340926cca..94b7e31744836 100644 
--- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1212,6 +1212,13 @@ def test_unstack_unobserved_keys(self): recons = result.stack() tm.assert_frame_equal(recons, df) + @pytest.mark.slow + def test_unstack_number_of_levels_larger_than_int32(self): + # GH 20601 + df = DataFrame(np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)]) + with tm.assert_raises_regex(ValueError, 'int32 overflow'): + df.unstack() + def test_stack_order_with_unsorted_levels(self): # GH 16323 From 23dae9344061fe6b9cd4d4cc4994b6369140dce8 Mon Sep 17 00:00:00 2001 From: Anh Le Date: Sun, 22 Apr 2018 02:48:08 -0400 Subject: [PATCH 05/13] CLN PEP8 compliance --- pandas/core/reshape/reshape.py | 6 ++++-- pandas/tests/test_multilevel.py | 3 ++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 226a8391a5ca5..c649e2d751733 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -127,10 +127,12 @@ def __init__(self, values, index, level=-1, value_columns=None, self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] - num_rows = np.max([index_level.size for index_level in self.new_index_levels]) + num_rows = np.max([index_level.size for index_level + in self.new_index_levels]) num_columns = self.removed_level.size if num_rows * num_columns > (2 ** 31 - 1): - raise ValueError('Unstacked data frame is too big, causing int32 overflow') + raise ValueError('Unstacked DataFrame is too big, ' + 'causing int32 overflow') self._make_sorted_values_labels() self._make_selectors() diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 94b7e31744836..9f14ee3cb7f0d 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -1215,7 +1215,8 @@ def test_unstack_unobserved_keys(self): @pytest.mark.slow def 
test_unstack_number_of_levels_larger_than_int32(self): # GH 20601 - df = DataFrame(np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)]) + df = DataFrame(np.random.randn(2 ** 16, 2), + index=[np.arange(2 ** 16), np.arange(2 ** 16)]) with tm.assert_raises_regex(ValueError, 'int32 overflow'): df.unstack() From a69438fd261cd7cbc828bba8b1fa6aa51f6d40fb Mon Sep 17 00:00:00 2001 From: Anh Le Date: Sun, 22 Apr 2018 13:15:48 -0400 Subject: [PATCH 06/13] ENH calculate size of the resulting pivot table and raise error if it's too big --- pandas/core/reshape/pivot.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index bb6e849cc94dd..16aa4df74b2d8 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -31,11 +31,6 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', index = _convert_by(index) columns = _convert_by(columns) - num_rows = data.reindex(index, axis='columns').shape[0] - num_columns = data.reindex(columns, axis='columns').shape[0] - if num_rows * num_columns > (2 ** 31 - 1): - raise ValueError('Pivot table is too big, causing int32 overflow') - if isinstance(aggfunc, list): pieces = [] keys = [] @@ -86,9 +81,14 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - # group by the cartesian product of the grouper - # if we have a categorical - grouped = data.groupby(keys, observed=False) + num_rows = (data.reindex(columns=index).drop_duplicates().shape[0] + if index else 1) + num_cols = (data.reindex(columns=columns).drop_duplicates().shape[0] + if columns else 1) + if num_rows * num_cols * len(values) > (2 ** 31 - 1): + raise ValueError('Pivot table is too big, causing int32 overflow') + + grouped = data.groupby(keys) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = 
agged.dropna(how='all') From b44ca163ab993dc800f0f8f91807d10d096b9382 Mon Sep 17 00:00:00 2001 From: Anh Le Date: Mon, 30 Jul 2018 15:40:26 -0500 Subject: [PATCH 07/13] rebase onto upstream master --- pandas/core/reshape/pivot.py | 7 ------- pandas/tests/reshape/test_pivot.py | 6 +++--- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 16aa4df74b2d8..611cd350a3e53 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -81,13 +81,6 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - num_rows = (data.reindex(columns=index).drop_duplicates().shape[0] - if index else 1) - num_cols = (data.reindex(columns=columns).drop_duplicates().shape[0] - if columns else 1) - if num_rows * num_cols * len(values) > (2 ** 31 - 1): - raise ValueError('Pivot table is too big, causing int32 overflow') - grouped = data.groupby(keys) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 8935cb6274733..eb76ff71bf152 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1280,11 +1280,11 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 df = DataFrame({'ind1': np.arange(2 ** 16), - 'ind2': np.arange(2 ** 16), - 'count': np.arange(2 ** 16)}) + 'ind2': np.arange(2 ** 16), + 'count': np.arange(2 ** 16)}) with tm.assert_raises_regex(ValueError, 'int32 overflow'): df.pivot_table(index='ind1', columns='ind2', - values='count', aggfunc='count') + values='count', aggfunc='count') class TestCrosstab(object): From 59678a65fd0c811920acd6a677fb37c27887f1a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Thu, 8 Nov 2018 12:03:13 +0100 Subject: [PATCH 08/13] ENH: Raise and catch 
FloatingPointException due to overflow * Modify tests to only cover windows platforms --- pandas/core/reshape/reshape.py | 9 ++++++--- pandas/tests/reshape/test_pivot.py | 15 +++++++++------ pandas/tests/test_multilevel.py | 13 ++++++++----- 3 files changed, 23 insertions(+), 14 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index c649e2d751733..700440ec4aeef 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -130,9 +130,12 @@ def __init__(self, values, index, level=-1, value_columns=None, num_rows = np.max([index_level.size for index_level in self.new_index_levels]) num_columns = self.removed_level.size - if num_rows * num_columns > (2 ** 31 - 1): - raise ValueError('Unstacked DataFrame is too big, ' - 'causing int32 overflow') + with np.errstate(all='raise'): + try: + num_columns * num_rows + except FloatingPointError: + raise ValueError('Unstacked DataFrame is too big, ' + 'causing int32 overflow') self._make_sorted_values_labels() self._make_selectors() diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index eb76ff71bf152..9aa4800597ca7 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1,6 +1,7 @@ # -*- coding: utf-8 -*- from datetime import datetime, date, timedelta +import sys import pytest @@ -1279,12 +1280,14 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): @pytest.mark.slow def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 - df = DataFrame({'ind1': np.arange(2 ** 16), - 'ind2': np.arange(2 ** 16), - 'count': np.arange(2 ** 16)}) - with tm.assert_raises_regex(ValueError, 'int32 overflow'): - df.pivot_table(index='ind1', columns='ind2', - values='count', aggfunc='count') + if sys.platform == 'win32': + df = DataFrame({'ind1': np.arange(2 ** 16), + 'ind2': np.arange(2 ** 16), + 'count': 0}) + + with tm.assert_raises_regex(ValueError, 'int32 overflow'): + df.pivot_table(index='ind1', 
columns='ind2', + values='count', aggfunc='count') class TestCrosstab(object): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 9f14ee3cb7f0d..68fa3643e4f18 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -3,6 +3,8 @@ from warnings import catch_warnings, simplefilter import datetime import itertools +import sys + import pytest import pytz @@ -1214,11 +1216,12 @@ def test_unstack_unobserved_keys(self): @pytest.mark.slow def test_unstack_number_of_levels_larger_than_int32(self): - # GH 20601 - df = DataFrame(np.random.randn(2 ** 16, 2), - index=[np.arange(2 ** 16), np.arange(2 ** 16)]) - with tm.assert_raises_regex(ValueError, 'int32 overflow'): - df.unstack() + if sys.platform == 'win32': + # GH 20601 + df = DataFrame(np.random.randn(2 ** 16, 2), + index=[np.arange(2 ** 16), np.arange(2 ** 16)]) + with tm.assert_raises_regex(ValueError, 'int32 overflow'): + df.unstack() def test_stack_order_with_unsorted_levels(self): # GH 16323 From 4dbbad750fb330663646160dd1190a6bcaf4231e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Mon, 12 Nov 2018 15:16:58 +0100 Subject: [PATCH 09/13] ENH: use pd.compat for windows check, add comment --- pandas/core/reshape/pivot.py | 2 +- pandas/core/reshape/reshape.py | 4 ++++ pandas/tests/reshape/test_pivot.py | 4 ++-- pandas/tests/test_multilevel.py | 6 +++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 611cd350a3e53..537eb290f8e83 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -81,7 +81,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - grouped = data.groupby(keys) + grouped = data.groupby(keys, observed=False) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): agged = agged.dropna(how='all') diff --git 
a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 700440ec4aeef..2292384e681a2 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -127,6 +127,10 @@ def __init__(self, values, index, level=-1, value_columns=None, self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] + # Bug fix GH 20601 + # If the data frame is too big, the number of unique index combination + # will cause int32 overflow on windows environments. + # We want to check and raise an error before this happens num_rows = np.max([index_level.size for index_level in self.new_index_levels]) num_columns = self.removed_level.size diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index 9aa4800597ca7..d2663ba9c6e05 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -13,7 +13,7 @@ from pandas import (DataFrame, Series, Index, MultiIndex, Grouper, date_range, concat, Categorical) from pandas.core.reshape.pivot import pivot_table, crosstab -from pandas.compat import range, product +from pandas.compat import range, product, is_platform_windows import pandas.util.testing as tm from pandas.api.types import CategoricalDtype as CDT @@ -1280,7 +1280,7 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): @pytest.mark.slow def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 - if sys.platform == 'win32': + if is_platform_windows(): df = DataFrame({'ind1': np.arange(2 ** 16), 'ind2': np.arange(2 ** 16), 'count': 0}) diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 68fa3643e4f18..b8cbfe65c49df 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -19,7 +19,7 @@ import pandas.core.common as com import pandas.util.testing as tm from pandas.compat import (range, lrange, StringIO, lzip, u, product as - cart_product, zip) + cart_product, zip, 
is_platform_windows) import pandas as pd import pandas._libs.index as _index @@ -1216,8 +1216,8 @@ def test_unstack_unobserved_keys(self): @pytest.mark.slow def test_unstack_number_of_levels_larger_than_int32(self): - if sys.platform == 'win32': - # GH 20601 + # GH 20601 + if is_platform_windows(): df = DataFrame(np.random.randn(2 ** 16, 2), index=[np.arange(2 ** 16), np.arange(2 ** 16)]) with tm.assert_raises_regex(ValueError, 'int32 overflow'): From 263f598aa6b26330899fd437506b2ef0c0b3de4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Sun, 30 Dec 2018 16:58:57 +0100 Subject: [PATCH 10/13] ENH: ValueError on all platforms when max int32 is reached --- pandas/core/reshape/reshape.py | 12 ++++++------ pandas/tests/reshape/test_pivot.py | 18 ++++++++---------- pandas/tests/test_multilevel.py | 12 +++++------- 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 2d8ad48f46bc4..b7ee68b8feec8 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -116,12 +116,12 @@ def __init__(self, values, index, level=-1, value_columns=None, num_rows = np.max([index_level.size for index_level in self.new_index_levels]) num_columns = self.removed_level.size - with np.errstate(all='raise'): - try: - num_columns * num_rows - except FloatingPointError: - raise ValueError('Unstacked DataFrame is too big, ' - 'causing int32 overflow') + + num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) + + if num_cells <= 0: + raise ValueError('Unstacked DataFrame is too big, ' + 'causing int32 overflow') self._make_sorted_values_labels() self._make_selectors() diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index b031c80178066..0610db2e9a2b4 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1,7 +1,6 @@ # -*- coding: utf-8 -*- from datetime import datetime, date, timedelta -import 
sys import pytest @@ -13,7 +12,7 @@ from pandas import (DataFrame, Series, Index, MultiIndex, Grouper, date_range, concat, Categorical) from pandas.core.reshape.pivot import pivot_table, crosstab -from pandas.compat import range, product, is_platform_windows +from pandas.compat import range, product import pandas.util.testing as tm from pandas.api.types import CategoricalDtype as CDT @@ -1276,14 +1275,13 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): @pytest.mark.slow def test_pivot_number_of_levels_larger_than_int32(self): # GH 20601 - if is_platform_windows(): - df = DataFrame({'ind1': np.arange(2 ** 16), - 'ind2': np.arange(2 ** 16), - 'count': 0}) - - with tm.assert_raises_regex(ValueError, 'int32 overflow'): - df.pivot_table(index='ind1', columns='ind2', - values='count', aggfunc='count') + df = DataFrame({'ind1': np.arange(2 ** 16), + 'ind2': np.arange(2 ** 16), + 'count': 0}) + + with pytest.raises(ValueError, match='int32 overflow'): + df.pivot_table(index='ind1', columns='ind2', + values='count', aggfunc='count') class TestCrosstab(object): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index d51126ad8a4cc..85bcb5c530c08 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -3,7 +3,6 @@ from warnings import catch_warnings, simplefilter import datetime import itertools -import sys import pytest import pytz @@ -17,7 +16,7 @@ from pandas.core.dtypes.common import is_float_dtype, is_integer_dtype import pandas.util.testing as tm from pandas.compat import (range, lrange, StringIO, lzip, u, product as - cart_product, zip, is_platform_windows) + cart_product, zip) import pandas as pd AGG_FUNCTIONS = ['sum', 'prod', 'min', 'max', 'median', 'mean', 'skew', 'mad', @@ -725,11 +724,10 @@ def test_unstack_unobserved_keys(self): @pytest.mark.slow def test_unstack_number_of_levels_larger_than_int32(self): # GH 20601 - if is_platform_windows(): - df = DataFrame(np.random.randn(2 ** 16, 2), - 
index=[np.arange(2 ** 16), np.arange(2 ** 16)]) - with tm.assert_raises_regex(ValueError, 'int32 overflow'): - df.unstack() + df = DataFrame(np.random.randn(2 ** 16, 2), + index=[np.arange(2 ** 16), np.arange(2 ** 16)]) + with pytest.raises(ValueError, match='int32 overflow'): + df.unstack() def test_stack_order_with_unsorted_levels(self): # GH 16323 From b96689d8ec66ee90384ad34c2a87a36cdd0abc7d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Sun, 30 Dec 2018 17:13:21 +0100 Subject: [PATCH 11/13] CLN: Added comment for overflow --- pandas/core/reshape/reshape.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index ea649f267a21a..1d5bac415f648 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -117,6 +117,7 @@ def __init__(self, values, index, level=-1, value_columns=None, in self.new_index_levels]) num_columns = self.removed_level.size + # GH20601: This forces an overflow if the number of cells is too high. num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) if num_cells <= 0: From 241729ff7d0a7687decba63dca36ad2876557b9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Sun, 30 Dec 2018 18:01:26 +0100 Subject: [PATCH 12/13] BUG: zero cells should be allowed --- pandas/core/reshape/reshape.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 1d5bac415f648..f436b3b92a359 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -120,7 +120,7 @@ def __init__(self, values, index, level=-1, value_columns=None, # GH20601: This forces an overflow if the number of cells is too high. 
num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) - if num_cells <= 0: + if num_rows > 0 and num_columns > 0 and num_cells <= 0: raise ValueError('Unstacked DataFrame is too big, ' 'causing int32 overflow') From a3cdbca33363c520afa5b9f9dc564e09096251bb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Florian=20M=C3=BCller?= Date: Mon, 31 Dec 2018 12:43:46 +0100 Subject: [PATCH 13/13] DOC: Added whatsnew entry (#23512) --- doc/source/whatsnew/v0.24.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a84fd118061bc..5f40ca2ad3b36 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1646,6 +1646,7 @@ Reshaping - :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) - Constructing a DataFrame with an index argument that wasn't already an instance of :class:`~pandas.core.Index` was broken (:issue:`22227`). - Bug in :class:`DataFrame` prevented list subclasses to be used to construction (:issue:`21226`) +- Bug in :func:`DataFrame.unstack` and :func:`DataFrame.pivot_table` returning a misleading error message when the resulting DataFrame has more elements than int32 can handle. Now, the error message is improved, pointing towards the actual problem (:issue:`20601`) .. _whatsnew_0240.bug_fixes.sparse: