diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index a84fd118061bc..5f40ca2ad3b36 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -1646,6 +1646,7 @@ Reshaping - :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now returns the correct n values when keep != 'all' also when tied on the first columns (:issue:`22752`) - Constructing a DataFrame with an index argument that wasn't already an instance of :class:`~pandas.core.Index` was broken (:issue:`22227`). - Bug in :class:`DataFrame` prevented list subclasses to be used to construction (:issue:`21226`) +- Bug in :func:`DataFrame.unstack` and :func:`DataFrame.pivot_table` returning a misleading error message when the resulting DataFrame has more elements than int32 can handle. Now, the error message is improved, pointing towards the actual problem (:issue:`20601`) .. _whatsnew_0240.bug_fixes.sparse: diff --git a/pandas/core/reshape/pivot.py b/pandas/core/reshape/pivot.py index 61ac5d9ed6a2e..c7c447d18b6b1 100644 --- a/pandas/core/reshape/pivot.py +++ b/pandas/core/reshape/pivot.py @@ -78,8 +78,6 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean', pass values = list(values) - # group by the cartesian product of the grouper - # if we have a categorical grouped = data.groupby(keys, observed=False) agged = grouped.agg(aggfunc) if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns): diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 70161826696c5..f436b3b92a359 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -109,6 +109,21 @@ def __init__(self, values, index, level=-1, value_columns=None, self.removed_level = self.new_index_levels.pop(self.level) self.removed_level_full = index.levels[self.level] + # Bug fix GH 20601 + # If the data frame is too big, the number of unique index combinations + # will cause int32 overflow on Windows
environments. + # We want to check and raise an error before this happens + num_rows = np.max([index_level.size for index_level + in self.new_index_levels]) + num_columns = self.removed_level.size + + # GH20601: This forces an overflow if the number of cells is too high. + num_cells = np.multiply(num_rows, num_columns, dtype=np.int32) + + if num_rows > 0 and num_columns > 0 and num_cells <= 0: + raise ValueError('Unstacked DataFrame is too big, ' + 'causing int32 overflow') + self._make_sorted_values_labels() self._make_selectors() diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py index e32e1999836ec..a2b5eacd873bb 100644 --- a/pandas/tests/reshape/test_pivot.py +++ b/pandas/tests/reshape/test_pivot.py @@ -1272,6 +1272,17 @@ def test_pivot_string_func_vs_func(self, f, f_numpy): aggfunc=f_numpy) tm.assert_frame_equal(result, expected) + @pytest.mark.slow + def test_pivot_number_of_levels_larger_than_int32(self): + # GH 20601 + df = DataFrame({'ind1': np.arange(2 ** 16), + 'ind2': np.arange(2 ** 16), + 'count': 0}) + + with pytest.raises(ValueError, match='int32 overflow'): + df.pivot_table(index='ind1', columns='ind2', + values='count', aggfunc='count') + class TestCrosstab(object): diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py index 6c1a2490ea76e..ce95f0f86ef7b 100644 --- a/pandas/tests/test_multilevel.py +++ b/pandas/tests/test_multilevel.py @@ -3,6 +3,7 @@ from warnings import catch_warnings, simplefilter import datetime import itertools + import pytest import pytz @@ -720,6 +721,14 @@ def test_unstack_unobserved_keys(self): recons = result.stack() tm.assert_frame_equal(recons, df) + @pytest.mark.slow + def test_unstack_number_of_levels_larger_than_int32(self): + # GH 20601 + df = DataFrame(np.random.randn(2 ** 16, 2), + index=[np.arange(2 ** 16), np.arange(2 ** 16)]) + with pytest.raises(ValueError, match='int32 overflow'): + df.unstack() + def test_stack_order_with_unsorted_levels(self): # GH 
16323