diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt
index d2d5d40393b62..a5a34811c4b83 100644
--- a/doc/source/whatsnew/v0.24.0.txt
+++ b/doc/source/whatsnew/v0.24.0.txt
@@ -615,7 +615,7 @@ Reshaping
 - Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`)
 - Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`)
 - Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`)
--
+- Bug in :func:`pandas.pivot_table` and :meth:`DataFrame.unstack` when the number of unique index combinations exceeds the int32 limit; a ``ValueError`` is now raised (:issue:`20601`)
 -
 
 Build Changes
diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py
index f9ab813855f47..694a1d3336469 100644
--- a/pandas/core/reshape/reshape.py
+++ b/pandas/core/reshape/reshape.py
@@ -126,6 +126,20 @@ def __init__(self, values, index, level=-1, value_columns=None,
         self.removed_level = self.new_index_levels.pop(self.level)
         self.removed_level_full = index.levels[self.level]
 
+        # Bug fix GH 20601
+        # If the data frame is too big,
+        # the number of unique index combinations will cause int32 overflow
+        # We want to check and raise an error before this happens
+        # The sizes are kept as Python ints (builtin max, not np.max) so
+        # the product below cannot itself wrap around on platforms where
+        # NumPy's default integer is 32-bit
+        num_rows = max(index_level.size
+                       for index_level in self.new_index_levels)
+        num_columns = self.removed_level.size
+        if num_rows * num_columns > (2 ** 31 - 1):
+            raise ValueError('Unstacked DataFrame is too big, '
+                             'causing int32 overflow')
+
         self._make_sorted_values_labels()
         self._make_selectors()
 
diff --git a/pandas/tests/reshape/test_pivot.py b/pandas/tests/reshape/test_pivot.py
index 7e7e081408534..06a7c72969d37 100644
--- a/pandas/tests/reshape/test_pivot.py
+++ b/pandas/tests/reshape/test_pivot.py
@@ -1275,6 +1275,16 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
                                aggfunc=f_numpy)
         tm.assert_frame_equal(result, expected)
 
+    @pytest.mark.slow
+    def test_pivot_number_of_levels_larger_than_int32(self):
+        # GH 20601
+        df = DataFrame({'ind1': np.arange(2 ** 16),
+                        'ind2': np.arange(2 ** 16),
+                        'count': np.arange(2 ** 16)})
+        with tm.assert_raises_regex(ValueError, 'int32 overflow'):
+            df.pivot_table(index='ind1', columns='ind2',
+                           values='count', aggfunc='count')
+
 
 class TestCrosstab(object):
 
diff --git a/pandas/tests/test_multilevel.py b/pandas/tests/test_multilevel.py
index 3caee2b44c579..bdac0d13b84a3 100644
--- a/pandas/tests/test_multilevel.py
+++ b/pandas/tests/test_multilevel.py
@@ -1195,6 +1195,14 @@ def test_unstack_unobserved_keys(self):
         recons = result.stack()
         tm.assert_frame_equal(recons, df)
 
+    @pytest.mark.slow
+    def test_unstack_number_of_levels_larger_than_int32(self):
+        # GH 20601
+        df = DataFrame(np.random.randn(2 ** 16, 2),
+                       index=[np.arange(2 ** 16), np.arange(2 ** 16)])
+        with tm.assert_raises_regex(ValueError, 'int32 overflow'):
+            df.unstack()
+
     def test_stack_order_with_unsorted_levels(self):
         # GH 16323
 