Skip to content

ENH GH20601 raise error when pivot table's number of levels > int32 #20784

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 8 commits into from
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.24.0.txt
Original file line number Diff line number Diff line change
Expand Up @@ -615,7 +615,7 @@ Reshaping
- Bug in :meth:`Series.combine_first` with ``datetime64[ns, tz]`` dtype which would return tz-naive result (:issue:`21469`)
- Bug in :meth:`Series.where` and :meth:`DataFrame.where` with ``datetime64[ns, tz]`` dtype (:issue:`21546`)
- Bug in :meth:`Series.mask` and :meth:`DataFrame.mask` with ``list`` conditionals (:issue:`21891`)
-
- Bug in :func:`pandas.pivot_table` when the number of unique index combinations exceeds the ``int32`` limit; a ``ValueError`` is now raised (:issue:`20601`)
-

Build Changes
Expand Down
4 changes: 1 addition & 3 deletions pandas/core/reshape/pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,9 +81,7 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
pass
values = list(values)

# group by the cartesian product of the grouper
# if we have a categorical
grouped = data.groupby(keys, observed=False)
grouped = data.groupby(keys)
agged = grouped.agg(aggfunc)
if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):
agged = agged.dropna(how='all')
Expand Down
11 changes: 11 additions & 0 deletions pandas/core/reshape/reshape.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,17 @@ def __init__(self, values, index, level=-1, value_columns=None,
self.removed_level = self.new_index_levels.pop(self.level)
self.removed_level_full = index.levels[self.level]

# Bug fix for GH 20601.
# If the DataFrame is too big, the number of unique index combinations
# (rows * columns of the unstacked result) overflows int32.
# Check for this up front and raise an error before it happens.
num_rows = np.max([index_level.size for index_level
in self.new_index_levels])
num_columns = self.removed_level.size
if num_rows * num_columns > (2 ** 31 - 1):
raise ValueError('Unstacked DataFrame is too big, '
'causing int32 overflow')

self._make_sorted_values_labels()
self._make_selectors()

Expand Down
10 changes: 10 additions & 0 deletions pandas/tests/reshape/test_pivot.py
Original file line number Diff line number Diff line change
Expand Up @@ -1275,6 +1275,16 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
aggfunc=f_numpy)
tm.assert_frame_equal(result, expected)

# Regression test for GH 20601: pivot_table must raise a ValueError whose
# message matches 'int32 overflow' when the pivoted result would be too large.
@pytest.mark.slow
def test_pivot_number_of_levels_larger_than_int32(self):
# GH 20601
# 'ind1' and 'ind2' each hold 2 ** 16 distinct values, so pivoting with
# index='ind1' and columns='ind2' asks for 2 ** 16 * 2 ** 16 = 2 ** 32
# index/column combinations, which is above the int32 maximum (2 ** 31 - 1).
df = DataFrame({'ind1': np.arange(2 ** 16),
'ind2': np.arange(2 ** 16),
'count': np.arange(2 ** 16)})
# The call is expected to fail fast with the overflow error rather than
# produce a result.
with tm.assert_raises_regex(ValueError, 'int32 overflow'):
df.pivot_table(index='ind1', columns='ind2',
values='count', aggfunc='count')


class TestCrosstab(object):

Expand Down
8 changes: 8 additions & 0 deletions pandas/tests/test_multilevel.py
Original file line number Diff line number Diff line change
Expand Up @@ -1195,6 +1195,14 @@ def test_unstack_unobserved_keys(self):
recons = result.stack()
tm.assert_frame_equal(recons, df)

# Regression test for GH 20601: unstack must raise a ValueError whose message
# matches 'int32 overflow' when the unstacked result would be too large.
@pytest.mark.slow
def test_unstack_number_of_levels_larger_than_int32(self):
# GH 20601
# The index is given as two arrays of 2 ** 16 distinct values each
# (presumably producing a two-level MultiIndex -- confirm against the
# DataFrame constructor), so unstacking one level against the other asks
# for 2 ** 16 * 2 ** 16 = 2 ** 32 cells per column, above 2 ** 31 - 1.
df = DataFrame(np.random.randn(2 ** 16, 2),
index=[np.arange(2 ** 16), np.arange(2 ** 16)])
# unstack() is called with its default arguments and is expected to raise
# instead of returning a frame.
with tm.assert_raises_regex(ValueError, 'int32 overflow'):
df.unstack()

def test_stack_order_with_unsorted_levels(self):
# GH 16323

Expand Down