Skip to content

Commit d85a5c3

Browse files
swebjreback
authored and committed
BUG: pivot/unstack leading to too many items should raise exception (#23512)
1 parent 2b82159 commit d85a5c3

File tree

5 files changed

+36
-2
lines changed

5 files changed

+36
-2
lines changed

doc/source/whatsnew/v0.24.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -1649,6 +1649,7 @@ Reshaping
16491649
- :meth:`DataFrame.nlargest` and :meth:`DataFrame.nsmallest` now return the correct n values when keep != 'all', including when values are tied on the first columns (:issue:`22752`)
16501650
- Constructing a DataFrame with an index argument that wasn't already an instance of :class:`~pandas.core.Index` was broken (:issue:`22227`).
16511651
- Bug in :class:`DataFrame` prevented list subclasses from being used for construction (:issue:`21226`)
1652+
- Bug in :func:`DataFrame.unstack` and :func:`DataFrame.pivot_table` returning a misleading error message when the resulting DataFrame has more elements than int32 can handle. Now, the error message is improved, pointing towards the actual problem (:issue:`20601`)
16521653

16531654
.. _whatsnew_0240.bug_fixes.sparse:
16541655

pandas/core/reshape/pivot.py

-2
Original file line numberDiff line numberDiff line change
@@ -78,8 +78,6 @@ def pivot_table(data, values=None, index=None, columns=None, aggfunc='mean',
7878
pass
7979
values = list(values)
8080

81-
# group by the cartesian product of the grouper
82-
# if we have a categorical
8381
grouped = data.groupby(keys, observed=False)
8482
agged = grouped.agg(aggfunc)
8583
if dropna and isinstance(agged, ABCDataFrame) and len(agged.columns):

pandas/core/reshape/reshape.py

+15
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,21 @@ def __init__(self, values, index, level=-1, value_columns=None,
109109
self.removed_level = self.new_index_levels.pop(self.level)
110110
self.removed_level_full = index.levels[self.level]
111111

112+
# Bug fix GH 20601
113+
# If the data frame is too big, the number of unique index combinations
114+
# will cause int32 overflow on Windows environments.
115+
# We want to check and raise an error before this happens
116+
num_rows = np.max([index_level.size for index_level
117+
in self.new_index_levels])
118+
num_columns = self.removed_level.size
119+
120+
# GH20601: This forces an overflow if the number of cells is too high.
121+
num_cells = np.multiply(num_rows, num_columns, dtype=np.int32)
122+
123+
if num_rows > 0 and num_columns > 0 and num_cells <= 0:
124+
raise ValueError('Unstacked DataFrame is too big, '
125+
'causing int32 overflow')
126+
112127
self._make_sorted_values_labels()
113128
self._make_selectors()
114129

pandas/tests/reshape/test_pivot.py

+11
Original file line numberDiff line numberDiff line change
@@ -1272,6 +1272,17 @@ def test_pivot_string_func_vs_func(self, f, f_numpy):
12721272
aggfunc=f_numpy)
12731273
tm.assert_frame_equal(result, expected)
12741274

1275+
@pytest.mark.slow
1276+
def test_pivot_number_of_levels_larger_than_int32(self):
1277+
# GH 20601
1278+
df = DataFrame({'ind1': np.arange(2 ** 16),
1279+
'ind2': np.arange(2 ** 16),
1280+
'count': 0})
1281+
1282+
with pytest.raises(ValueError, match='int32 overflow'):
1283+
df.pivot_table(index='ind1', columns='ind2',
1284+
values='count', aggfunc='count')
1285+
12751286

12761287
class TestCrosstab(object):
12771288

pandas/tests/test_multilevel.py

+9
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from warnings import catch_warnings, simplefilter
44
import datetime
55
import itertools
6+
67
import pytest
78
import pytz
89

@@ -720,6 +721,14 @@ def test_unstack_unobserved_keys(self):
720721
recons = result.stack()
721722
tm.assert_frame_equal(recons, df)
722723

724+
@pytest.mark.slow
725+
def test_unstack_number_of_levels_larger_than_int32(self):
726+
# GH 20601
727+
df = DataFrame(np.random.randn(2 ** 16, 2),
728+
index=[np.arange(2 ** 16), np.arange(2 ** 16)])
729+
with pytest.raises(ValueError, match='int32 overflow'):
730+
df.unstack()
731+
723732
def test_stack_order_with_unsorted_levels(self):
724733
# GH 16323
725734

0 commit comments

Comments
 (0)