Commit 172c515

Michael Odintsov authored and jorisvandenbossche committed

BUG: Fix group index calculation to prevent hitting maximum recursion depth (#21541)

(cherry picked from commit f91a704)

1 parent d44fddb · commit 172c515
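For context, a minimal sketch of the failure this commit fixes, modeled on the regression test added below (the exact reproduction in gh-21524 may differ). DataFrame.duplicated() factorizes each column and combines the per-column codes level by level; before this change that combination recursed once per chunk of levels, so a frame with enough columns could exceed Python's recursion limit.

    import numpy as np
    import pandas as pd

    # 100 rows of random ints, transposed into a frame with 30000 columns.
    # duplicated() must combine a group index across all 30000 columns;
    # prior to this fix the combination was recursive and could raise
    # RecursionError: maximum recursion depth exceeded.
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = pd.DataFrame(data).T

    result = df.duplicated()  # with this fix: a boolean Series, no error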

File tree: 3 files changed (+35, −12)

doc/source/whatsnew/v0.23.2.txt (+1)

@@ -58,6 +58,7 @@ Bug Fixes
 - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`)
 - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`)
+- Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`).
 -

 **I/O**
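For readers skimming the changelog entry: DataFrame.duplicated() returns a boolean Series marking rows that repeat an earlier row, e.g.:

    import pandas as pd

    df = pd.DataFrame({'a': [1, 1, 2], 'b': [3, 3, 4]})
    df.duplicated()
    # 0    False
    # 1     True   <- row 1 repeats row 0
    # 2    False
    # dtype: bool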

pandas/core/sorting.py (+17, −12)

@@ -52,7 +52,21 @@ def _int64_cut_off(shape):
                 return i
         return len(shape)

-    def loop(labels, shape):
+    def maybe_lift(lab, size):
+        # promote nan values (assigned -1 label in lab array)
+        # so that all output values are non-negative
+        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
+
+    labels = map(_ensure_int64, labels)
+    if not xnull:
+        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+
+    labels = list(labels)
+    shape = list(shape)
+
+    # Iteratively process all the labels in chunks sized so less
+    # than _INT64_MAX unique int ids will be required for each chunk
+    while True:
         # how many levels can be done without overflow:
         nlev = _int64_cut_off(shape)

@@ -74,7 +88,7 @@ def loop(labels, shape):
         out[mask] = -1

         if nlev == len(shape):  # all levels done!
-            return out
+            break

         # compress what has been done so far in order to avoid overflow
         # to retain lexical ranks, obs_ids should be sorted
@@ -83,16 +97,7 @@
         labels = [comp_ids] + labels[nlev:]
         shape = [len(obs_ids)] + shape[nlev:]

-        return loop(labels, shape)
-
-    def maybe_lift(lab, size):  # pormote nan values
-        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
-
-    labels = map(_ensure_int64, labels)
-    if not xnull:
-        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
-
-    return loop(list(labels), list(shape))
+    return out


 def get_compressed_ids(labels, sizes):
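The shape of the change above is a standard tail-recursion-to-loop rewrite: the recursive helper loop() carried its updated labels/shape into a fresh stack frame for each chunk of levels, while the new code reassigns them and continues a while True. A simplified, self-contained sketch of the pattern (not pandas' actual code; combine and compress are stand-ins for the flat-id computation and compress_group_index):

    def reduce_recursive(items, combine, compress, chunk=2):
        # old shape: one Python stack frame per chunk of work,
        # so many chunks -> RecursionError
        out = combine(items[:chunk])
        if chunk >= len(items):
            return out
        return reduce_recursive([compress(out)] + items[chunk:],
                                combine, compress, chunk)

    def reduce_iterative(items, combine, compress, chunk=2):
        # new shape: the same result at constant stack depth
        while True:
            out = combine(items[:chunk])
            if chunk >= len(items):
                return out
            items = [compress(out)] + items[chunk:]

Both return the same value; for example, reduce_iterative(list(range(10000)), sum, lambda x: x) completes where the recursive form would need thousands of stack frames.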

pandas/tests/frame/test_analytics.py (+17)

@@ -1507,6 +1507,23 @@ def test_duplicated_with_misspelled_column_name(self, subset):
         with pytest.raises(KeyError):
             df.drop_duplicates(subset)

+    @pytest.mark.slow
+    def test_duplicated_do_not_fail_on_wide_dataframes(self):
+        # gh-21524
+        # Given a wide DataFrame with a large number of columns
+        # holding distinct (important!) values
+        data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
+                for i in range(100)}
+        df = pd.DataFrame(data).T
+        result = df.duplicated()
+
+        # Then duplicated() produces a bool pd.Series as a result
+        # and does not fail during calculation.
+        # The actual values don't matter here, though usually
+        # they are all False in this case
+        assert isinstance(result, pd.Series)
+        assert result.dtype == np.bool
+
     def test_drop_duplicates_with_duplicate_column_names(self):
         # GH17836
         df = DataFrame([
