Commit 172c515

Michael Odintsov authored and jorisvandenbossche committed

BUG: Fix group index calculation to prevent hitting maximum recursion depth (#21541)

(cherry picked from commit f91a704)

1 parent d44fddb · commit 172c515
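For context, a minimal sketch of the failure this commit fixes, modeled on the regression test added below (the exact reproduction in gh-21524 may differ). DataFrame.duplicated() factorizes each column and combines the per-column codes level by level; before this change that combination recursed once per chunk of levels, so a frame with enough columns could exceed Python's recursion limit.

    import numpy as np
    import pandas as pd

    # 100 rows of random ints, transposed into a frame with 30000 columns.
    # duplicated() must combine a group index across all 30000 columns;
    # prior to this fix the combination was recursive and could raise
    # RecursionError: maximum recursion depth exceeded.
    data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
            for i in range(100)}
    df = pd.DataFrame(data).T

    result = df.duplicated()  # with this fix: a boolean Series, no error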

File tree: 3 files changed (+35, −12)

doc/source/whatsnew/v0.23.2.txt (+1)

@@ -58,6 +58,7 @@ Bug Fixes
 - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`)
 - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`)
+- Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`).
 -

 **I/O**
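For readers skimming the changelog entry: DataFrame.duplicated() returns a boolean Series marking rows that repeat an earlier row, e.g.:

    import pandas as pd

    df = pd.DataFrame({'a': [1, 1, 2], 'b': [3, 3, 4]})
    df.duplicated()
    # 0    False
    # 1     True   <- row 1 repeats row 0
    # 2    False
    # dtype: bool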

pandas/core/sorting.py (+17, −12)

@@ -52,7 +52,21 @@ def _int64_cut_off(shape):
                 return i
         return len(shape)

-    def loop(labels, shape):
+    def maybe_lift(lab, size):
+        # promote nan values (assigned -1 label in lab array)
+        # so that all output values are non-negative
+        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
+
+    labels = map(_ensure_int64, labels)
+    if not xnull:
+        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
+
+    labels = list(labels)
+    shape = list(shape)
+
+    # Iteratively process all the labels in chunks sized so less
+    # than _INT64_MAX unique int ids will be required for each chunk
+    while True:
         # how many levels can be done without overflow:
         nlev = _int64_cut_off(shape)

@@ -74,7 +88,7 @@ def loop(labels, shape):
         out[mask] = -1

         if nlev == len(shape):  # all levels done!
-            return out
+            break

         # compress what has been done so far in order to avoid overflow
         # to retain lexical ranks, obs_ids should be sorted
@@ -83,16 +97,7 @@
         labels = [comp_ids] + labels[nlev:]
         shape = [len(obs_ids)] + shape[nlev:]

-        return loop(labels, shape)
-
-    def maybe_lift(lab, size):  # pormote nan values
-        return (lab + 1, size + 1) if (lab == -1).any() else (lab, size)
-
-    labels = map(_ensure_int64, labels)
-    if not xnull:
-        labels, shape = map(list, zip(*map(maybe_lift, labels, shape)))
-
-    return loop(list(labels), list(shape))
+    return out


 def get_compressed_ids(labels, sizes):
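The shape of the change above is a standard tail-recursion-to-loop rewrite: the recursive helper loop() carried its updated labels/shape into a fresh stack frame for each chunk of levels, while the new code reassigns them and continues a while True. A simplified, self-contained sketch of the pattern (not pandas' actual code; combine and compress are stand-ins for the flat-id computation and compress_group_index):

    def reduce_recursive(items, combine, compress, chunk=2):
        # old shape: one Python stack frame per chunk of work,
        # so many chunks -> RecursionError
        out = combine(items[:chunk])
        if chunk >= len(items):
            return out
        return reduce_recursive([compress(out)] + items[chunk:],
                                combine, compress, chunk)

    def reduce_iterative(items, combine, compress, chunk=2):
        # new shape: the same result at constant stack depth
        while True:
            out = combine(items[:chunk])
            if chunk >= len(items):
                return out
            items = [compress(out)] + items[chunk:]

Both return the same value; for example, reduce_iterative(list(range(10000)), sum, lambda x: x) completes where the recursive form would need thousands of stack frames.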

pandas/tests/frame/test_analytics.py (+17)

@@ -1507,6 +1507,23 @@ def test_duplicated_with_misspelled_column_name(self, subset):
         with pytest.raises(KeyError):
             df.drop_duplicates(subset)

+    @pytest.mark.slow
+    def test_duplicated_do_not_fail_on_wide_dataframes(self):
+        # gh-21524
+        # Given a wide DataFrame with a large number of columns
+        # holding distinct (important!) values
+        data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000)
+                for i in range(100)}
+        df = pd.DataFrame(data).T
+        result = df.duplicated()
+
+        # Then duplicated() produces a bool pd.Series as a result
+        # and does not fail during calculation.
+        # The actual values don't matter here, though usually
+        # they are all False in this case
+        assert isinstance(result, pd.Series)
+        assert result.dtype == np.bool
+
     def test_drop_duplicates_with_duplicate_column_names(self):
         # GH17836
         df = DataFrame([
