diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt index 0f2c9c4756987..a89a84a15bbdc 100644 --- a/doc/source/whatsnew/v0.23.2.txt +++ b/doc/source/whatsnew/v0.23.2.txt @@ -60,6 +60,7 @@ Bug Fixes - Bug in :meth:`Index.get_indexer_non_unique` with categorical key (:issue:`21448`) - Bug in comparison operations for :class:`MultiIndex` where error was raised on equality / inequality comparison involving a MultiIndex with ``nlevels == 1`` (:issue:`21149`) +- Bug in :func:`DataFrame.duplicated` with a large number of columns causing a 'maximum recursion depth exceeded' (:issue:`21524`). - **I/O** diff --git a/pandas/core/sorting.py b/pandas/core/sorting.py index e550976d1deeb..212f44e55c489 100644 --- a/pandas/core/sorting.py +++ b/pandas/core/sorting.py @@ -52,7 +52,21 @@ def _int64_cut_off(shape): return i return len(shape) - def loop(labels, shape): + def maybe_lift(lab, size): + # promote nan values (assigned -1 label in lab array) + # so that all output values are non-negative + return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) + + labels = map(_ensure_int64, labels) + if not xnull: + labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) + + labels = list(labels) + shape = list(shape) + + # Iteratively process all the labels in chunks sized so less + # than _INT64_MAX unique int ids will be required for each chunk + while True: # how many levels can be done without overflow: nlev = _int64_cut_off(shape) @@ -74,7 +88,7 @@ def loop(labels, shape): out[mask] = -1 if nlev == len(shape): # all levels done! - return out + break # compress what has been done so far in order to avoid overflow # to retain lexical ranks, obs_ids should be sorted @@ -83,16 +97,7 @@ def loop(labels, shape): labels = [comp_ids] + labels[nlev:] shape = [len(obs_ids)] + shape[nlev:] - return loop(labels, shape) - - def maybe_lift(lab, size): # pormote nan values - return (lab + 1, size + 1) if (lab == -1).any() else (lab, size) - - labels = map(_ensure_int64, labels) - if not xnull: - labels, shape = map(list, zip(*map(maybe_lift, labels, shape))) - - return loop(list(labels), list(shape)) + return out def get_compressed_ids(labels, sizes): diff --git a/pandas/tests/frame/test_analytics.py b/pandas/tests/frame/test_analytics.py index 6dc24ed856017..12ebdbe0fd3c7 100644 --- a/pandas/tests/frame/test_analytics.py +++ b/pandas/tests/frame/test_analytics.py @@ -1527,6 +1527,23 @@ def test_duplicated_with_misspelled_column_name(self, subset): with pytest.raises(KeyError): df.drop_duplicates(subset) + @pytest.mark.slow + def test_duplicated_do_not_fail_on_wide_dataframes(self): + # gh-21524 + # Given the wide dataframe with a lot of columns + # with different (important!) values + data = {'col_{0:02d}'.format(i): np.random.randint(0, 1000, 30000) + for i in range(100)} + df = pd.DataFrame(data).T + result = df.duplicated() + + # Then duplicates produce the bool pd.Series as a result + # and don't fail during calculation. + # Actual values doesn't matter here, though usually + # it's all False in this case + assert isinstance(result, pd.Series) + assert result.dtype == np.bool + def test_drop_duplicates_with_duplicate_column_names(self): # GH17836 df = DataFrame([