PERF: improves groupby.get_group_index when shape is a long sequence, #10161

behzadnouri · jreback · commit 3fb802a7f5b8 · 2015-09-25T08:17:55.000-04:00
diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
@@ -293,6 +293,14 @@ def setup(self):
     def time_frame_duplicated(self):
         self.df.duplicated()
 
+class frame_duplicated_wide(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 100).astype(str))
+
+    def time_frame_duplicated_wide(self):
+        self.df.T.duplicated()
 
 class frame_fancy_lookup(object):
     goal_time = 0.2
@@ -929,4 +937,4 @@ def setup(self):
         self.s = Series((['abcdefg', np.nan] * 500000))
 
     def time_series_string_vector_slice(self):
-        self.s.str[:5]
+        self.s.str[:5]
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -1045,6 +1045,7 @@ Performance Improvements
 - Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
 - Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` and ``SeriesGroupby.transform`` (:issue:`10820`, :issue:`11077`)
 - Performance improvements in ``DataFrame.drop_duplicates`` with integer dtypes (:issue:`10917`)
+- Performance improvements in ``DataFrame.duplicated`` with wide frames (:issue:`10161`, :issue:`11180`)
 - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
 - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
 - Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
@@ -3739,10 +3739,17 @@ def get_group_index(labels, shape, sort, xnull):
     An array of type int64 where two elements are equal if their corresponding
     labels are equal at all location.
     """
+    def _int64_cut_off(shape):
+        acc = long(1)
+        for i, mul in enumerate(shape):
+            acc *= long(mul)
+            if not acc < _INT64_MAX:
+                return i
+        return len(shape)
+
     def loop(labels, shape):
         # how many levels can be done without overflow:
-        pred = lambda i: not _int64_overflow_possible(shape[:i])
-        nlev = next(filter(pred, range(len(shape), 0, -1)))
+        nlev = _int64_cut_off(shape)
 
         # compute flat ids for the first `nlev` levels
         stride = np.prod(shape[1:nlev], dtype='i8')