From a7e644e779a418289de0db42ca6d2d47c88c04b0 Mon Sep 17 00:00:00 2001
From: behzad nouri <behzadnouri@gmail.com>
Date: Wed, 23 Sep 2015 18:26:43 -0400
Subject: [PATCH] improves groupby.get_group_index when shape is a long
 sequence

---
 asv_bench/benchmarks/frame_methods.py | 10 +++++++++-
 doc/source/whatsnew/v0.17.0.txt       |  1 +
 pandas/core/groupby.py                | 11 +++++++++--
 3 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py
index 98b0ec73fb23c..9bece56e15c90 100644
--- a/asv_bench/benchmarks/frame_methods.py
+++ b/asv_bench/benchmarks/frame_methods.py
@@ -293,6 +293,14 @@ def setup(self):
     def time_frame_duplicated(self):
         self.df.duplicated()
 
+class frame_duplicated_wide(object):
+    goal_time = 0.2
+
+    def setup(self):
+        self.df = DataFrame(np.random.randn(1000, 100).astype(str)).T
+
+    def time_frame_duplicated_wide(self):
+        self.df.duplicated()  # transposed in setup; times duplicated() only
 
 class frame_fancy_lookup(object):
     goal_time = 0.2
@@ -929,4 +937,4 @@ def setup(self):
         self.s = Series((['abcdefg', np.nan] * 500000))
 
     def time_series_string_vector_slice(self):
-        self.s.str[:5]
\ No newline at end of file
+        self.s.str[:5]
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index bf3dfa227732e..ca2c7dfc36353 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -1044,6 +1044,7 @@ Performance Improvements
 - Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
 - Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` and ``SeriesGroupby.transform`` (:issue:`10820`, :issue:`11077`)
 - Performance improvements in ``DataFrame.drop_duplicates`` with integer dtypes (:issue:`10917`)
+- Performance improvements in ``DataFrame.duplicated`` with wide frames (:issue:`11180`)
 - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
 - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
 - Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
diff --git a/pandas/core/groupby.py b/pandas/core/groupby.py
index c486c414081f2..e72f7c6c6a6bf 100644
--- a/pandas/core/groupby.py
+++ b/pandas/core/groupby.py
@@ -3739,10 +3739,17 @@ def get_group_index(labels, shape, sort, xnull):
     An array of type int64 where two elements are equal if their corresponding
     labels are equal at all location.
     """
+    def _int64_cut_off(shape):
+        running = long(1)  # running product of the level sizes seen so far
+        for lev, size in enumerate(shape):
+            running *= long(size)
+            if running >= _INT64_MAX:  # including level `lev` is too much
+                return lev  # product of shape[:lev] is still < _INT64_MAX
+        return len(shape)  # product of all of shape stays below the limit
+
     def loop(labels, shape):
         # how many levels can be done without overflow:
-        pred = lambda i: not _int64_overflow_possible(shape[:i])
-        nlev = next(filter(pred, range(len(shape), 0, -1)))
+        nlev = _int64_cut_off(shape)
 
         # compute flat ids for the first `nlev` levels
         stride = np.prod(shape[1:nlev], dtype='i8')