PERF: perf improvements in drop_duplicates for integer dtyped arrays

jreback · jreback · commit a00c7ea1e2b6 · 2015-08-28T08:32:39.000-04:00
diff --git a/asv_bench/benchmarks/reindex.py b/asv_bench/benchmarks/reindex.py
@@ -61,6 +61,19 @@ def time_frame_drop_duplicates(self):
         self.df.drop_duplicates(['key1', 'key2'])
 
 
+class frame_drop_duplicates_int(object):
+    # drop_duplicates() on a one-column frame of N random ints drawn from [0, K)
+    def setup(self):
+        np.random.seed(1234)  # fixed seed so the generated keys are the same on every run
+        self.N = 1000000  # number of rows
+        self.K = 10000  # exclusive upper bound of the generated keys
+        self.key1 = np.random.randint(0,self.K,size=self.N)
+        self.df = DataFrame({'key1': self.key1})
+
+    def time_frame_drop_duplicates_int(self):
+        self.df.drop_duplicates()  # result is discarded
+
+
 class frame_drop_duplicates_na(object):
     goal_time = 0.2
 
@@ -381,4 +394,4 @@ def setup(self):
         self.s2 = Series(np.tile(tm.makeStringIndex(1000).values, 10))
 
     def time_series_drop_duplicates_string(self):
-        self.s2.drop_duplicates()
+        self.s2.drop_duplicates()  # result is discarded; presumably only the call is being timed
diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
@@ -722,7 +722,7 @@ Performance Improvements
 - Added vbench benchmarks for alternative ExcelWriter engines and reading Excel files (:issue:`7171`)
 - Performance improvements in ``Categorical.value_counts`` (:issue:`10804`)
 - Performance improvements in ``SeriesGroupBy.nunique`` and ``SeriesGroupBy.value_counts`` (:issue:`10820`)
-
+- Performance improvements in ``DataFrame.drop_duplicates`` with integer dtypes (:issue:`10917`)
 - 4x improvement in ``timedelta`` string parsing (:issue:`6755`, :issue:`10426`)
 - 8x improvement in ``timedelta64`` and ``datetime64`` ops (:issue:`6755`)
 - Significantly improved performance of indexing ``MultiIndex`` with slicers (:issue:`10287`)
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
@@ -2962,7 +2962,13 @@ def duplicated(self, subset=None, keep='first'):
         from pandas.hashtable import duplicated_int64, _SIZE_HINT_LIMIT
 
         def f(vals):
-            labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
+            # integer dtype: skip factorize() and reuse the raw values as the labels.
+            # NOTE(review): raw values need not lie in [0, len(shape)); confirm multi-column callers cope
+            if com.is_integer_dtype(vals):
+                from pandas.core.nanops import unique1d
+                labels, shape = vals, unique1d(vals)
+            else:
+                labels, shape = factorize(vals, size_hint=min(len(self), _SIZE_HINT_LIMIT))
             return labels.astype('i8',copy=False), len(shape)
 
         if subset is None:
diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py
@@ -7975,6 +7975,22 @@ def test_drop_duplicates(self):
             expected = df2.drop_duplicates(['AAA', 'B'], take_last=True)
         assert_frame_equal(result, expected)
 
+        # integers (each case below dedups on a single column)
+        result = df.drop_duplicates('C')
+        expected = df.iloc[[0,2]]
+        assert_frame_equal(result, expected)
+        result = df.drop_duplicates('C',keep='last')
+        expected = df.iloc[[-2,-1]]
+        assert_frame_equal(result, expected)
+
+        df['E'] = df['C'].astype('int8')
+        result = df.drop_duplicates('E')
+        expected = df.iloc[[0,2]]
+        assert_frame_equal(result, expected)
+        result = df.drop_duplicates('E',keep='last')
+        expected = df.iloc[[-2,-1]]
+        assert_frame_equal(result, expected)
+
     def test_drop_duplicates_for_take_all(self):
         df = DataFrame({'AAA': ['foo', 'bar', 'baz', 'bar',
                                 'foo', 'bar', 'qux', 'foo'],