Skip to content

Commit a020c10

Browse files
committed
PERF: Improve drop_duplicates for bool columns (pandas-dev#12963)
Add whatsnew; add dtype label and reorg logic
1 parent 9ab57dc commit a020c10

File tree

3 files changed

+11
-0
lines changed

3 files changed

+11
-0
lines changed

asv_bench/benchmarks/reindex.py

+5
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,9 @@ def setup(self):
132132
self.K = 10000
133133
self.key1 = np.random.randint(0, self.K, size=self.N)
134134
self.df_int = DataFrame({'key1': self.key1})
135+
self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K,
136+
dtype=bool)
137+
for i in range(10)})
135138

136139
def time_frame_drop_dups(self):
137140
self.df.drop_duplicates(['key1', 'key2'])
@@ -154,6 +157,8 @@ def time_series_drop_dups_string(self):
154157
def time_frame_drop_dups_int(self):
155158
self.df_int.drop_duplicates()
156159

160+
def time_frame_drop_dups_bool(self):
161+
self.df_bool.drop_duplicates()
157162

158163
#----------------------------------------------------------------------
159164
# blog "pandas escaped the zoo"

doc/source/whatsnew/v0.20.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,7 @@ Performance Improvements
788788
- Improved performance of ``.rank()`` for categorical data (:issue:`15498`)
789789
- Improved performance when using ``.unstack()`` (:issue:`15503`)
790790
- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
791+
- Improved performance of ``drop_duplicates()`` on ``bool`` columns (:issue:`12963`)
791792

792793

793794
.. _whatsnew_0200.bug_fixes:

pandas/core/algorithms.py

+5
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
is_period_dtype,
2020
is_period_arraylike,
2121
is_float_dtype,
22+
is_bool_dtype,
2223
needs_i8_conversion,
2324
is_categorical,
2425
is_datetime64_dtype,
@@ -341,6 +342,10 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
341342
# numpy dtype
342343
dtype = values.dtype
343344
vals = values.view(np.int64)
345+
elif is_bool_dtype(values):
346+
dtype = bool
347+
# transform to int dtype to avoid object path
348+
vals = np.asarray(values).view('uint8')
344349
else:
345350
vals = np.asarray(values)
346351

0 commit comments

Comments
 (0)