Skip to content

Commit 9334282

Browse files
committed
PERF: Improve drop_duplicates for bool columns (#12963)
Add whatsnew
1 parent 9ab57dc commit 9334282

File tree

3 files changed

+9
-0
lines changed

3 files changed

+9
-0
lines changed

asv_bench/benchmarks/reindex.py

+5
Original file line number | Diff line number | Diff line change
@@ -132,6 +132,9 @@ def setup(self):
132132
self.K = 10000
133133
self.key1 = np.random.randint(0, self.K, size=self.N)
134134
self.df_int = DataFrame({'key1': self.key1})
135+
self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K,
136+
dtype=bool)
137+
for i in range(10)})
135138

136139
def time_frame_drop_dups(self):
137140
self.df.drop_duplicates(['key1', 'key2'])
@@ -154,6 +157,8 @@ def time_series_drop_dups_string(self):
154157
def time_frame_drop_dups_int(self):
155158
self.df_int.drop_duplicates()
156159

160+
def time_frame_drop_dups_bool(self):
161+
self.df_bool.drop_duplicates()
157162

158163
#----------------------------------------------------------------------
159164
# blog "pandas escaped the zoo"

doc/source/whatsnew/v0.20.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -788,6 +788,7 @@ Performance Improvements
788788
- Improved performance of ``.rank()`` for categorical data (:issue:`15498`)
789789
- Improved performance when using ``.unstack()`` (:issue:`15503`)
790790
- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
791+
- Improved performance of ``drop_duplicates()`` on ``bool`` columns (:issue:`12963`)
791792

792793

793794
.. _whatsnew_0200.bug_fixes:

pandas/core/algorithms.py

+3
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
is_period_dtype,
2020
is_period_arraylike,
2121
is_float_dtype,
22+
is_bool_dtype,
2223
needs_i8_conversion,
2324
is_categorical,
2425
is_datetime64_dtype,
@@ -343,6 +344,8 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
343344
vals = values.view(np.int64)
344345
else:
345346
vals = np.asarray(values)
347+
if is_bool_dtype(vals):
348+
vals = vals.astype(int)
346349

347350
(hash_klass, vec_klass), vals = _get_data_algo(vals, _hashtables)
348351

0 commit comments

Comments
 (0)