Skip to content

Commit f2e942e

Browse files
mroeschke authored and jreback committed
PERF: Improve drop_duplicates for bool columns (pandas-dev#12963)
closes pandas-dev#12963

Author: Matt Roeschke <[email protected]>

Closes pandas-dev#15738 from mroeschke/fix_12963 and squashes the following commits:

a020c10 [Matt Roeschke] PERF: Improve drop_duplicates for bool columns (pandas-dev#12963)
1 parent bff47f2 commit f2e942e

File tree

3 files changed

+12
-1
lines changed

3 files changed

+12
-1
lines changed

asv_bench/benchmarks/reindex.py

+5
Original file line number | Diff line number | Diff line change
@@ -132,6 +132,9 @@ def setup(self):
132132
self.K = 10000
133133
self.key1 = np.random.randint(0, self.K, size=self.N)
134134
self.df_int = DataFrame({'key1': self.key1})
135+
self.df_bool = DataFrame({i: np.random.randint(0, 2, size=self.K,
136+
dtype=bool)
137+
for i in range(10)})
135138

136139
def time_frame_drop_dups(self):
137140
self.df.drop_duplicates(['key1', 'key2'])
@@ -154,6 +157,8 @@ def time_series_drop_dups_string(self):
154157
def time_frame_drop_dups_int(self):
155158
self.df_int.drop_duplicates()
156159

160+
def time_frame_drop_dups_bool(self):
161+
self.df_bool.drop_duplicates()
157162

158163
#----------------------------------------------------------------------
159164
# blog "pandas escaped the zoo"

doc/source/whatsnew/v0.20.0.txt

+1
Original file line number | Diff line number | Diff line change
@@ -789,6 +789,7 @@ Performance Improvements
789789
- Improved performance of ``.rank()`` for categorical data (:issue:`15498`)
790790
- Improved performance when using ``.unstack()`` (:issue:`15503`)
791791
- Improved performance of merge/join on ``category`` columns (:issue:`10409`)
792+
- Improved performance of ``drop_duplicates()`` on ``bool`` columns (:issue:`12963`)
792793

793794

794795
.. _whatsnew_0200.bug_fixes:

pandas/core/algorithms.py

+6 -1
Original file line number | Diff line number | Diff line change
@@ -19,6 +19,7 @@
1919
is_period_dtype,
2020
is_period_arraylike,
2121
is_float_dtype,
22+
is_bool_dtype,
2223
needs_i8_conversion,
2324
is_categorical,
2425
is_datetime64_dtype,
@@ -325,8 +326,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
325326
"""
326327
from pandas import Index, Series, DatetimeIndex, PeriodIndex
327328

328-
# handling two possibilities here
329+
# handling possibilities here
329330
# - for a numpy datetimelike simply view as i8 then cast back
331+
# - bool handled as uint8 then cast back
330332
# - for an extension datetimelike view as i8 then
331333
# reconstruct from boxed values to transfer metadata
332334
dtype = None
@@ -341,6 +343,9 @@ def factorize(values, sort=False, order=None, na_sentinel=-1, size_hint=None):
341343
# numpy dtype
342344
dtype = values.dtype
343345
vals = values.view(np.int64)
346+
elif is_bool_dtype(values):
347+
dtype = bool
348+
vals = np.asarray(values).view('uint8')
344349
else:
345350
vals = np.asarray(values)
346351

0 commit comments

Comments
 (0)