Skip to content

Commit 322dbf4

Browse files
TomAugspurger authored and jreback committed
Sparse get dummies perf (#21997)
1 parent 8fd8d0d commit 322dbf4

File tree

3 files changed

+24
-5
lines changed

3 files changed

+24
-5
lines changed

asv_bench/benchmarks/reshape.py

+18
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,9 @@
1+
import string
12
from itertools import product
23

34
import numpy as np
45
from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long
6+
import pandas as pd
57

68
from .pandas_vb_common import setup # noqa
79

@@ -132,3 +134,19 @@ def setup(self):
132134

133135
def time_pivot_table(self):
134136
self.df.pivot_table(index='key1', columns=['key2', 'key3'])
137+
138+
139+
class GetDummies(object):
    """Benchmarks for ``pd.get_dummies`` on a single categorical Series.

    The same input is encoded through the dense (``sparse=False``) and the
    sparse (``sparse=True``) code paths so the two timings can be compared.
    """

    # Timing target (seconds) for the asv benchmark runner.
    goal_time = 0.2

    def setup(self):
        """Create the 1,000,000-element categorical Series to encode."""
        # The first 12 ASCII letters ('a'..'l') are the category labels.
        categories = list(string.ascii_letters[:12])
        # The size is written without digit-group underscores: the
        # ``1_000_000`` spelling requires Python 3.6+ and is a SyntaxError
        # on older interpreters, which would break import of this module.
        s = pd.Series(np.random.choice(categories, size=1000000),
                      dtype=pd.api.types.CategoricalDtype(categories))
        self.s = s

    def time_get_dummies_1d(self):
        """Time dense dummy encoding of the prepared Series."""
        pd.get_dummies(self.s, sparse=False)

    def time_get_dummies_1d_sparse(self):
        """Time sparse dummy encoding of the prepared Series."""
        pd.get_dummies(self.s, sparse=True)

doc/source/whatsnew/v0.24.0.txt

+1-1
Original file line number | Diff line number | Diff line change
@@ -408,7 +408,7 @@ Performance Improvements
408408
- Improved performance of :meth:`HDFStore.groups` (and dependent functions like
409409
:meth:`~HDFStore.keys`). (i.e. ``x in store`` checks are much faster)
410410
(:issue:`21372`)
411-
-
411+
- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`)
412412

413413
.. _whatsnew_0240.docs:
414414

pandas/core/reshape/reshape.py

+5-4
Original file line number | Diff line number | Diff line change
@@ -940,10 +940,11 @@ def get_empty_Frame(data, sparse):
940940
sparse_series = {}
941941
N = len(data)
942942
sp_indices = [[] for _ in range(len(dummy_cols))]
943-
for ndx, code in enumerate(codes):
944-
if code == -1:
945-
# Blank entries if not dummy_na and code == -1, #GH4446
946-
continue
943+
mask = codes != -1
944+
codes = codes[mask]
945+
n_idx = np.arange(N)[mask]
946+
947+
for ndx, code in zip(n_idx, codes):
947948
sp_indices[code].append(ndx)
948949

949950
if drop_first:

0 commit comments

Comments
 (0)