From bc658b035d4664e366f364adf26357fb3a555738 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Jul 2018 10:36:18 -0500 Subject: [PATCH 1/4] PERF --- asv_bench/benchmarks/reshape.py | 19 +++++++++++++++++++ pandas/core/reshape/reshape.py | 9 +++++---- 2 files changed, 24 insertions(+), 4 deletions(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 9044b080c45f9..b8d2cce198069 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -1,7 +1,9 @@ +import string from itertools import product import numpy as np from pandas import DataFrame, MultiIndex, date_range, melt, wide_to_long +import pandas as pd from .pandas_vb_common import setup # noqa @@ -132,3 +134,20 @@ def setup(self): def time_pivot_table(self): self.df.pivot_table(index='key1', columns=['key2', 'key3']) + + + +class GetDummies(object): + goal_time = 0.2 + + def setup(self): + categories = list(string.ascii_letters[:12]) + s = pd.Series(np.random.choice(categories, size=1000000), + dtype=pd.api.types.CategoricalDtype(categories)) + self.s = s + + def time_get_dummies_1d(self): + pd.get_dummies(self.s, sparse=False) + + def time_get_dummies_1d_sparse(self): + pd.get_dummies(self.s, sparse=True) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index d5d2e594b8d6b..b63a938112522 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -940,10 +940,11 @@ def get_empty_Frame(data, sparse): sparse_series = {} N = len(data) sp_indices = [[] for _ in range(len(dummy_cols))] - for ndx, code in enumerate(codes): - if code == -1: - # Blank entries if not dummy_na and code == -1, #GH4446 - continue + mask = codes != -1 + codes = codes[mask] + n_idx = np.arange(N)[mask] + + for ndx, code in zip(n_idx, codes): sp_indices[code].append(ndx) if drop_first: From 8fda4fb9f63f30d4640cb5a134fce1ce88f06b53 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Jul 2018 10:45:20 -0500
Subject: [PATCH 2/4] Whatsnew --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 9e3f7ec73f852..1d8aad9bdda5c 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -343,7 +343,7 @@ Performance Improvements - Improved performance of :meth:`HDFStore.groups` (and dependent functions like :meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster) (:issue:`21372`) -- +- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:``) .. _whatsnew_0240.docs: From 234e9b2fc18ecbdacf8b6608c46083c29dcfcf8b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Jul 2018 10:47:50 -0500 Subject: [PATCH 3/4] Issue number --- doc/source/whatsnew/v0.24.0.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index 1d8aad9bdda5c..fc51ff2df001a 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -343,7 +343,7 @@ Performance Improvements - Improved performance of :meth:`HDFStore.groups` (and dependent functions like :meth:`~HDFStore.keys`. (i.e. ``x in store`` checks are much faster) (:issue:`21372`) -- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:``) +- Improved the performance of :func:`pandas.get_dummies` with ``sparse=True`` (:issue:`21997`) .. 
_whatsnew_0240.docs: From be8b9eb64c1def032ae570f817824e294e747ecf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Fri, 20 Jul 2018 10:48:23 -0500 Subject: [PATCH 4/4] lint --- asv_bench/benchmarks/reshape.py | 1 - 1 file changed, 1 deletion(-) diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index b8d2cce198069..07634811370c7 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -136,7 +136,6 @@ def time_pivot_table(self): self.df.pivot_table(index='key1', columns=['key2', 'key3']) - class GetDummies(object): goal_time = 0.2