From 49ffb8f9af344538753233be34bf8397f3170041 Mon Sep 17 00:00:00 2001 From: Nick MK Lee Date: Thu, 10 May 2018 08:04:28 -0400 Subject: [PATCH 1/3] PERF: removed coercion to int64 for arrays of ints in Categorical.from_codes --- pandas/core/arrays/categorical.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index f91782459df67..93e7aa7ca439b 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -578,7 +578,11 @@ def from_codes(cls, codes, categories, ordered=False): unordered. """ try: - codes = np.asarray(codes, np.int64) + if (type(codes) == np.ndarray + and np.issubdtype(codes.dtype, np.integer)): + codes = np.asarray(codes) + else: + codes = np.asarray(codes, np.int64) except (ValueError, TypeError): raise ValueError( "codes need to be convertible to an arrays of integers") From d83abca49b3d848c8a5f4ff0cb5e8da167058012 Mon Sep 17 00:00:00 2001 From: Nick MK Lee Date: Fri, 11 May 2018 08:23:29 -0400 Subject: [PATCH 2/3] CLN: using coerce_indexer_dtype for codes in Categorical.from_codes --- doc/source/whatsnew/v0.23.0.txt | 1 + pandas/core/arrays/categorical.py | 6 +----- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/doc/source/whatsnew/v0.23.0.txt b/doc/source/whatsnew/v0.23.0.txt index d10d51352d0e4..99b931db99c2c 100644 --- a/doc/source/whatsnew/v0.23.0.txt +++ b/doc/source/whatsnew/v0.23.0.txt @@ -1079,6 +1079,7 @@ Performance Improvements - Improved performance of :func:`Series.isin` in the case of categorical dtypes (:issue:`20003`) - Improved performance of ``getattr(Series, attr)`` when the Series has certain index types. This manifiested in slow printing of large Series with a ``DatetimeIndex`` (:issue:`19764`) - Fixed a performance regression for :func:`GroupBy.nth` and :func:`GroupBy.last` with some object columns (:issue:`19283`) +- Improved performance of :func:`pandas.core.arrays.Categorical.from_codes` (:issue:`18501`) .. _whatsnew_0230.docs: diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 93e7aa7ca439b..abcb9ae3494b5 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -578,11 +578,7 @@ def from_codes(cls, codes, categories, ordered=False): unordered. """ try: - if (type(codes) == np.ndarray - and np.issubdtype(codes.dtype, np.integer)): - codes = np.asarray(codes) - else: - codes = np.asarray(codes, np.int64) + codes = coerce_indexer_dtype(np.asarray(codes), categories) except (ValueError, TypeError): raise ValueError( "codes need to be convertible to an arrays of integers") From 75da8a3fa2f77663270390e35a45d614a52d3d45 Mon Sep 17 00:00:00 2001 From: Nick MK Lee Date: Mon, 14 May 2018 17:25:08 -0400 Subject: [PATCH 3/3] PERF: added asv for Categorical.from_codes (#18501) --- asv_bench/benchmarks/categoricals.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 0ffd5f881d626..ae1d7029217a4 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -51,6 +51,7 @@ def setup(self): self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) self.values_all_nan = [np.nan] * len(self.values) + self.values_all_int8 = np.ones(N, 'int8') def time_regular(self): pd.Categorical(self.values, self.categories) @@ -70,6 +71,9 @@ def time_with_nan(self): def time_all_nan(self): pd.Categorical(self.values_all_nan) + def time_from_codes_all_int8(self): + pd.Categorical.from_codes(self.values_all_int8, self.categories) + class ValueCounts(object):