From 1cd53fa2b0dc8033ea5ca1ef68bd81047d81340d Mon Sep 17 00:00:00 2001 From: tp Date: Sun, 25 Oct 2020 11:22:41 +0000 Subject: [PATCH 1/6] REGR/PERF: Index.is_ --- pandas/core/indexes/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 006469f79780d..24caf6ee49b4a 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -545,7 +545,7 @@ def is_(self, other) -> bool: return True elif not hasattr(other, "_id"): return False - elif com.any_none(self._id, other._id): + elif self._id is None or other._id is None: return False else: return self._id is other._id From a0c1ec0d277b091e093bbf1b0eff9b2d4f942b8b Mon Sep 17 00:00:00 2001 From: tp Date: Fri, 23 Oct 2020 22:05:22 +0100 Subject: [PATCH 2/6] PERF: ensure_string_array with non-numpy input array --- pandas/_libs/lib.pyx | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 001fbae120ae8..0250e5454eeab 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -651,6 +651,7 @@ cpdef ndarray[object] ensure_string_array( cdef: Py_ssize_t i = 0, n = len(arr) + arr = np.asarray(arr) # PERF: need a numpy array to ensure fast access result = np.asarray(arr, dtype="object") if copy and result is arr: From 7b4928c82f8a310615ff1147fd99a1186aa8802a Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 24 Oct 2020 00:53:16 +0100 Subject: [PATCH 3/6] fix conversion of nan to string --- pandas/_libs/lib.pyx | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 0250e5454eeab..6fcf2bc92511d 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -651,27 +651,29 @@ cpdef ndarray[object] ensure_string_array( cdef: Py_ssize_t i = 0, n = len(arr) - arr = np.asarray(arr) # PERF: need a numpy array to ensure fast access result = np.asarray(arr, dtype="object") if copy and result is arr: result = result.copy() + arr = np.asarray(arr) # PERF: need a numpy array to ensure fast access + for i in range(n): - val = arr[i] + arr_val = arr[i] + res_val = result[i] - if isinstance(val, str): + if not checknull(res_val) and isinstance(arr_val, str): continue - if not checknull(val): - result[i] = str(val) + if not checknull(res_val): + result[i] = str(arr_val) else: if convert_na_value: - val = na_value + arr_val = na_value if skipna: - result[i] = val + result[i] = arr_val else: - result[i] = str(val) + result[i] = str(arr_val) return result From f28792d0fb4e9f8ee5bfff44b273c9711a22b04c Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 24 Oct 2020 08:41:07 +0100 Subject: [PATCH 4/6] fix conversion --- pandas/_libs/lib.pyx | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 6fcf2bc92511d..597cad6046a0b 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -651,29 +651,33 @@ cpdef ndarray[object] ensure_string_array( cdef: Py_ssize_t i = 0, n = len(arr) + from pandas.core.dtypes.common import is_extension_array_dtype + + if is_extension_array_dtype(arr): + arr = arr.to_numpy() + elif not isinstance(arr, np.ndarray): + arr = np.array(arr, dtype=object) + result = np.asarray(arr, dtype="object") if copy and result is arr: result = result.copy() - arr = np.asarray(arr) # PERF: need a numpy array to ensure fast access - for i in range(n): - arr_val = arr[i] - res_val = result[i] + val = arr[i] - if not checknull(res_val) and isinstance(arr_val, str): + if isinstance(val, str): continue - if not checknull(res_val): - result[i] = str(arr_val) + if not checknull(val): + result[i] = str(val) else: if convert_na_value: - arr_val = na_value + val = na_value if skipna: - result[i] = arr_val + result[i] = val else: - result[i] = str(arr_val) + result[i] = str(val) return result From 3262d8fe878394ae4fee70f65bfe0b14681c3d7c Mon Sep 17 00:00:00 2001 From: tp Date: Sat, 24 Oct 2020 15:23:16 +0100 Subject: [PATCH 5/6] add whatsnew, ASVs --- asv_bench/benchmarks/strings.py | 18 +++++++++++++++++- doc/source/whatsnew/v1.2.0.rst | 2 +- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d8b35abb94b9d..7c75ad031e7cd 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -2,7 +2,7 @@ import numpy as np -from pandas import DataFrame, Series +from pandas import Categorical, DataFrame, Series from .pandas_vb_common import tm @@ -16,6 +16,10 @@ def setup(self, dtype): self.series_arr = tm.rands_array(nchars=10, size=10 ** 5) self.frame_arr = self.series_arr.reshape((50_000, 2)).copy() + # GH37371. Testing construction of string series/frames from ExtensionArrays + self.series_cat_arr = Categorical(self.series_arr) + self.frame_cat_arr = Categorical(self.frame_arr) + def time_series_construction(self, dtype): Series(self.series_arr, dtype=dtype) @@ -28,6 +32,18 @@ def time_frame_construction(self, dtype): def peakmem_frame_construction(self, dtype): DataFrame(self.frame_arr, dtype=dtype) + def time_cat_series_construction(self, dtype): + Series(self.series_cat_arr, dtype=dtype) + + def peakmem_cat_series_construction(self, dtype): + Series(self.series_cat_arr, dtype=dtype) + + def time_cat_frame_construction(self, dtype): + DataFrame(self.frame_cat_arr, dtype=dtype) + + def peakmem_cat_frame_construction(self, dtype): + DataFrame(self.frame_cat_arr, dtype=dtype) + class Methods: def setup(self): diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 05996efb6d332..9b320182d7968 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -334,7 +334,7 @@ Deprecations Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ -- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`) +- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`, :issue:`37371`) - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`) - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`) - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`) From d9f8e6e044c4dca4019bca5b287262c96f71e901 Mon Sep 17 00:00:00 2001 From: tp Date: Mon, 26 Oct 2020 12:57:52 +0000 Subject: [PATCH 6/6] is_extension_dtype -> hasattr --- pandas/_libs/lib.pyx | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 597cad6046a0b..2cb4df7e054fe 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -651,12 +651,10 @@ cpdef ndarray[object] ensure_string_array( cdef: Py_ssize_t i = 0, n = len(arr) - from pandas.core.dtypes.common import is_extension_array_dtype - - if is_extension_array_dtype(arr): + if hasattr(arr, "to_numpy"): arr = arr.to_numpy() elif not isinstance(arr, np.ndarray): - arr = np.array(arr, dtype=object) + arr = np.array(arr, dtype="object") result = np.asarray(arr, dtype="object")