class StringArray:
    """Benchmark cases for ``pd.array(..., dtype="string")``.

    The same values are prepared in three containers (object ndarray,
    ``"U"`` ndarray and plain list) so each input path is timed on its own.
    """

    def setup(self):
        """Build the three input containers from one ``tm.rands_array`` result."""
        n_values = 100_000
        raw = tm.rands_array(3, n_values)
        # All three attributes hold the same values; only the container differs.
        self.values_list = raw.tolist()
        self.values_str = np.array(raw, dtype="U")
        self.values_obj = np.array(raw, dtype="object")

    def time_from_np_object_array(self):
        """Construct from the object-dtype ndarray."""
        source = self.values_obj
        pd.array(source, dtype="string")

    def time_from_np_str_array(self):
        """Construct from the ``"U"``-dtype ndarray."""
        source = self.values_str
        pd.array(source, dtype="string")

    def time_from_list(self):
        """Construct from the Python list."""
        source = self.values_list
        pd.array(source, dtype="string")
ndarray[object] ensure_string_array( if copy and result is arr: result = result.copy() + if issubclass(arr.dtype.type, np.str_): + # short-circuit, all elements are str + return result + for i in range(n): val = arr[i] diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py index b562fe5c1f26c..e321e8da15a6e 100644 --- a/pandas/tests/arrays/string_/test_string.py +++ b/pandas/tests/arrays/string_/test_string.py @@ -597,6 +597,14 @@ def test_setitem_scalar_with_mask_validation(dtype): ser[mask] = 1 +def test_from_numpy_str(dtype): + vals = ["a", "b", "c"] + arr = np.array(vals, dtype=np.str_) + result = pd.array(arr, dtype=dtype) + expected = pd.array(vals, dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + def test_tolist(dtype): vals = ["a", "b", "c"] arr = pd.array(vals, dtype=dtype)