PERF: ensure_string_array with non-numpy input array (#37371)

topper-123 · web-flow · commit 3c8c4c90dba0 · 2020-10-26T13:37:54.000-04:00
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from pandas import DataFrame, Series
+from pandas import Categorical, DataFrame, Series
 
 from .pandas_vb_common import tm
 
@@ -16,6 +16,10 @@ def setup(self, dtype):
         self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
         self.frame_arr = self.series_arr.reshape((50_000, 2)).copy()
 
+        # GH37371. Testing construction of string series/frames from ExtensionArrays
+        self.series_cat_arr = Categorical(self.series_arr)
+        self.frame_cat_arr = Categorical(self.frame_arr)
+
     def time_series_construction(self, dtype):
         Series(self.series_arr, dtype=dtype)
 
@@ -28,6 +32,18 @@ def time_frame_construction(self, dtype):
     def peakmem_frame_construction(self, dtype):
         DataFrame(self.frame_arr, dtype=dtype)
 
+    def time_cat_series_construction(self, dtype):
+        Series(self.series_cat_arr, dtype=dtype)
+
+    def peakmem_cat_series_construction(self, dtype):
+        Series(self.series_cat_arr, dtype=dtype)
+
+    def time_cat_frame_construction(self, dtype):
+        DataFrame(self.frame_cat_arr, dtype=dtype)
+
+    def peakmem_cat_frame_construction(self, dtype):
+        DataFrame(self.frame_cat_arr, dtype=dtype)
+
 
 class Methods:
     def setup(self):
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -334,7 +334,7 @@ Deprecations
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
+- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`, :issue:`37371`)
 - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
 - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
 - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -651,6 +651,11 @@ cpdef ndarray[object] ensure_string_array(
     cdef:
         Py_ssize_t i = 0, n = len(arr)
 
+    if hasattr(arr, "to_numpy"):
+        arr = arr.to_numpy()
+    elif not isinstance(arr, np.ndarray):
+        arr = np.array(arr, dtype="object")
+
     result = np.asarray(arr, dtype="object")
 
     if copy and result is arr: