PERF: construct DataFrame with string array and dtype=str (#36432)

topper-123 · web-flow · commit 605efc6532b8 · 2020-09-19T15:54:16.000-04:00
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
@@ -13,13 +13,20 @@ class Construction:
     param_names = ["dtype"]
 
     def setup(self, dtype):
-        self.data = tm.rands_array(nchars=10 ** 5, size=10)
+        self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
+        self.frame_arr = self.series_arr.reshape((50_000, 2)).copy()
 
-    def time_construction(self, dtype):
-        Series(self.data, dtype=dtype)
+    def time_series_construction(self, dtype):
+        Series(self.series_arr, dtype=dtype)
 
-    def peakmem_construction(self, dtype):
-        Series(self.data, dtype=dtype)
+    def peakmem_series_construction(self, dtype):
+        Series(self.series_arr, dtype=dtype)
+
+    def time_frame_construction(self, dtype):
+        DataFrame(self.frame_arr, dtype=dtype)
+
+    def peakmem_frame_construction(self, dtype):
+        DataFrame(self.frame_arr, dtype=dtype)
 
 
 class Methods:
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
@@ -222,7 +222,7 @@ Deprecations
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`)
+- Performance improvements when creating :class:`DataFrame` or :class:`Series` with dtype ``str`` or :class:`StringDtype` from an array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
 - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
 - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
 - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py
@@ -13,6 +13,7 @@
 
 from pandas.core.dtypes.cast import (
     construct_1d_arraylike_from_scalar,
+    construct_1d_ndarray_preserving_na,
     maybe_cast_to_datetime,
     maybe_convert_platform,
     maybe_infer_to_datetimelike,
@@ -189,15 +190,16 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
     # the dtypes will be coerced to a single dtype
     values = _prep_ndarray(values, copy=copy)
 
-    if dtype is not None:
-        if not is_dtype_equal(values.dtype, dtype):
-            try:
-                values = values.astype(dtype)
-            except Exception as orig:
-                # e.g. ValueError when trying to cast object dtype to float64
-                raise ValueError(
-                    f"failed to cast to '{dtype}' (Exception was: {orig})"
-                ) from orig
+    if dtype is not None and not is_dtype_equal(values.dtype, dtype):
+        try:
+            values = construct_1d_ndarray_preserving_na(
+                values.ravel(), dtype=dtype, copy=False
+            ).reshape(values.shape)
+        except Exception as orig:
+            # e.g. ValueError when trying to cast object dtype to float64
+            raise ValueError(
+                f"failed to cast to '{dtype}' (Exception was: {orig})"
+            ) from orig
 
     # _prep_ndarray ensures that values.ndim == 2 at this point
     index, columns = _get_axes(