Skip to content

Commit 605efc6

Browse files
authored
PERF: construct DataFrame with string array and dtype=str (#36432)
1 parent b3e2c6c commit 605efc6

File tree

3 files changed

+24
-15
lines changed

3 files changed

+24
-15
lines changed

asv_bench/benchmarks/strings.py

+12-5
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,20 @@ class Construction:
1313
param_names = ["dtype"]
1414

1515
def setup(self, dtype):
16-
self.data = tm.rands_array(nchars=10 ** 5, size=10)
16+
self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
17+
self.frame_arr = self.series_arr.reshape((50_000, 2)).copy()
1718

18-
def time_construction(self, dtype):
19-
Series(self.data, dtype=dtype)
19+
def time_series_construction(self, dtype):
20+
Series(self.series_arr, dtype=dtype)
2021

21-
def peakmem_construction(self, dtype):
22-
Series(self.data, dtype=dtype)
22+
def peakmem_series_construction(self, dtype):
23+
Series(self.series_arr, dtype=dtype)
24+
25+
def time_frame_construction(self, dtype):
26+
DataFrame(self.frame_arr, dtype=dtype)
27+
28+
def peakmem_frame_construction(self, dtype):
29+
DataFrame(self.frame_arr, dtype=dtype)
2330

2431

2532
class Methods:

doc/source/whatsnew/v1.2.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,7 @@ Deprecations
222222
Performance improvements
223223
~~~~~~~~~~~~~~~~~~~~~~~~
224224

225-
- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`)
225+
- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from an array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
226226
- Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
227227
- Performance improvements in :meth:`Series.map` when mapping from a huge dictionary (:issue:`34717`)
228228
- Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)

pandas/core/internals/construction.py

+11-9
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
from pandas.core.dtypes.cast import (
1515
construct_1d_arraylike_from_scalar,
16+
construct_1d_ndarray_preserving_na,
1617
maybe_cast_to_datetime,
1718
maybe_convert_platform,
1819
maybe_infer_to_datetimelike,
@@ -189,15 +190,16 @@ def init_ndarray(values, index, columns, dtype: Optional[DtypeObj], copy: bool):
189190
# the dtypes will be coerced to a single dtype
190191
values = _prep_ndarray(values, copy=copy)
191192

192-
if dtype is not None:
193-
if not is_dtype_equal(values.dtype, dtype):
194-
try:
195-
values = values.astype(dtype)
196-
except Exception as orig:
197-
# e.g. ValueError when trying to cast object dtype to float64
198-
raise ValueError(
199-
f"failed to cast to '{dtype}' (Exception was: {orig})"
200-
) from orig
193+
if dtype is not None and not is_dtype_equal(values.dtype, dtype):
194+
try:
195+
values = construct_1d_ndarray_preserving_na(
196+
values.ravel(), dtype=dtype, copy=False
197+
).reshape(values.shape)
198+
except Exception as orig:
199+
# e.g. ValueError when trying to cast object dtype to float64
200+
raise ValueError(
201+
f"failed to cast to '{dtype}' (Exception was: {orig})"
202+
) from orig
201203

202204
# _prep_ndarray ensures that values.ndim == 2 at this point
203205
index, columns = _get_axes(

0 commit comments

Comments
 (0)