Skip to content

Commit 3c8c4c9

Browse files
authored
PERF: ensure_string_array with non-numpy input array (#37371)
1 parent: 3c695dc · commit: 3c8c4c9

File tree

3 files changed

+23
-2
lines changed

3 files changed

+23
-2
lines changed

asv_bench/benchmarks/strings.py

+17 -1
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22

33
import numpy as np
44

5-
from pandas import DataFrame, Series
5+
from pandas import Categorical, DataFrame, Series
66

77
from .pandas_vb_common import tm
88

@@ -16,6 +16,10 @@ def setup(self, dtype):
1616
self.series_arr = tm.rands_array(nchars=10, size=10 ** 5)
1717
self.frame_arr = self.series_arr.reshape((50_000, 2)).copy()
1818

19+
# GH37371. Testing construction of string series/frames from ExtensionArrays
20+
self.series_cat_arr = Categorical(self.series_arr)
21+
self.frame_cat_arr = Categorical(self.frame_arr)
22+
1923
def time_series_construction(self, dtype):
2024
Series(self.series_arr, dtype=dtype)
2125

@@ -28,6 +32,18 @@ def time_frame_construction(self, dtype):
2832
def peakmem_frame_construction(self, dtype):
2933
DataFrame(self.frame_arr, dtype=dtype)
3034

35+
def time_cat_series_construction(self, dtype):
36+
Series(self.series_cat_arr, dtype=dtype)
37+
38+
def peakmem_cat_series_construction(self, dtype):
39+
Series(self.series_cat_arr, dtype=dtype)
40+
41+
def time_cat_frame_construction(self, dtype):
42+
DataFrame(self.frame_cat_arr, dtype=dtype)
43+
44+
def peakmem_cat_frame_construction(self, dtype):
45+
DataFrame(self.frame_cat_arr, dtype=dtype)
46+
3147

3248
class Methods:
3349
def setup(self):

doc/source/whatsnew/v1.2.0.rst

+1 -1
Original file line number | Diff line number | Diff line change
@@ -334,7 +334,7 @@ Deprecations
334334
Performance improvements
335335
~~~~~~~~~~~~~~~~~~~~~~~~
336336

337-
- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`)
337+
- Performance improvements when creating DataFrame or Series with dtype ``str`` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`, :issue:`36432`, :issue:`37371`)
338338
- Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
339339
- Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
340340
- Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)

pandas/_libs/lib.pyx

+5
Original file line number | Diff line number | Diff line change
@@ -651,6 +651,11 @@ cpdef ndarray[object] ensure_string_array(
651651
cdef:
652652
Py_ssize_t i = 0, n = len(arr)
653653

654+
if hasattr(arr, "to_numpy"):
655+
arr = arr.to_numpy()
656+
elif not isinstance(arr, np.ndarray):
657+
arr = np.array(arr, dtype="object")
658+
654659
result = np.asarray(arr, dtype="object")
655660

656661
if copy and result is arr:

0 commit comments

Comments
 (0)