Skip to content

Commit 62e05f5

Browse files
authored
PERF: StringArray from np.str_ array (pandas-dev#49109)
* perf: string array from np.str_
* add test
* whatsnew
* cleanup
1 parent 2e7f5a3 commit 62e05f5

File tree

4 files changed

+31
-0
lines changed

4 files changed

+31
-0
lines changed

asv_bench/benchmarks/array.py

+18
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,24 @@ def time_from_integer_array(self):
4444
pd.array(self.values_integer, dtype="Int64")
4545

4646

47+
class StringArray:
48+
def setup(self):
49+
N = 100_000
50+
values = tm.rands_array(3, N)
51+
self.values_obj = np.array(values, dtype="object")
52+
self.values_str = np.array(values, dtype="U")
53+
self.values_list = values.tolist()
54+
55+
def time_from_np_object_array(self):
56+
pd.array(self.values_obj, dtype="string")
57+
58+
def time_from_np_str_array(self):
59+
pd.array(self.values_str, dtype="string")
60+
61+
def time_from_list(self):
62+
pd.array(self.values_list, dtype="string")
63+
64+
4765
class ArrowStringArray:
4866

4967
params = [False, True]

doc/source/whatsnew/v2.0.0.rst

+1
Original file line numberDiff line numberDiff line change
@@ -156,6 +156,7 @@ Performance improvements
156156
- Performance improvement in :func:`merge` and :meth:`DataFrame.join` when joining on a sorted :class:`MultiIndex` (:issue:`48504`)
157157
- Performance improvement in :meth:`DataFrame.loc` and :meth:`Series.loc` for tuple-based indexing of a :class:`MultiIndex` (:issue:`48384`)
158158
- Performance improvement for :meth:`MultiIndex.unique` (:issue:`48335`)
159+
- Performance improvement for the :class:`~arrays.StringArray` constructor when passing a numpy array with type ``np.str_`` (:issue:`49109`)
159160
- Performance improvement for :func:`concat` with extension array backed indexes (:issue:`49128`)
160161
- Performance improvement in :meth:`DataFrame.join` when joining on a subset of a :class:`MultiIndex` (:issue:`48611`)
161162
- Performance improvement for :meth:`MultiIndex.intersection` (:issue:`48604`)

pandas/_libs/lib.pyx

+4
Original file line numberDiff line numberDiff line change
@@ -703,6 +703,10 @@ cpdef ndarray[object] ensure_string_array(
703703
if copy and result is arr:
704704
result = result.copy()
705705

706+
if issubclass(arr.dtype.type, np.str_):
707+
# short-circuit, all elements are str
708+
return result
709+
706710
for i in range(n):
707711
val = arr[i]
708712

pandas/tests/arrays/string_/test_string.py

+8
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,14 @@ def test_setitem_scalar_with_mask_validation(dtype):
597597
ser[mask] = 1
598598

599599

600+
def test_from_numpy_str(dtype):
601+
vals = ["a", "b", "c"]
602+
arr = np.array(vals, dtype=np.str_)
603+
result = pd.array(arr, dtype=dtype)
604+
expected = pd.array(vals, dtype=dtype)
605+
tm.assert_extension_array_equal(result, expected)
606+
607+
600608
def test_tolist(dtype):
601609
vals = ["a", "b", "c"]
602610
arr = pd.array(vals, dtype=dtype)

0 commit comments

Comments
 (0)