Skip to content

Commit 66d08dc

Browse files
Backport PR #35519: REF: StringArray._from_sequence, use less memory (#35770)
Co-authored-by: Terji Petersen <[email protected]>
1 parent ac8845b commit 66d08dc

File tree

6 files changed

+73
-53
lines changed

6 files changed

+73
-53
lines changed

asv_bench/benchmarks/strings.py

+15
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,21 @@
77
from .pandas_vb_common import tm
88

99

10+
class Construction:
11+
12+
params = ["str", "string"]
13+
param_names = ["dtype"]
14+
15+
def setup(self, dtype):
16+
self.data = tm.rands_array(nchars=10 ** 5, size=10)
17+
18+
def time_construction(self, dtype):
19+
Series(self.data, dtype=dtype)
20+
21+
def peakmem_construction(self, dtype):
22+
Series(self.data, dtype=dtype)
23+
24+
1025
class Methods:
1126
def setup(self):
1227
self.s = Series(tm.makeStringIndex(10 ** 5))

doc/source/whatsnew/v1.1.1.rst

+5
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,11 @@ Categorical
7575
- Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`)
7676
-
7777

78+
**Strings**
79+
80+
- Fixed memory usage issue when instantiating a large :class:`pandas.arrays.StringArray` (:issue:`35499`)
81+
82+
7883
.. ---------------------------------------------------------------------------
7984
8085
.. _whatsnew_111.contributors:

pandas/_libs/lib.pyx

+34-17
Original file line numberDiff line numberDiff line change
@@ -618,35 +618,52 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
618618

619619
@cython.wraparound(False)
620620
@cython.boundscheck(False)
621-
def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]:
622-
"""
623-
Convert all elements in an array to string.
621+
cpdef ndarray[object] ensure_string_array(
622+
arr,
623+
object na_value=np.nan,
624+
bint convert_na_value=True,
625+
bint copy=True,
626+
bint skipna=True,
627+
):
628+
"""Returns a new numpy array with object dtype and only strings and na values.
624629
625630
Parameters
626631
----------
627-
arr : ndarray
628-
The array whose elements we are casting.
629-
skipna : bool, default False
632+
arr : array-like
633+
The values to be converted to str, if needed.
634+
na_value : Any
635+
The value to use for na. For example, np.nan or pd.NA.
636+
convert_na_value : bool, default True
637+
If False, existing na values will be used unchanged in the new array.
638+
copy : bool, default True
639+
Whether to ensure that a new array is returned.
640+
skipna : bool, default True
630641
Whether or not to coerce nulls to their stringified form
631-
(e.g. NaN becomes 'nan').
642+
(e.g. if False, NaN becomes 'nan').
632643
633644
Returns
634645
-------
635646
ndarray
636-
A new array with the input array's elements casted.
647+
An array with the input array's elements cast to str or nan-like.
637648
"""
638649
cdef:
639-
object arr_i
640-
Py_ssize_t i, n = arr.size
641-
ndarray[object] result = np.empty(n, dtype=object)
642-
643-
for i in range(n):
644-
arr_i = arr[i]
650+
Py_ssize_t i = 0, n = len(arr)
645651

646-
if not (skipna and checknull(arr_i)):
647-
arr_i = str(arr_i)
652+
result = np.asarray(arr, dtype="object")
653+
if copy and result is arr:
654+
result = result.copy()
648655

649-
result[i] = arr_i
656+
for i in range(n):
657+
val = result[i]
658+
if not checknull(val):
659+
result[i] = str(val)
660+
else:
661+
if convert_na_value:
662+
val = na_value
663+
if skipna:
664+
result[i] = val
665+
else:
666+
result[i] = str(val)
650667

651668
return result
652669

pandas/core/arrays/string_.py

+6-19
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,10 @@ class StringArray(PandasArray):
178178

179179
def __init__(self, values, copy=False):
180180
values = extract_array(values)
181-
skip_validation = isinstance(values, type(self))
182181

183182
super().__init__(values, copy=copy)
184183
self._dtype = StringDtype()
185-
if not skip_validation:
184+
if not isinstance(values, type(self)):
186185
self._validate()
187186

188187
def _validate(self):
@@ -201,23 +200,11 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
201200
assert dtype == "string"
202201

203202
result = np.asarray(scalars, dtype="object")
204-
if copy and result is scalars:
205-
result = result.copy()
206-
207-
# Standardize all missing-like values to NA
208-
# TODO: it would be nice to do this in _validate / lib.is_string_array
209-
# We are already doing a scan over the values there.
210-
na_values = isna(result)
211-
has_nans = na_values.any()
212-
if has_nans and result is scalars:
213-
# force a copy now, if we haven't already
214-
result = result.copy()
215-
216-
# convert to str, then to object to avoid dtype like '<U3', then insert na_value
217-
result = np.asarray(result, dtype=str)
218-
result = np.asarray(result, dtype="object")
219-
if has_nans:
220-
result[na_values] = StringDtype.na_value
203+
204+
# convert non-na-likes to str, and nan-likes to StringDtype.na_value
205+
result = lib.ensure_string_array(
206+
result, na_value=StringDtype.na_value, copy=copy
207+
)
221208

222209
return cls(result)
223210

pandas/core/dtypes/cast.py

+4-12
Original file line numberDiff line numberDiff line change
@@ -916,7 +916,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
916916
dtype = pandas_dtype(dtype)
917917

918918
if issubclass(dtype.type, str):
919-
return lib.astype_str(arr.ravel(), skipna=skipna).reshape(arr.shape)
919+
return lib.ensure_string_array(arr.ravel(), skipna=skipna).reshape(arr.shape)
920920

921921
elif is_datetime64_dtype(arr):
922922
if is_object_dtype(dtype):
@@ -1608,19 +1608,11 @@ def construct_1d_ndarray_preserving_na(
16081608
>>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str'))
16091609
array(['1.0', '2.0', None], dtype=object)
16101610
"""
1611-
subarr = np.array(values, dtype=dtype, copy=copy)
16121611

16131612
if dtype is not None and dtype.kind == "U":
1614-
# GH-21083
1615-
# We can't just return np.array(subarr, dtype='str') since
1616-
# NumPy will convert the non-string objects into strings
1617-
# Including NA values. Se we have to go
1618-
# string -> object -> update NA, which requires an
1619-
# additional pass over the data.
1620-
na_values = isna(values)
1621-
subarr2 = subarr.astype(object)
1622-
subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
1623-
subarr = subarr2
1613+
subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
1614+
else:
1615+
subarr = np.array(values, dtype=dtype, copy=copy)
16241616

16251617
return subarr
16261618

pandas/tests/arrays/string_/test_string.py

+9-5
Original file line numberDiff line numberDiff line change
@@ -206,12 +206,16 @@ def test_constructor_raises():
206206

207207
@pytest.mark.parametrize("copy", [True, False])
208208
def test_from_sequence_no_mutate(copy):
209-
a = np.array(["a", np.nan], dtype=object)
210-
original = a.copy()
211-
result = pd.arrays.StringArray._from_sequence(a, copy=copy)
212-
expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
209+
nan_arr = np.array(["a", np.nan], dtype=object)
210+
na_arr = np.array(["a", pd.NA], dtype=object)
211+
212+
result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy)
213+
expected = pd.arrays.StringArray(na_arr)
214+
213215
tm.assert_extension_array_equal(result, expected)
214-
tm.assert_numpy_array_equal(a, original)
216+
217+
expected = nan_arr if copy else na_arr
218+
tm.assert_numpy_array_equal(nan_arr, expected)
215219

216220

217221
def test_astype_int():

0 commit comments

Comments
 (0)