Backport PR #35519: REF: StringArray._from_sequence, use less memory (#35770)

meeseeksmachine · topper-123 · web-flow · commit 66d08dc74470 · 2020-08-17T16:32:35.000+01:00
Co-authored-by: Terji Petersen &lt;contribute@tensortable.com&gt;
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
@@ -7,6 +7,21 @@
 from .pandas_vb_common import tm
 
 
+class Construction:
+
+    params = ["str", "string"]
+    param_names = ["dtype"]
+
+    def setup(self, dtype):
+        self.data = tm.rands_array(nchars=10 ** 5, size=10)
+
+    def time_construction(self, dtype):
+        Series(self.data, dtype=dtype)
+
+    def peakmem_construction(self, dtype):
+        Series(self.data, dtype=dtype)
+
+
 class Methods:
     def setup(self):
         self.s = Series(tm.makeStringIndex(10 ** 5))
diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
@@ -75,6 +75,11 @@ Categorical
 - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`)
 -
 
+**Strings**
+
+- fix memory usage issue when instantiating large  :class:`pandas.arrays.StringArray` (:issue:`35499`)
+
+
 .. ---------------------------------------------------------------------------
 
 .. _whatsnew_111.contributors:
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
@@ -618,35 +618,52 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]:
-    """
-    Convert all elements in an array to string.
+cpdef ndarray[object] ensure_string_array(
+        arr,
+        object na_value=np.nan,
+        bint convert_na_value=True,
+        bint copy=True,
+        bint skipna=True,
+):
+    """Returns a new numpy array with object dtype and only strings and na values.
 
     Parameters
     ----------
-    arr : ndarray
-        The array whose elements we are casting.
-    skipna : bool, default False
+    arr : array-like
+        The values to be converted to str, if needed.
+    na_value : Any
+        The value to use for na. For example, np.nan or pd.NA.
+    convert_na_value : bool, default True
+        If False, existing na values will be used unchanged in the new array.
+    copy : bool, default True
+        Whether to ensure that a new array is returned.
+    skipna : bool, default True
         Whether or not to coerce nulls to their stringified form
-        (e.g. NaN becomes 'nan').
+        (e.g. if False, NaN becomes 'nan').
 
     Returns
     -------
     ndarray
-        A new array with the input array's elements casted.
+        An array with the input array's elements casted to str or nan-like.
     """
     cdef:
-        object arr_i
-        Py_ssize_t i, n = arr.size
-        ndarray[object] result = np.empty(n, dtype=object)
-
-    for i in range(n):
-        arr_i = arr[i]
+        Py_ssize_t i = 0, n = len(arr)
 
-        if not (skipna and checknull(arr_i)):
-            arr_i = str(arr_i)
+    result = np.asarray(arr, dtype="object")
+    if copy and result is arr:
+        result = result.copy()
 
-        result[i] = arr_i
+    for i in range(n):
+        val = result[i]
+        if not checknull(val):
+            result[i] = str(val)
+        else:
+            if convert_na_value:
+                val = na_value
+            if skipna:
+                result[i] = val
+            else:
+                result[i] = str(val)
 
     return result
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
@@ -178,11 +178,10 @@ class StringArray(PandasArray):
 
     def __init__(self, values, copy=False):
         values = extract_array(values)
-        skip_validation = isinstance(values, type(self))
 
         super().__init__(values, copy=copy)
         self._dtype = StringDtype()
-        if not skip_validation:
+        if not isinstance(values, type(self)):
             self._validate()
 
     def _validate(self):
@@ -201,23 +200,11 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
             assert dtype == "string"
 
         result = np.asarray(scalars, dtype="object")
-        if copy and result is scalars:
-            result = result.copy()
-
-        # Standardize all missing-like values to NA
-        # TODO: it would be nice to do this in _validate / lib.is_string_array
-        # We are already doing a scan over the values there.
-        na_values = isna(result)
-        has_nans = na_values.any()
-        if has_nans and result is scalars:
-            # force a copy now, if we haven't already
-            result = result.copy()
-
-        # convert to str, then to object to avoid dtype like '<U3', then insert na_value
-        result = np.asarray(result, dtype=str)
-        result = np.asarray(result, dtype="object")
-        if has_nans:
-            result[na_values] = StringDtype.na_value
+
+        # convert non-na-likes to str, and nan-likes to StringDtype.na_value
+        result = lib.ensure_string_array(
+            result, na_value=StringDtype.na_value, copy=copy
+        )
 
         return cls(result)
 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
@@ -916,7 +916,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
         dtype = pandas_dtype(dtype)
 
     if issubclass(dtype.type, str):
-        return lib.astype_str(arr.ravel(), skipna=skipna).reshape(arr.shape)
+        return lib.ensure_string_array(arr.ravel(), skipna=skipna).reshape(arr.shape)
 
     elif is_datetime64_dtype(arr):
         if is_object_dtype(dtype):
@@ -1608,19 +1608,11 @@ def construct_1d_ndarray_preserving_na(
     >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str'))
     array(['1.0', '2.0', None], dtype=object)
     """
-    subarr = np.array(values, dtype=dtype, copy=copy)
 
     if dtype is not None and dtype.kind == "U":
-        # GH-21083
-        # We can't just return np.array(subarr, dtype='str') since
-        # NumPy will convert the non-string objects into strings
-        # Including NA values. Se we have to go
-        # string -> object -> update NA, which requires an
-        # additional pass over the data.
-        na_values = isna(values)
-        subarr2 = subarr.astype(object)
-        subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
-        subarr = subarr2
+        subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
+    else:
+        subarr = np.array(values, dtype=dtype, copy=copy)
 
     return subarr
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -206,12 +206,16 @@ def test_constructor_raises():
 
 @pytest.mark.parametrize("copy", [True, False])
 def test_from_sequence_no_mutate(copy):
-    a = np.array(["a", np.nan], dtype=object)
-    original = a.copy()
-    result = pd.arrays.StringArray._from_sequence(a, copy=copy)
-    expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
+    nan_arr = np.array(["a", np.nan], dtype=object)
+    na_arr = np.array(["a", pd.NA], dtype=object)
+
+    result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy)
+    expected = pd.arrays.StringArray(na_arr)
+
     tm.assert_extension_array_equal(result, expected)
-    tm.assert_numpy_array_equal(a, original)
+
+    expected = nan_arr if copy else na_arr
+    tm.assert_numpy_array_equal(nan_arr, expected)
 
 
 def test_astype_int():