From df8e4d6498ee63fef64e4d7dd704b65f78e21e12 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Sun, 2 Aug 2020 20:03:39 +0100
Subject: [PATCH 1/9] REF: StringArray._from_sequence

---
 doc/source/whatsnew/v1.1.1.rst             |  5 +++++
 pandas/_libs/lib.pyx                       | 14 ++++++++++++++
 pandas/core/arrays/string_.py              | 21 ++++-----------------
 pandas/tests/arrays/string_/test_string.py | 14 +++++++++-----
 4 files changed, 32 insertions(+), 22 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.1.rst b/doc/source/whatsnew/v1.1.1.rst
index 565b4a014bd0c..f1a2f3be5dca3 100644
--- a/doc/source/whatsnew/v1.1.1.rst
+++ b/doc/source/whatsnew/v1.1.1.rst
@@ -74,6 +74,11 @@ Categorical
 - Bug in :class:`DataFrame` constructor failing to raise ``ValueError`` in some cases when data and index have mismatched lengths (:issue:`33437`)
 -
 
+**Strings**
+
+- Fix memory usage issue when instantiating large :class:`pandas.arrays.StringArray` (:issue:`35499`)
+
+
 .. ---------------------------------------------------------------------------
 
 .. _whatsnew_111.contributors:
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 5fa91ffee8ea8..d052e3eaeff03 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1698,6 +1698,20 @@ cpdef bint is_string_array(ndarray values, bint skipna=False):
     return validator.validate(values)
 
 
+cpdef ndarray ensure_string_array(ndarray values, object na_value):
+    cdef:
+        Py_ssize_t i = 0, n = len(values)
+
+    for i in range(n):
+        val = values[i]
+        if not checknull(val):
+            values[i] = str(val)
+        else:
+            values[i] = na_value
+
+    return values
+
+
 cdef class BytesValidator(Validator):
     cdef inline bint is_value_typed(self, object value) except -1:
         return isinstance(value, bytes)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index bb55c3cdea45c..e7a89a3174ac0 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -177,11 +177,10 @@ class StringArray(PandasArray):
 
     def __init__(self, values, copy=False):
         values = extract_array(values)
-        skip_validation = isinstance(values, type(self))
 
         super().__init__(values, copy=copy)
         self._dtype = StringDtype()
-        if not skip_validation:
+        if not isinstance(values, type(self)):
             self._validate()
 
     def _validate(self):
@@ -195,7 +194,7 @@ def _validate(self):
             )
 
     @classmethod
-    def _from_sequence(cls, scalars, dtype=None, copy=False):
+    def _from_sequence(cls, scalars, dtype=None, copy=True):
         if dtype:
             assert dtype == "string"
 
@@ -203,20 +202,8 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
         if copy and result is scalars:
             result = result.copy()
 
-        # Standardize all missing-like values to NA
-        # TODO: it would be nice to do this in _validate / lib.is_string_array
-        # We are already doing a scan over the values there.
-        na_values = isna(result)
-        has_nans = na_values.any()
-        if has_nans and result is scalars:
-            # force a copy now, if we haven't already
-            result = result.copy()
-
-        # convert to str, then to object to avoid dtype like '<U3', then insert na_value
-        result = np.asarray(result, dtype=str)
-        result = np.asarray(result, dtype="object")
-        if has_nans:
-            result[na_values] = StringDtype.na_value
+        # convert non-na-likes to str, and nan-likes to StringDtype.na_value
+        result = lib.ensure_string_array(result, StringDtype.na_value)
 
         return cls(result)
 
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
index 6f9a1a5be4c43..efd5d29ae0717 100644
--- a/pandas/tests/arrays/string_/test_string.py
+++ b/pandas/tests/arrays/string_/test_string.py
@@ -206,12 +206,16 @@ def test_constructor_raises():
 
 @pytest.mark.parametrize("copy", [True, False])
 def test_from_sequence_no_mutate(copy):
-    a = np.array(["a", np.nan], dtype=object)
-    original = a.copy()
-    result = pd.arrays.StringArray._from_sequence(a, copy=copy)
-    expected = pd.arrays.StringArray(np.array(["a", pd.NA], dtype=object))
+    nan_arr = np.array(["a", np.nan], dtype=object)
+    na_arr = np.array(["a", pd.NA], dtype=object)
+
+    result = pd.arrays.StringArray._from_sequence(nan_arr, copy=copy)
+    expected = pd.arrays.StringArray(na_arr)
+
     tm.assert_extension_array_equal(result, expected)
-    tm.assert_numpy_array_equal(a, original)
+
+    expected = nan_arr if copy else na_arr
+    tm.assert_numpy_array_equal(nan_arr, expected)
 
 
 def test_astype_int():

From ac7ee274a3d1cb4c6c3d2bb58bb458c7296dec0d Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Mon, 3 Aug 2020 19:07:31 +0100
Subject: [PATCH 2/9] Use ensure_string_array also in
 construct_1d_ndarray_preserving_na

---
 pandas/_libs/lib.pyx          | 44 ++++++++++++++++++++++++++++-------
 pandas/core/arrays/string_.py |  5 ++--
 pandas/core/dtypes/cast.py    | 14 +++--------
 3 files changed, 41 insertions(+), 22 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index d052e3eaeff03..191340b6f9294 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1698,18 +1698,46 @@ cpdef bint is_string_array(ndarray values, bint skipna=False):
     return validator.validate(values)
 
 
-cpdef ndarray ensure_string_array(ndarray values, object na_value):
+cpdef ndarray ensure_string_array(
+        values, object na_value=np.nan, bint convert_na_value=True, bint copy=True):
+    """Returns a new numpy array with object dtype and only strings and na values.
+    
+    Parameters
+    ---------
+    values : array-like
+        The values to be converted to str, if needed
+    na_value : Any
+        The value to use for na. For example, np.nan or pd.NAN
+    convert_na_value : bool, default True
+        If False, existing na values will be used unchanged in the new array
+    copy : bool, default True
+        Whether to wnsure that a new array is returned
+
+    Returns
+    -------
+    ndarray    
+    """
     cdef:
         Py_ssize_t i = 0, n = len(values)
 
-    for i in range(n):
-        val = values[i]
-        if not checknull(val):
-            values[i] = str(val)
-        else:
-            values[i] = na_value
+    result = np.asarray(values, dtype="object")
+    if copy and result is values:
+        result = result.copy()
 
-    return values
+    if convert_na_value:
+        for i in range(n):
+            val = result[i]
+            if not checknull(val):
+                result[i] = str(val)
+            else:
+                result[i] = na_value
+    else:
+        for i in range(n):
+            val = result[i]
+            if not checknull(val):
+                result[i] = str(val)
+
+    return result
 
 
 cdef class BytesValidator(Validator):
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index e7a89a3174ac0..68d09556d97d6 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -199,11 +199,10 @@ def _from_sequence(cls, scalars, dtype=None, copy=True):
             assert dtype == "string"
 
         result = np.asarray(scalars, dtype="object")
-        if copy and result is scalars:
-            result = result.copy()
 
         # convert non-na-likes to str, and nan-likes to StringDtype.na_value
-        result = lib.ensure_string_array(result, StringDtype.na_value)
+        result = lib.ensure_string_array(
+            result, na_value=StringDtype.na_value, copy=copy)
 
         return cls(result)
 
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 228329898b6a4..5fcad6009504b 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1608,19 +1608,11 @@ def construct_1d_ndarray_preserving_na(
     >>> construct_1d_ndarray_preserving_na([1.0, 2.0, None], dtype=np.dtype('str'))
     array(['1.0', '2.0', None], dtype=object)
     """
-    subarr = np.array(values, dtype=dtype, copy=copy)
 
     if dtype is not None and dtype.kind == "U":
-        # GH-21083
-        # We can't just return np.array(subarr, dtype='str') since
-        # NumPy will convert the non-string objects into strings
-        # Including NA values. Se we have to go
-        # string -> object -> update NA, which requires an
-        # additional pass over the data.
-        na_values = isna(values)
-        subarr2 = subarr.astype(object)
-        subarr2[na_values] = np.asarray(values, dtype=object)[na_values]
-        subarr = subarr2
+        subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
+    else:
+        subarr = np.array(values, dtype=dtype, copy=copy)
 
     return subarr
 

From 887736ab5da9b336bce25b7d0a4e5811475aa8d2 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Mon, 3 Aug 2020 19:31:19 +0100
Subject: [PATCH 3/9] fix linting

---
 pandas/_libs/lib.pyx          | 6 +++---
 pandas/core/arrays/string_.py | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 191340b6f9294..8b42328ba8e8c 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1701,9 +1701,9 @@ cpdef bint is_string_array(ndarray values, bint skipna=False):
 cpdef ndarray ensure_string_array(
         values, object na_value=np.nan, bint convert_na_value=True, bint copy=True):
     """Returns a new numpy array with object dtype and only strings and na values.
-    
+
     Parameters
-    ---------
+    ----------
     values : array-like
         The values to be converted to str, if needed
     na_value : Any
@@ -1715,7 +1715,7 @@ cpdef ndarray ensure_string_array(
 
     Returns
     -------
-    ndarray    
+    ndarray
     """
     cdef:
         Py_ssize_t i = 0, n = len(values)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 68d09556d97d6..738571551de15 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -202,7 +202,8 @@ def _from_sequence(cls, scalars, dtype=None, copy=True):
 
         # convert non-na-likes to str, and nan-likes to StringDtype.na_value
         result = lib.ensure_string_array(
-            result, na_value=StringDtype.na_value, copy=copy)
+            result, na_value=StringDtype.na_value, copy=copy
+        )
 
         return cls(result)
 

From 61f3bd393e01f88f212ff6f74ece3e8a831c11bb Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Mon, 3 Aug 2020 20:01:33 +0100
Subject: [PATCH 4/9] fix copy param

---
 pandas/core/arrays/string_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 738571551de15..381968f9724b6 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -194,7 +194,7 @@ def _validate(self):
             )
 
     @classmethod
-    def _from_sequence(cls, scalars, dtype=None, copy=True):
+    def _from_sequence(cls, scalars, dtype=None, copy=False):
         if dtype:
             assert dtype == "string"
 

From c6afa1e5ab3cb5d26436573106ce76a8ac67ccf1 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Mon, 3 Aug 2020 21:48:25 +0100
Subject: [PATCH 5/9] fix comments

---
 pandas/_libs/lib.pyx | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 8b42328ba8e8c..5132ce234c9a7 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -1705,13 +1705,13 @@ cpdef ndarray ensure_string_array(
     Parameters
     ----------
     values : array-like
-        The values to be converted to str, if needed
+        The values to be converted to str, if needed.
     na_value : Any
-        The value to use for na. For example, np.nan or pd.NAN
+        The value to use for na. For example, np.nan or pd.NA.
     convert_na_value : bool, default True
-        If False, existing na values will be used unchanged in the new array
+        If False, existing na values will be used unchanged in the new array.
     copy : bool, default True
-        Whether to wnsure that a new array is returned
+        Whether to ensure that a new array is returned.
 
     Returns
     -------

From ce18bb9f15905c01c4b1574862ca70beef839784 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Tue, 4 Aug 2020 08:08:22 +0100
Subject: [PATCH 6/9] delete _libs.lib.astype_str

---
 pandas/_libs/lib.pyx       | 93 ++++++++++++++------------------------
 pandas/core/dtypes/cast.py |  3 +-
 2 files changed, 36 insertions(+), 60 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 5132ce234c9a7..796f4357c397a 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -618,35 +618,52 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
 
 @cython.wraparound(False)
 @cython.boundscheck(False)
-def astype_str(arr: ndarray, skipna: bool=False) -> ndarray[object]:
-    """
-    Convert all elements in an array to string.
+cpdef ndarray[object] ensure_string_array(
+        ndarray[object] arr,
+        object na_value=np.nan,
+        bint convert_na_value=True,
+        bint copy=True,
+        bint skipna=True,
+):
+    """Returns a new numpy array with object dtype and only strings and na values.
 
     Parameters
     ----------
-    arr : ndarray
-        The array whose elements we are casting.
-    skipna : bool, default False
+    arr : array-like
+        The values to be converted to str, if needed.
+    na_value : Any
+        The value to use for na. For example, np.nan or pd.NA.
+    convert_na_value : bool, default True
+        If False, existing na values will be used unchanged in the new array.
+    copy : bool, default True
+        Whether to ensure that a new array is returned.
+    skipna : bool, default True
         Whether or not to coerce nulls to their stringified form
-        (e.g. NaN becomes 'nan').
+        (e.g. if False, NaN becomes 'nan').
 
     Returns
     -------
     ndarray
-        A new array with the input array's elements casted.
+        An array with the input array's elements cast to str or nan-like.
     """
     cdef:
-        object arr_i
-        Py_ssize_t i, n = arr.size
-        ndarray[object] result = np.empty(n, dtype=object)
-
-    for i in range(n):
-        arr_i = arr[i]
+        Py_ssize_t i = 0, n = len(arr)
 
-        if not (skipna and checknull(arr_i)):
-            arr_i = str(arr_i)
+    result = np.asarray(arr, dtype="object")
+    if copy and result is arr:
+        result = result.copy()
 
-        result[i] = arr_i
+    for i in range(n):
+        val = result[i]
+        if not checknull(val):
+            result[i] = str(val)
+        else:
+            if convert_na_value:
+                val = na_value
+            if skipna:
+                result[i] = val
+            else:
+                result[i] = str(val)
 
     return result
 
@@ -1698,48 +1715,6 @@ cpdef bint is_string_array(ndarray values, bint skipna=False):
     return validator.validate(values)
 
 
-cpdef ndarray ensure_string_array(
-        values, object na_value=np.nan, bint convert_na_value=True, bint copy=True):
-    """Returns a new numpy array with object dtype and only strings and na values.
-
-    Parameters
-    ----------
-    values : array-like
-        The values to be converted to str, if needed.
-    na_value : Any
-        The value to use for na. For example, np.nan or pd.NA.
-    convert_na_value : bool, default True
-        If False, existing na values will be used unchanged in the new array.
-    copy : bool, default True
-        Whether to ensure that a new array is returned.
-
-    Returns
-    -------
-    ndarray
-    """
-    cdef:
-        Py_ssize_t i = 0, n = len(values)
-
-    result = np.asarray(values, dtype="object")
-    if copy and result is values:
-        result = result.copy()
-
-    if convert_na_value:
-        for i in range(n):
-            val = result[i]
-            if not checknull(val):
-                result[i] = str(val)
-            else:
-                result[i] = na_value
-    else:
-        for i in range(n):
-            val = result[i]
-            if not checknull(val):
-                result[i] = str(val)
-
-    return result
-
-
 cdef class BytesValidator(Validator):
     cdef inline bint is_value_typed(self, object value) except -1:
         return isinstance(value, bytes)
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 5fcad6009504b..6c7c14740f6bd 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -916,7 +916,7 @@ def astype_nansafe(arr, dtype, copy: bool = True, skipna: bool = False):
         dtype = pandas_dtype(dtype)
 
     if issubclass(dtype.type, str):
-        return lib.astype_str(arr.ravel(), skipna=skipna).reshape(arr.shape)
+        return lib.ensure_string_array(arr.ravel(), skipna=skipna).reshape(arr.shape)
 
     elif is_datetime64_dtype(arr):
         if is_object_dtype(dtype):
@@ -1610,6 +1610,7 @@ def construct_1d_ndarray_preserving_na(
     """
 
     if dtype is not None and dtype.kind == "U":
+        values = np.asarray(values, dtype="object")
         subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
     else:
         subarr = np.array(values, dtype=dtype, copy=copy)

From 9ef03559f4709224af7f1b11955fe9459f78f605 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Tue, 4 Aug 2020 08:33:47 +0100
Subject: [PATCH 7/9] correct input parameter type

---
 pandas/_libs/lib.pyx | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 796f4357c397a..52c694435e05a 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -619,7 +619,7 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
 @cython.wraparound(False)
 @cython.boundscheck(False)
 cpdef ndarray[object] ensure_string_array(
-        ndarray[object] arr,
+        ndarray arr,
         object na_value=np.nan,
         bint convert_na_value=True,
         bint copy=True,

From 3db2884c9878e3f7e1361a0b664a0b6d497db421 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Tue, 4 Aug 2020 23:23:45 +0100
Subject: [PATCH 8/9] Add ASVs

---
 asv_bench/benchmarks/array.py   |  2 ++
 asv_bench/benchmarks/strings.py | 15 +++++++++++++++
 pandas/_libs/lib.pyx            |  2 +-
 pandas/core/dtypes/cast.py      |  1 -
 4 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index 103df0fd94847..19e54e5ebcaa3 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -2,6 +2,8 @@
 
 import pandas as pd
 
+from .pandas_vb_common import tm
+
 
 class BooleanArray:
     def setup(self):
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index d7fb2775376c0..61e47fb388662 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -7,6 +7,21 @@
 from .pandas_vb_common import tm
 
 
+class Construction:
+
+    params = ["str", "string"]
+    param_names = ["dtype"]
+
+    def setup(self):
+        self.data = tm.rands_array(nchars=10 ** 5, size=10)
+
+    def time_construction(self, dtype):
+        Series(self.data, dtype=dtype)
+
+    def peakmem_construction(self, dtype):
+        Series(self.data, dtype=dtype)
+
+
 class Methods:
     def setup(self):
         self.s = Series(tm.makeStringIndex(10 ** 5))
diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 52c694435e05a..eadfcefaac73d 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -619,7 +619,7 @@ def astype_intsafe(ndarray[object] arr, new_dtype):
 @cython.wraparound(False)
 @cython.boundscheck(False)
 cpdef ndarray[object] ensure_string_array(
-        ndarray arr,
+        arr,
         object na_value=np.nan,
         bint convert_na_value=True,
         bint copy=True,
diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py
index 6c7c14740f6bd..2697f42eb05a4 100644
--- a/pandas/core/dtypes/cast.py
+++ b/pandas/core/dtypes/cast.py
@@ -1610,7 +1610,6 @@ def construct_1d_ndarray_preserving_na(
     """
 
     if dtype is not None and dtype.kind == "U":
-        values = np.asarray(values, dtype="object")
         subarr = lib.ensure_string_array(values, convert_na_value=False, copy=copy)
     else:
         subarr = np.array(values, dtype=dtype, copy=copy)

From 47b5d69748eaaa5d58750926250c0fcc3455fdee Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Tue, 4 Aug 2020 23:45:43 +0100
Subject: [PATCH 9/9] cleanups

---
 asv_bench/benchmarks/array.py   | 2 --
 asv_bench/benchmarks/strings.py | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/asv_bench/benchmarks/array.py b/asv_bench/benchmarks/array.py
index 19e54e5ebcaa3..103df0fd94847 100644
--- a/asv_bench/benchmarks/array.py
+++ b/asv_bench/benchmarks/array.py
@@ -2,8 +2,6 @@
 
 import pandas as pd
 
-from .pandas_vb_common import tm
-
 
 class BooleanArray:
     def setup(self):
diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py
index 61e47fb388662..2023858181baa 100644
--- a/asv_bench/benchmarks/strings.py
+++ b/asv_bench/benchmarks/strings.py
@@ -12,7 +12,7 @@ class Construction:
     params = ["str", "string"]
     param_names = ["dtype"]
 
-    def setup(self):
+    def setup(self, dtype):
         self.data = tm.rands_array(nchars=10 ** 5, size=10)
 
     def time_construction(self, dtype):