From 7abf9ff25e7712d9859c633eb683589c2d8b2d71 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Sat, 12 Sep 2020 23:34:37 +0100
Subject: [PATCH 1/4] PERF: StringArray construction

---
 pandas/core/arrays/string_.py | 44 +++++++++++++++++++++++------------
 1 file changed, 29 insertions(+), 15 deletions(-)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 381968f9724b6..2907cd3ae9d02 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -1,5 +1,5 @@
 import operator
-from typing import TYPE_CHECKING, Type, Union
+from typing import TYPE_CHECKING, Optional, Type, Union
 
 import numpy as np
 
@@ -122,6 +122,9 @@ class StringArray(PandasArray):
 
     copy : bool, default False
         Whether to copy the array of data.
+    convert : bool, default False
+        If true, force conversion of non-na scalars to strings.
+        If False, raises a ValueError, if a scalar is neither a string nor na.
 
     Attributes
     ----------
@@ -162,7 +165,15 @@ class StringArray(PandasArray):
     ['1', '1']
     Length: 2, dtype: string
 
-    However, instantiating StringArrays directly with non-strings will raise an error.
+    Instantiating StringArrays directly with non-strings will raise an error unless
+    ``convert=True``.
+
+    >>> pd.arrays.StringArray(['1', 1])
+    TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got list)
+    >>> pd.arrays.StringArray(['1', 1], convert=True)
+    <StringArray>
+    ['1', '1']
+    Length: 2, dtype: string
 
     For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:
 
@@ -175,22 +186,30 @@ class StringArray(PandasArray):
     # undo the PandasArray hack
     _typ = "extension"
 
-    def __init__(self, values, copy=False):
+    def __init__(self, values, copy=False, convert: bool = False):
         values = extract_array(values)
+        if not isinstance(values, type(self)):
+            if convert:
+                values = lib.ensure_string_array(
+                    values, na_value=StringDtype.na_value, copy=copy
+                )
+            else:
+                self._validate(values)
 
         super().__init__(values, copy=copy)
         self._dtype = StringDtype()
-        if not isinstance(values, type(self)):
-            self._validate()
 
-    def _validate(self):
+    def _validate(self, values: Optional[np.ndarray] = None) -> None:
         """Validate that we only store NA or strings."""
-        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
+        if values is None:
+            values = self._ndarray
+
+        if len(values) and not lib.is_string_array(values, skipna=True):
             raise ValueError("StringArray requires a sequence of strings or pandas.NA")
-        if self._ndarray.dtype != "object":
+        if values.dtype != "object":
             raise ValueError(
                 "StringArray requires a sequence of strings or pandas.NA. Got "
-                f"'{self._ndarray.dtype}' dtype instead."
+                f"'{values.dtype}' dtype instead."
             )
 
     @classmethod
@@ -200,12 +219,7 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
 
         result = np.asarray(scalars, dtype="object")
 
-        # convert non-na-likes to str, and nan-likes to StringDtype.na_value
-        result = lib.ensure_string_array(
-            result, na_value=StringDtype.na_value, copy=copy
-        )
-
-        return cls(result)
+        return cls(result, copy=copy, convert=True)
 
     @classmethod
     def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):

From f1721ac58773e94d754ffda045102edb9713c92a Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Sun, 13 Sep 2020 07:53:19 +0100
Subject: [PATCH 2/4] add issue number

---
 doc/source/whatsnew/v1.2.0.rst | 2 +-
 pandas/core/arrays/string_.py  | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index f398af6e4dd5e..278fb7274bd9b 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -219,7 +219,7 @@ Deprecations
 Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
-- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`)
+- Performance improvements when creating Series with dtype `str` or :class:`StringDtype` from array with many string elements (:issue:`36304`, :issue:`36317`, :issue:`36325`)
 - Performance improvement in :meth:`GroupBy.agg` with the ``numba`` engine (:issue:`35759`)
 - Performance improvements when creating :meth:`pd.Series.map` from a huge dictionary (:issue:`34717`)
 - Performance improvement in :meth:`GroupBy.transform` with the ``numba`` engine (:issue:`36240`)
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 2907cd3ae9d02..3b58570c1adff 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -165,11 +165,11 @@ class StringArray(PandasArray):
     ['1', '1']
     Length: 2, dtype: string
 
-    Instantiating StringArrays directly with non-strings will raise an error unless
-    ``convert=True``.
+    Instantiating StringArrays directly with non-strings arrays  will raise an error
+    unless ``convert=True``.
 
-    >>> pd.arrays.StringArray(['1', 1])
-    TypeError: Argument 'values' has incorrect type (expected numpy.ndarray, got list)
+    >>> pd.arrays.StringArray(np.array(['1', 1], dtype=object))
+    ValueError: StringArray requires a sequence of strings or pandas.NA
     >>> pd.arrays.StringArray(['1', 1], convert=True)
     <StringArray>
     ['1', '1']

From ee01e0282c59f845bd914d10b6a238c45abe1628 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Sun, 13 Sep 2020 08:42:29 +0100
Subject: [PATCH 3/4] clean doc string

---
 pandas/core/arrays/string_.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 3b58570c1adff..6f678d2225b49 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -168,7 +168,7 @@ class StringArray(PandasArray):
     Instantiating StringArrays directly with non-strings arrays  will raise an error
     unless ``convert=True``.
 
-    >>> pd.arrays.StringArray(np.array(['1', 1], dtype=object))
+    >>> pd.arrays.StringArray(np.array(['1', 1]))
     ValueError: StringArray requires a sequence of strings or pandas.NA
     >>> pd.arrays.StringArray(['1', 1], convert=True)
     <StringArray>
@@ -190,9 +190,8 @@ def __init__(self, values, copy=False, convert: bool = False):
         values = extract_array(values)
         if not isinstance(values, type(self)):
             if convert:
-                values = lib.ensure_string_array(
-                    values, na_value=StringDtype.na_value, copy=copy
-                )
+                na_val = StringDtype.na_value
+                values = lib.ensure_string_array(values, na_value=na_val, copy=copy)
             else:
                 self._validate(values)
 

From 39ea860483533965df934ce328ee7af8de85afd7 Mon Sep 17 00:00:00 2001
From: tp <contribute@tensortable.com>
Date: Wed, 16 Sep 2020 13:22:06 +0100
Subject: [PATCH 4/4] Refactor to avoid call to StringArray.__init__ &
 validation

---
 pandas/core/arrays/string_.py | 50 +++++++++++++++--------------------
 1 file changed, 21 insertions(+), 29 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 6f678d2225b49..cef35f2b1137c 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -1,5 +1,5 @@
 import operator
-from typing import TYPE_CHECKING, Optional, Type, Union
+from typing import TYPE_CHECKING, Type, Union
 
 import numpy as np
 
@@ -122,9 +122,6 @@ class StringArray(PandasArray):
 
     copy : bool, default False
         Whether to copy the array of data.
-    convert : bool, default False
-        If true, force conversion of non-na scalars to strings.
-        If False, raises a ValueError, if a scalar is neither a string nor na.
 
     Attributes
     ----------
@@ -165,15 +162,7 @@ class StringArray(PandasArray):
     ['1', '1']
     Length: 2, dtype: string
 
-    Instantiating StringArrays directly with non-strings arrays  will raise an error
-    unless ``convert=True``.
-
-    >>> pd.arrays.StringArray(np.array(['1', 1]))
-    ValueError: StringArray requires a sequence of strings or pandas.NA
-    >>> pd.arrays.StringArray(['1', 1], convert=True)
-    <StringArray>
-    ['1', '1']
-    Length: 2, dtype: string
+    However, instantiating StringArrays directly with non-strings will raise an error.
 
     For comparison methods, `StringArray` returns a :class:`pandas.BooleanArray`:
 
@@ -186,29 +175,22 @@ class StringArray(PandasArray):
     # undo the PandasArray hack
     _typ = "extension"
 
-    def __init__(self, values, copy=False, convert: bool = False):
+    def __init__(self, values, copy=False):
         values = extract_array(values)
-        if not isinstance(values, type(self)):
-            if convert:
-                na_val = StringDtype.na_value
-                values = lib.ensure_string_array(values, na_value=na_val, copy=copy)
-            else:
-                self._validate(values)
 
         super().__init__(values, copy=copy)
         self._dtype = StringDtype()
+        if not isinstance(values, type(self)):
+            self._validate()
 
-    def _validate(self, values: Optional[np.ndarray] = None) -> None:
+    def _validate(self):
         """Validate that we only store NA or strings."""
-        if values is None:
-            values = self._ndarray
-
-        if len(values) and not lib.is_string_array(values, skipna=True):
+        if len(self._ndarray) and not lib.is_string_array(self._ndarray, skipna=True):
             raise ValueError("StringArray requires a sequence of strings or pandas.NA")
-        if values.dtype != "object":
+        if self._ndarray.dtype != "object":
             raise ValueError(
                 "StringArray requires a sequence of strings or pandas.NA. Got "
-                f"'{values.dtype}' dtype instead."
+                f"'{self._ndarray.dtype}' dtype instead."
             )
 
     @classmethod
@@ -217,8 +199,18 @@ def _from_sequence(cls, scalars, dtype=None, copy=False):
             assert dtype == "string"
 
         result = np.asarray(scalars, dtype="object")
-
-        return cls(result, copy=copy, convert=True)
+        # convert non-na-likes to str, and nan-likes to StringDtype.na_value
+        result = lib.ensure_string_array(
+            result, na_value=StringDtype.na_value, copy=copy
+        )
+
+        # Creating the array manually bypasses the validation step in __init__, so it
+        # is faster. TODO: refactor so that validation can be skipped without this?
+        new_string_array = object.__new__(cls)
+        new_string_array._dtype = StringDtype()
+        new_string_array._ndarray = result
+
+        return new_string_array
 
     @classmethod
     def _from_sequence_of_strings(cls, strings, dtype=None, copy=False):