From d63037da2076475900ec62e584b9dc6653764574 Mon Sep 17 00:00:00 2001
From: Tim Loderhose <tim.loderhose@mlprograms.com>
Date: Mon, 18 Jul 2022 18:38:19 +0200
Subject: [PATCH 1/7] Bypass chunking/validation logic in __from_arrow__

Rather than casting each chunk to a numpy array, converting it to a
StringArray, and then concatenating the results, use pyarrow to
concatenate the chunks and convert to numpy once.

Finally, we bypass the validation logic by initializing
NDArrayBacked directly instead of calling the StringArray constructor.
---
 pandas/core/arrays/string_.py | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index c9abef226770c..56a8294427e31 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -195,16 +195,14 @@ def __from_arrow__(
                 # pyarrow.ChunkedArray
                 chunks = array.chunks
 
-            results = []
-            for arr in chunks:
-                # using _from_sequence to ensure None is converted to NA
-                str_arr = StringArray._from_sequence(np.array(arr))
-                results.append(str_arr)
-
-        if results:
-            return StringArray._concat_same_type(results)
-        else:
-            return StringArray(np.array([], dtype="object"))
+        # Bypass validation inside StringArray constructor
+        new_string_array = StringArray.__new__(StringArray)
+        NDArrayBacked.__init__(
+            new_string_array,
+            pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False),
+            StringDtype(storage="python"),
+        )
+        return new_string_array
 
 
 class BaseStringArray(ExtensionArray):

From 9bb031201c6df30e46e2d2d9ad7d595f8c0cab1d Mon Sep 17 00:00:00 2001
From: Tim Loderhose <tim.loderhose@mlprograms.com>
Date: Mon, 18 Jul 2022 21:19:03 +0200
Subject: [PATCH 2/7] Handle zero-chunks correctly & convert None to NA

---
 pandas/core/arrays/string_.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 56a8294427e31..2ca16f65bdbe7 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -195,11 +195,16 @@ def __from_arrow__(
                 # pyarrow.ChunkedArray
                 chunks = array.chunks
 
+        if len(chunks) == 0:
+            arr = np.array([], dtype=object)
+        else:
+            arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False)
+            arr = lib.convert_nans_to_NA(arr)
         # Bypass validation inside StringArray constructor
         new_string_array = StringArray.__new__(StringArray)
         NDArrayBacked.__init__(
             new_string_array,
-            pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False),
+            arr,
             StringDtype(storage="python"),
         )
         return new_string_array

From b538260222eec8360ec00c274eb29199ead1d989 Mon Sep 17 00:00:00 2001
From: Tim Loderhose <tim.loderhose@mlprograms.com>
Date: Mon, 18 Jul 2022 21:19:25 +0200
Subject: [PATCH 3/7] Add change to whatsnew

---
 doc/source/whatsnew/v1.5.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
index 22a5f2a08362f..5347f1a38e6fe 100644
--- a/doc/source/whatsnew/v1.5.0.rst
+++ b/doc/source/whatsnew/v1.5.0.rst
@@ -802,6 +802,7 @@ Performance improvements
 - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`)
 - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`)
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`)
+- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
 -
 
 .. ---------------------------------------------------------------------------

From 71760864ae99c089f78adf43c0653dc581b3fcf2 Mon Sep 17 00:00:00 2001
From: Tim Loderhose <tim@loderhose.com>
Date: Sun, 23 Oct 2022 14:32:12 +0200
Subject: [PATCH 4/7] Add to v2 whatsnew

---
 doc/source/whatsnew/v2.0.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 0880b8e2cac12..e2dac1fa50533 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -231,6 +231,7 @@ Performance improvements
 - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`)
 - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`)
 - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`)
+- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
 
 .. ---------------------------------------------------------------------------
 .. _whatsnew_200.bug_fixes:

From 3d1162e2c84724279ac0b6c27158335087de9aff Mon Sep 17 00:00:00 2001
From: Tim Loderhose <tim@loderhose.com>
Date: Wed, 30 Nov 2022 14:31:36 +0100
Subject: [PATCH 5/7] Add GH issue to comment about validation bypass

---
 pandas/core/arrays/string_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 41006ce6e44b3..922665d8364c1 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -203,7 +203,7 @@ def __from_arrow__(
         else:
             arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False)
             arr = lib.convert_nans_to_NA(arr)
-        # Bypass validation inside StringArray constructor
+        # Bypass validation inside StringArray constructor, see GH47781
         new_string_array = StringArray.__new__(StringArray)
         NDArrayBacked.__init__(
             new_string_array,

From c2d131adbb1c0a2190e50ead67074e6ad105a5ee Mon Sep 17 00:00:00 2001
From: Tim Loderhose <tim@loderhose.com>
Date: Wed, 30 Nov 2022 14:33:55 +0100
Subject: [PATCH 6/7] Add # to GH issue

---
 pandas/core/arrays/string_.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 922665d8364c1..2ca1aa49fe6e3 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -203,7 +203,7 @@ def __from_arrow__(
         else:
             arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False)
             arr = lib.convert_nans_to_NA(arr)
-        # Bypass validation inside StringArray constructor, see GH47781
+        # Bypass validation inside StringArray constructor, see GH#47781
         new_string_array = StringArray.__new__(StringArray)
         NDArrayBacked.__init__(
             new_string_array,

From 563e247b010888cdd768add65172f54bace1c691 Mon Sep 17 00:00:00 2001
From: Tim Loderhose <tim@loderhose.com>
Date: Fri, 24 Feb 2023 13:04:24 +0900
Subject: [PATCH 7/7] Move release note to v2.1.0

---
 doc/source/whatsnew/v2.1.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v2.1.0.rst b/doc/source/whatsnew/v2.1.0.rst
index f6a6c81bfe25d..3b24310014ff8 100644
--- a/doc/source/whatsnew/v2.1.0.rst
+++ b/doc/source/whatsnew/v2.1.0.rst
@@ -102,6 +102,7 @@ Performance improvements
 ~~~~~~~~~~~~~~~~~~~~~~~~
 - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`)
 - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`)
+- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`)
 -
 
 .. ---------------------------------------------------------------------------