From d63037da2076475900ec62e584b9dc6653764574 Mon Sep 17 00:00:00 2001 From: Tim Loderhose Date: Mon, 18 Jul 2022 18:38:19 +0200 Subject: [PATCH 1/7] Bypass chunking/validation logic in __from_arrow__ Instead of converting each chunk to a StringArray after casting to array and then concatenating, use pyarrow to concatenate chunks and convert to numpy. Finally, we bypass the validation logic by initializing NDArrayBacked instead of StringArray. --- pandas/core/arrays/string_.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index c9abef226770c..56a8294427e31 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -195,16 +195,14 @@ def __from_arrow__( # pyarrow.ChunkedArray chunks = array.chunks - results = [] - for arr in chunks: - # using _from_sequence to ensure None is converted to NA - str_arr = StringArray._from_sequence(np.array(arr)) - results.append(str_arr) - - if results: - return StringArray._concat_same_type(results) - else: - return StringArray(np.array([], dtype="object")) + # Bypass validation inside StringArray constructor + new_string_array = StringArray.__new__(StringArray) + NDArrayBacked.__init__( + new_string_array, + pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False), + StringDtype(storage="python"), + ) + return new_string_array class BaseStringArray(ExtensionArray): From 9bb031201c6df30e46e2d2d9ad7d595f8c0cab1d Mon Sep 17 00:00:00 2001 From: Tim Loderhose Date: Mon, 18 Jul 2022 21:19:03 +0200 Subject: [PATCH 2/7] Handle zero-chunks correctly & convert None to NA --- pandas/core/arrays/string_.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 56a8294427e31..2ca16f65bdbe7 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -195,11 +195,16 @@ def __from_arrow__( # 
pyarrow.ChunkedArray chunks = array.chunks + if len(chunks) == 0: + arr = np.array([], dtype=object) + else: + arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False) + arr = lib.convert_nans_to_NA(arr) # Bypass validation inside StringArray constructor new_string_array = StringArray.__new__(StringArray) NDArrayBacked.__init__( new_string_array, - pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False), + arr, StringDtype(storage="python"), ) return new_string_array From b538260222eec8360ec00c274eb29199ead1d989 Mon Sep 17 00:00:00 2001 From: Tim Loderhose Date: Mon, 18 Jul 2022 21:19:25 +0200 Subject: [PATCH 3/7] Add change to whatsnew --- doc/source/whatsnew/v1.5.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst index 22a5f2a08362f..5347f1a38e6fe 100644 --- a/doc/source/whatsnew/v1.5.0.rst +++ b/doc/source/whatsnew/v1.5.0.rst @@ -802,6 +802,7 @@ Performance improvements - Performance improvement in datetime arrays string formatting when one of the default strftime formats ``"%Y-%m-%d %H:%M:%S"`` or ``"%Y-%m-%d %H:%M:%S.%f"`` is used. (:issue:`44764`) - Performance improvement in :meth:`Series.to_sql` and :meth:`DataFrame.to_sql` (:class:`SQLiteTable`) when processing time arrays. (:issue:`44764`) - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47404`, :issue:`47405`) +- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`) - .. 
--------------------------------------------------------------------------- From 71760864ae99c089f78adf43c0653dc581b3fcf2 Mon Sep 17 00:00:00 2001 From: Tim Loderhose Date: Sun, 23 Oct 2022 14:32:12 +0200 Subject: [PATCH 4/7] Add to v2 whatsnew --- doc/source/whatsnew/v2.0.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 0880b8e2cac12..e2dac1fa50533 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -231,6 +231,7 @@ Performance improvements - Performance improvements to :func:`read_sas` (:issue:`47403`, :issue:`47405`, :issue:`47656`, :issue:`48502`) - Memory improvement in :meth:`RangeIndex.sort_values` (:issue:`48801`) - Performance improvement in :class:`DataFrameGroupBy` and :class:`SeriesGroupBy` when ``by`` is a categorical type and ``sort=False`` (:issue:`48976`) +- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`) .. --------------------------------------------------------------------------- .. 
_whatsnew_200.bug_fixes: From 3d1162e2c84724279ac0b6c27158335087de9aff Mon Sep 17 00:00:00 2001 From: Tim Loderhose Date: Wed, 30 Nov 2022 14:31:36 +0100 Subject: [PATCH 5/7] Add GH issue to comment about validation bypass --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 41006ce6e44b3..922665d8364c1 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -203,7 +203,7 @@ def __from_arrow__( else: arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False) arr = lib.convert_nans_to_NA(arr) - # Bypass validation inside StringArray constructor + # Bypass validation inside StringArray constructor, see GH47781 new_string_array = StringArray.__new__(StringArray) NDArrayBacked.__init__( new_string_array, From c2d131adbb1c0a2190e50ead67074e6ad105a5ee Mon Sep 17 00:00:00 2001 From: Tim Loderhose Date: Wed, 30 Nov 2022 14:33:55 +0100 Subject: [PATCH 6/7] Add # to GH issue --- pandas/core/arrays/string_.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py index 922665d8364c1..2ca1aa49fe6e3 100644 --- a/pandas/core/arrays/string_.py +++ b/pandas/core/arrays/string_.py @@ -203,7 +203,7 @@ def __from_arrow__( else: arr = pyarrow.concat_arrays(chunks).to_numpy(zero_copy_only=False) arr = lib.convert_nans_to_NA(arr) - # Bypass validation inside StringArray constructor, see GH47781 + # Bypass validation inside StringArray constructor, see GH#47781 new_string_array = StringArray.__new__(StringArray) NDArrayBacked.__init__( new_string_array, From 563e247b010888cdd768add65172f54bace1c691 Mon Sep 17 00:00:00 2001 From: Tim Loderhose Date: Fri, 24 Feb 2023 13:04:24 +0900 Subject: [PATCH 7/7] Move release note to v2.1.0 --- doc/source/whatsnew/v2.1.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v2.1.0.rst 
b/doc/source/whatsnew/v2.1.0.rst index f6a6c81bfe25d..3b24310014ff8 100644 --- a/doc/source/whatsnew/v2.1.0.rst +++ b/doc/source/whatsnew/v2.1.0.rst @@ -102,6 +102,7 @@ Performance improvements ~~~~~~~~~~~~~~~~~~~~~~~~ - Performance improvement in :meth:`DataFrame.first_valid_index` and :meth:`DataFrame.last_valid_index` for extension array dtypes (:issue:`51549`) - Performance improvement in :meth:`DataFrame.clip` and :meth:`Series.clip` (:issue:`51472`) +- Performance improvement in :func:`read_parquet` on string columns when using ``use_nullable_dtypes=True`` (:issue:`47345`) - .. ---------------------------------------------------------------------------