From eb75042955106ec6e149dbde5d69d8f5eb98961a Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Sun, 19 Mar 2023 19:24:06 +0100
Subject: [PATCH 1/2] BUG: Fix regression when using Series with arrow string array

---
 pandas/_libs/lib.pyx                 | 7 +++++--
 pandas/core/arrays/string_arrow.py   | 2 ++
 pandas/tests/extension/test_arrow.py | 8 ++++++++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index a9fcf6b28953b..461e22c6ac1ef 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -752,8 +752,11 @@ cpdef ndarray[object] ensure_string_array(
             out = arr.astype(str).astype(object)
             out[arr.isna()] = na_value
             return out
-
-        arr = arr.to_numpy()
+        if hasattr(arr, "type"):
+            # pyarrow array
+            arr = np.array(arr)
+        else:
+            arr = arr.to_numpy()
     elif not util.is_array(arr):
         arr = np.array(arr, dtype="object")
 
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index d5c1f98e63f14..19d040618ec99 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -151,6 +151,8 @@ def _from_sequence(cls, scalars, dtype: Dtype | None = None, copy: bool = False)
             result = scalars._data
             result = lib.ensure_string_array(result, copy=copy, convert_na_value=False)
             return cls(pa.array(result, mask=na_values, type=pa.string()))
+        elif isinstance(scalars, (pa.Array, pa.ChunkedArray)):
+            return cls(pc.cast(scalars, pa.string()))
 
         # convert non-na-likes to str
         result = lib.ensure_string_array(scalars, copy=copy)
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 517626f8c2abb..1cdff54cd6df6 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -2353,6 +2353,14 @@ def test_concat_empty_arrow_backed_series(dtype):
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.parametrize("dtype", ["string", "string[pyarrow]"])
+def test_series_from_string_array(dtype):
+    arr = pa.array("the quick brown fox".split())
+    ser = pd.Series(arr, dtype=dtype)
+    expected = pd.Series(ArrowExtensionArray(arr), dtype=dtype)
+    tm.assert_series_equal(ser, expected)
+
+
 # _data was renamed to _pa_data
 class OldArrowExtensionArray(ArrowExtensionArray):
     def __getstate__(self):

From 91663370031c66428d57c15be064e6c77e17ce54 Mon Sep 17 00:00:00 2001
From: Patrick Hoefler
Date: Tue, 21 Mar 2023 09:43:31 +0100
Subject: [PATCH 2/2] Move

---
 pandas/_libs/lib.pyx          | 6 +-----
 pandas/core/arrays/string_.py | 3 +++
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx
index 461e22c6ac1ef..88ea61a23a426 100644
--- a/pandas/_libs/lib.pyx
+++ b/pandas/_libs/lib.pyx
@@ -752,11 +752,7 @@ cpdef ndarray[object] ensure_string_array(
             out = arr.astype(str).astype(object)
             out[arr.isna()] = na_value
             return out
-        if hasattr(arr, "type"):
-            # pyarrow array
-            arr = np.array(arr)
-        else:
-            arr = arr.to_numpy()
+        arr = arr.to_numpy()
     elif not util.is_array(arr):
         arr = np.array(arr, dtype="object")
 
diff --git a/pandas/core/arrays/string_.py b/pandas/core/arrays/string_.py
index 89f44de3386c7..4bd95da2b6b07 100644
--- a/pandas/core/arrays/string_.py
+++ b/pandas/core/arrays/string_.py
@@ -352,6 +352,9 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal
             result[na_values] = libmissing.NA
 
         else:
+            if hasattr(scalars, "type"):
+                # pyarrow array
+                scalars = np.array(scalars)
             # convert non-na-likes to str, and nan-likes to StringDtype().na_value
             result = lib.ensure_string_array(scalars, na_value=libmissing.NA, copy=copy)
 