From d19ff87fb4a9c58134b67d08a1c9db33cf1532c5 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 11 Apr 2023 12:04:23 +0200 Subject: [PATCH 1/4] BUG: pd.array raising with NumPy array and large dtype --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/arrays/arrow/array.py | 5 +++++ pandas/tests/extension/test_arrow.py | 14 ++++++++++++++ 3 files changed, 20 insertions(+) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index a4a58811e382f..8e4fdfaca177d 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -28,6 +28,7 @@ Bug fixes - Fixed segfault in :meth:`Series.to_numpy` with ``null[pyarrow]`` dtype (:issue:`52443`) - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`) - Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) +- Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) .. --------------------------------------------------------------------------- .. _whatsnew_201.other: diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 82fc826b81a51..456263c969840 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -241,6 +241,11 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal Construct a new ExtensionArray from a sequence of scalars. """ pa_dtype = to_pyarrow_type(dtype) + if isinstance(scalars, np.ndarray) and ( + pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype) + ): + scalars = scalars.tolist() + if isinstance(scalars, cls): scalars = scalars._pa_array elif not isinstance(scalars, (pa.Array, pa.ChunkedArray)): diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 2b606c2c5e711..e4dce05310b5b 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2423,6 +2423,20 @@ def test_setitem_boolean_replace_with_mask_segfault(): assert arr._pa_array == expected._pa_array +@pytest.mark.parametrize( + "data, arrow_dtype", + [ + ([b"a", b"b"], pa.large_binary()), + (["a", "b"], pa.large_string()), + ], +) +def test_conversion_large_dtypes_from_numpy_array(data, arrow_dtype): + dtype = ArrowDtype(arrow_dtype) + result = pd.array(np.array(data), dtype=dtype) + expected = pd.array(data, dtype=dtype) + tm.assert_extension_array_equal(result, expected) + + @pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES + tm.FLOAT_PYARROW_DTYPES) def test_describe_numeric_data(pa_type): # GH 52470 From abcfb92f32e85d471d65ba484efe545eecca7ff0 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler <61934744+phofl@users.noreply.github.com> Date: Tue, 11 Apr 2023 12:40:58 +0200 Subject: [PATCH 2/4] Fix --- pandas/core/arrays/arrow/array.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 456263c969840..af4b9bd29ab61 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -241,8 +241,12 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal Construct a new ExtensionArray from a sequence of scalars. """ pa_dtype = to_pyarrow_type(dtype) - if isinstance(scalars, np.ndarray) and ( - pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype) + if ( + isinstance(scalars, np.ndarray) + and isinstance(dtype, ArrowDtype) + and ( + pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype) + ) ): scalars = scalars.tolist() From eb30082a77f45488b07c759df39b845defe50d37 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Sun, 23 Apr 2023 11:54:15 +0200 Subject: [PATCH 3/4] Add gh ref --- pandas/core/arrays/arrow/array.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 586283059fa11..fb54cf1f0a3e0 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -252,6 +252,7 @@ def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy: bool = Fal pa.types.is_large_binary(pa_dtype) or pa.types.is_large_string(pa_dtype) ) ): + # See https://github.com/apache/arrow/issues/35289 scalars = scalars.tolist() if isinstance(scalars, cls): From 30a6917fa37395676aa78bdfcda4a1c0b5469483 Mon Sep 17 00:00:00 2001 From: Patrick Hoefler Date: Tue, 25 Apr 2023 22:05:15 +0200 Subject: [PATCH 4/4] Move --- doc/source/whatsnew/v2.0.1.rst | 1 - doc/source/whatsnew/v2.0.2.rst | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 24c380c71bceb..2613d12e43400 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -43,7 +43,6 @@ Bug fixes - Bug in :meth:`DataFrame.max` and related casting different :class:`Timestamp` resolutions always to nanoseconds (:issue:`52524`) - Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) - Bug in :meth:`Series.dt.tz_localize` incorrectly localizing timestamps with :class:`ArrowDtype` (:issue:`52677`) -- Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) - Bug in arithmetic between ``np.datetime64`` and ``np.timedelta64`` ``NaT`` scalars with units always returning nanosecond resolution (:issue:`52295`) - Bug in logical and comparison operations between :class:`ArrowDtype` and numpy masked types (e.g. ``"boolean"``) (:issue:`52625`) - Fixed bug in :func:`merge` when merging with ``ArrowDtype`` one one and a NumPy dtype on the other side (:issue:`52406`) diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst index 09932a2d2d571..f6b0b4086cb39 100644 --- a/doc/source/whatsnew/v2.0.2.rst +++ b/doc/source/whatsnew/v2.0.2.rst @@ -22,6 +22,7 @@ Bug fixes ~~~~~~~~~ - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`) - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`) +- Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`) - .. ---------------------------------------------------------------------------