From 0d92d380ad38cae19252ebbfc8ecce7825d5b1c6 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 11 Apr 2023 13:37:59 -0700 Subject: [PATCH 1/3] BUG: to_numeric(errors='coerce', dtype_backend='pyarrow') with ArrowDtype --- doc/source/whatsnew/v2.0.1.rst | 1 + pandas/core/tools/numeric.py | 3 ++- pandas/tests/tools/test_to_numeric.py | 10 ++++++++++ 3 files changed, 13 insertions(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 7b4dc890da3e1..601737abe4e54 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -30,6 +30,7 @@ Bug fixes - Bug in :func:`pandas.testing.assert_series_equal` where ``check_dtype=False`` would still raise for datetime or timedelta types with different resolutions (:issue:`52449`) - Bug in :meth:`ArrowDtype.__from_arrow__` not respecting if dtype is explicitly given (:issue:`52533`) - Bug in :func:`read_csv` casting PyArrow datetimes to NumPy when ``dtype_backend="pyarrow"`` and ``parse_dates`` is set causing a performance bottleneck in the process (:issue:`52546`) +- Bug in :func:`to_numeric` with ``errors='coerce'`` and ``dtype_backend='pyarrow'`` with :class:`ArrowDtype` data (:issue:`52588`) .. --------------------------------------------------------------------------- .. _whatsnew_201.other: diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 04443f89ddf6f..418536c61c7da 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -275,7 +275,8 @@ def to_numeric( # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct # masked array if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype): - if mask is None: + if mask is None or (new_mask is not None and errors == "coerce"): + # GH 52588 mask = new_mask else: mask = mask.copy() diff --git a/pandas/tests/tools/test_to_numeric.py b/pandas/tests/tools/test_to_numeric.py index fe6794b120681..499bcae5e90f0 100644 --- a/pandas/tests/tools/test_to_numeric.py +++ b/pandas/tests/tools/test_to_numeric.py @@ -6,6 +6,7 @@ import pandas as pd from pandas import ( + ArrowDtype, DataFrame, Index, Series, @@ -942,3 +943,12 @@ def test_invalid_dtype_backend(): ) with pytest.raises(ValueError, match=msg): to_numeric(ser, dtype_backend="numpy") + + +def test_coerce_pyarrow_backend(): + # GH 52588 + pa = pytest.importorskip("pyarrow") + ser = Series(list("12x"), dtype=ArrowDtype(pa.string())) + result = to_numeric(ser, errors="coerce", dtype_backend="pyarrow") + expected = Series([1, 2, None], dtype=ArrowDtype(pa.int64())) + tm.assert_series_equal(result, expected) From e9260724b130832b9de141c79419e092235dae03 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 11 Apr 2023 13:42:50 -0700 Subject: [PATCH 2/3] Use safer check --- pandas/core/tools/numeric.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index 418536c61c7da..fb08205d20890 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -275,7 +275,9 @@ def to_numeric( # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct # masked array if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype): - if mask is None or (new_mask is not None and errors == "coerce"): + if mask is None or ( + mask is not None and new_mask is not None and new_mask.shape == mask.shape + ): # GH 52588 mask = new_mask else: From 8d853b818b3934950ff1c7de83ec6b42c8db69c0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 11 Apr 2023 13:50:17 -0700 Subject: [PATCH 3/3] Remove unnecessary check --- pandas/core/tools/numeric.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/tools/numeric.py b/pandas/core/tools/numeric.py index fb08205d20890..a06152272797f 100644 --- a/pandas/core/tools/numeric.py +++ b/pandas/core/tools/numeric.py @@ -275,9 +275,7 @@ def to_numeric( # GH33013: for IntegerArray, BooleanArray & FloatingArray need to reconstruct # masked array if (mask is not None or new_mask is not None) and not is_string_dtype(values.dtype): - if mask is None or ( - mask is not None and new_mask is not None and new_mask.shape == mask.shape - ): + if mask is None or (new_mask is not None and new_mask.shape == mask.shape): # GH 52588 mask = new_mask else: