From bfaa437701cf0c85e60e7c5c749576fd8d82ff6f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 5 Apr 2023 15:59:36 -0700 Subject: [PATCH 1/2] BUG: describe not returning ArrowDtype --- doc/source/whatsnew/v2.0.1.rst | 2 +- pandas/core/methods/describe.py | 8 +++++++- pandas/tests/extension/test_arrow.py | 12 ++++++++++++ 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/doc/source/whatsnew/v2.0.1.rst b/doc/source/whatsnew/v2.0.1.rst index 0122c84ba2a8e..bab7f02d8eed0 100644 --- a/doc/source/whatsnew/v2.0.1.rst +++ b/doc/source/whatsnew/v2.0.1.rst @@ -20,7 +20,7 @@ Fixed regressions Bug fixes ~~~~~~~~~ -- +- Bug in :meth:`Series.describe` not returning :class:`ArrowDtype` with ``pyarrow.float64`` type with numeric data (:issue:`52427`) .. --------------------------------------------------------------------------- .. _whatsnew_201.other: diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py index e9f1eaabbe246..45cf038ebc19e 100644 --- a/pandas/core/methods/describe.py +++ b/pandas/core/methods/describe.py @@ -36,6 +36,7 @@ is_timedelta64_dtype, ) +from pandas.core.arrays.arrow.dtype import ArrowDtype from pandas.core.arrays.floating import Float64Dtype from pandas.core.reshape.concat import concat @@ -229,7 +230,12 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series: # GH#48340 - always return float on non-complex numeric data dtype: DtypeObj | None if is_extension_array_dtype(series.dtype): - dtype = Float64Dtype() + if isinstance(series.dtype, ArrowDtype): + import pyarrow as pa + + dtype = ArrowDtype(pa.float64()) + else: + dtype = Float64Dtype() elif is_numeric_dtype(series.dtype) and not is_complex_dtype(series.dtype): dtype = np.dtype("float") else: diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index df470d85a4fad..41502e507db89 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2387,3 +2387,15 @@ def test_setitem_boolean_replace_with_mask_segfault(): expected = arr.copy() arr[np.zeros((N,), dtype=np.bool_)] = False assert arr._pa_array == expected._pa_array + + +@pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES + tm.FLOAT_PYARROW_DTYPES) +def test_describe_numeric_data(pa_type): + data = pd.Series([1, 2, 3], dtype=ArrowDtype(pa_type)) + result = data.describe() + expected = pd.Series( + [3, 2, 1, 1, 1.5, 2.0, 2.5, 3], + dtype=ArrowDtype(pa.float64()), + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) From 760549a60f3afc823a00bdfdcb3a450ccfce8e8b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 5 Apr 2023 16:43:43 -0700 Subject: [PATCH 2/2] Add GH issue --- pandas/tests/extension/test_arrow.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 41502e507db89..31d4c76a8db11 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -2391,6 +2391,7 @@ def test_setitem_boolean_replace_with_mask_segfault(): @pytest.mark.parametrize("pa_type", tm.ALL_INT_PYARROW_DTYPES + tm.FLOAT_PYARROW_DTYPES) def test_describe_numeric_data(pa_type): + # GH 52470 data = pd.Series([1, 2, 3], dtype=ArrowDtype(pa_type)) result = data.describe() expected = pd.Series(