diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst
index 3ee7031795d16..8322c8408a0e3 100644
--- a/doc/source/whatsnew/v2.0.2.rst
+++ b/doc/source/whatsnew/v2.0.2.rst
@@ -23,6 +23,7 @@ Bug fixes
 - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`)
 - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
 - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`)
+- Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`)
 - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`)
 -
 
diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py
index a29e01dd173d3..c8f8a2127083e 100644
--- a/pandas/core/methods/describe.py
+++ b/pandas/core/methods/describe.py
@@ -19,7 +19,6 @@
 
 import numpy as np
 
-from pandas._libs import lib
 from pandas._libs.tslibs import Timestamp
 from pandas._typing import (
     DtypeObj,
@@ -232,9 +231,13 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
     dtype: DtypeObj | None
     if isinstance(series.dtype, ExtensionDtype):
         if isinstance(series.dtype, ArrowDtype):
-            import pyarrow as pa
+            if series.dtype.kind == "m":
+                # GH53001: describe timedeltas with object dtype
+                dtype = None
+            else:
+                import pyarrow as pa
 
-            dtype = ArrowDtype(pa.float64())
+                dtype = ArrowDtype(pa.float64())
         else:
             dtype = Float64Dtype()
     elif series.dtype.kind in "iufb":
@@ -363,9 +366,9 @@ def select_describe_func(
         return describe_categorical_1d
     elif is_numeric_dtype(data):
         return describe_numeric_1d
-    elif lib.is_np_dtype(data.dtype, "M") or isinstance(data.dtype, DatetimeTZDtype):
+    elif data.dtype.kind == "M" or isinstance(data.dtype, DatetimeTZDtype):
         return describe_timestamp_1d
-    elif lib.is_np_dtype(data.dtype, "m"):
+    elif data.dtype.kind == "m":
         return describe_numeric_1d
     else:
         return describe_categorical_1d
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index e4115e235cd6b..ab55144aec0c8 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -2842,6 +2842,36 @@ def test_describe_numeric_data(pa_type):
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
+def test_describe_timedelta_data(pa_type):
+    # GH53001
+    data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
+    result = data.describe()
+    expected = pd.Series(
+        [9] + pd.to_timedelta([5, 2, 1, 3, 5, 7, 9], unit=pa_type.unit).tolist(),
+        dtype=object,
+        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("pa_type", tm.DATETIME_PYARROW_DTYPES)
+def test_describe_datetime_data(pa_type):
+    # GH53001
+    data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
+    result = data.describe()
+    expected = pd.Series(
+        [9]
+        + [
+            pd.Timestamp(v, tz=pa_type.tz, unit=pa_type.unit)
+            for v in [5, 1, 3, 5, 7, 9]
+        ],
+        dtype=object,
+        index=["count", "mean", "min", "25%", "50%", "75%", "max"],
+    )
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
 )