BUG: Series.describe treating pyarrow timestamps/timedeltas as categorical (pandas-dev#53001)

lukemanley · Yi Wei · commit 88b639674133 · 2023-05-19T14:06:33.000+02:00
* Series.describe treating pyarrow timestamps and timedeltas as categorical

* Add GitHub issue references (GH53001) to the code comment and tests

* cleanup
diff --git a/doc/source/whatsnew/v2.0.2.rst b/doc/source/whatsnew/v2.0.2.rst
@@ -23,6 +23,7 @@ Bug fixes
 - Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`)
 - Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
 - Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`)
+- Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`)
 - Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`)
 -
 
diff --git a/pandas/core/methods/describe.py b/pandas/core/methods/describe.py
@@ -19,7 +19,6 @@
 
 import numpy as np
 
-from pandas._libs import lib
 from pandas._libs.tslibs import Timestamp
 from pandas._typing import (
     DtypeObj,
@@ -232,9 +231,13 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
     dtype: DtypeObj | None
     if isinstance(series.dtype, ExtensionDtype):
         if isinstance(series.dtype, ArrowDtype):
-            import pyarrow as pa
+            if series.dtype.kind == "m":
+                # GH53001: describe timedeltas with object dtype
+                dtype = None
+            else:
+                import pyarrow as pa
 
-            dtype = ArrowDtype(pa.float64())
+                dtype = ArrowDtype(pa.float64())
         else:
             dtype = Float64Dtype()
     elif series.dtype.kind in "iufb":
@@ -363,9 +366,9 @@ def select_describe_func(
         return describe_categorical_1d
     elif is_numeric_dtype(data):
         return describe_numeric_1d
-    elif lib.is_np_dtype(data.dtype, "M") or isinstance(data.dtype, DatetimeTZDtype):
+    elif data.dtype.kind == "M" or isinstance(data.dtype, DatetimeTZDtype):
         return describe_timestamp_1d
-    elif lib.is_np_dtype(data.dtype, "m"):
+    elif data.dtype.kind == "m":
         return describe_numeric_1d
     else:
         return describe_categorical_1d
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
@@ -2842,6 +2842,36 @@ def test_describe_numeric_data(pa_type):
     tm.assert_series_equal(result, expected)
 
 
+@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
+def test_describe_timedelta_data(pa_type):
+    # GH53001
+    data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
+    result = data.describe()
+    expected = pd.Series(
+        [9] + pd.to_timedelta([5, 2, 1, 3, 5, 7, 9], unit=pa_type.unit).tolist(),
+        dtype=object,
+        index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
+    )
+    tm.assert_series_equal(result, expected)
+
+
+@pytest.mark.parametrize("pa_type", tm.DATETIME_PYARROW_DTYPES)
+def test_describe_datetime_data(pa_type):
+    # GH53001
+    data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
+    result = data.describe()
+    expected = pd.Series(
+        [9]
+        + [
+            pd.Timestamp(v, tz=pa_type.tz, unit=pa_type.unit)
+            for v in [5, 1, 3, 5, 7, 9]
+        ],
+        dtype=object,
+        index=["count", "mean", "min", "25%", "50%", "75%", "max"],
+    )
+    tm.assert_series_equal(result, expected)
+
+
 @pytest.mark.parametrize(
     "pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
 )

Original file line number	Diff line number	Diff line change
`@@ -23,6 +23,7 @@ Bug fixes`
`23`	`23`	- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame`'s of incorrect sizes when called on slices (:issue:`52824`)
`24`	`24`	- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
`25`	`25`	- Bug in :meth:`DataFrame.convert_dtypes` ignores ``convert_*`` keywords when set to False ``dtype_backend="pyarrow"`` (:issue:`52872`)
	`26`	+- Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`)
`26`	`27`	- Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`)
`27`	`28`	`-`
`28`	`29`