Skip to content

Commit 88b6396

Browse files
lukemanleyYi Wei
authored and
Yi Wei
committed
BUG: Series.describe treating pyarrow timestamps/timedeltas as categorical (pandas-dev#53001)
* Series.describe treating pyarrow timestamps and timedeltas as categorical * gh refs * cleanup
1 parent c872490 commit 88b6396

File tree

3 files changed

+39
-5
lines changed

3 files changed

+39
-5
lines changed

doc/source/whatsnew/v2.0.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Bug fixes
2323
- Bug in :func:`api.interchange.from_dataframe` was returning :class:`DataFrame` objects of incorrect sizes when called on slices (:issue:`52824`)
2424
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
2525
- Bug in :meth:`DataFrame.convert_dtypes` ignoring ``convert_*`` keywords set to ``False`` when ``dtype_backend="pyarrow"`` (:issue:`52872`)
26+
- Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`)
2627
- Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`)
2728
-
2829

pandas/core/methods/describe.py

+8-5
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@
1919

2020
import numpy as np
2121

22-
from pandas._libs import lib
2322
from pandas._libs.tslibs import Timestamp
2423
from pandas._typing import (
2524
DtypeObj,
@@ -232,9 +231,13 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
232231
dtype: DtypeObj | None
233232
if isinstance(series.dtype, ExtensionDtype):
234233
if isinstance(series.dtype, ArrowDtype):
235-
import pyarrow as pa
234+
if series.dtype.kind == "m":
235+
# GH53001: describe timedeltas with object dtype
236+
dtype = None
237+
else:
238+
import pyarrow as pa
236239

237-
dtype = ArrowDtype(pa.float64())
240+
dtype = ArrowDtype(pa.float64())
238241
else:
239242
dtype = Float64Dtype()
240243
elif series.dtype.kind in "iufb":
@@ -363,9 +366,9 @@ def select_describe_func(
363366
return describe_categorical_1d
364367
elif is_numeric_dtype(data):
365368
return describe_numeric_1d
366-
elif lib.is_np_dtype(data.dtype, "M") or isinstance(data.dtype, DatetimeTZDtype):
369+
elif data.dtype.kind == "M" or isinstance(data.dtype, DatetimeTZDtype):
367370
return describe_timestamp_1d
368-
elif lib.is_np_dtype(data.dtype, "m"):
371+
elif data.dtype.kind == "m":
369372
return describe_numeric_1d
370373
else:
371374
return describe_categorical_1d

pandas/tests/extension/test_arrow.py

+30
Original file line numberDiff line numberDiff line change
@@ -2842,6 +2842,36 @@ def test_describe_numeric_data(pa_type):
28422842
tm.assert_series_equal(result, expected)
28432843

28442844

2845+
@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
2846+
def test_describe_timedelta_data(pa_type):
2847+
# GH53001
2848+
data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
2849+
result = data.describe()
2850+
expected = pd.Series(
2851+
[9] + pd.to_timedelta([5, 2, 1, 3, 5, 7, 9], unit=pa_type.unit).tolist(),
2852+
dtype=object,
2853+
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
2854+
)
2855+
tm.assert_series_equal(result, expected)
2856+
2857+
2858+
@pytest.mark.parametrize("pa_type", tm.DATETIME_PYARROW_DTYPES)
2859+
def test_describe_datetime_data(pa_type):
2860+
# GH53001
2861+
data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
2862+
result = data.describe()
2863+
expected = pd.Series(
2864+
[9]
2865+
+ [
2866+
pd.Timestamp(v, tz=pa_type.tz, unit=pa_type.unit)
2867+
for v in [5, 1, 3, 5, 7, 9]
2868+
],
2869+
dtype=object,
2870+
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
2871+
)
2872+
tm.assert_series_equal(result, expected)
2873+
2874+
28452875
@pytest.mark.parametrize(
28462876
"pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
28472877
)

0 commit comments

Comments
 (0)