Skip to content

Commit fa989d2

Browse files
committed
Backport PR pandas-dev#53001: BUG: Series.describe treating pyarrow timestamps/timedeltas as categorical
1 parent fbbdac5 commit fa989d2

File tree

3 files changed

+59
-6
lines changed

3 files changed

+59
-6
lines changed

doc/source/whatsnew/v2.0.2.rst

+1
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ Bug fixes
2323
- Bug in :func:`api.interchange.from_dataframe` returning :class:`DataFrame` objects of incorrect sizes when called on slices (:issue:`52824`)
2424
- Bug in :func:`api.interchange.from_dataframe` was unnecessarily raising on bitmasks (:issue:`49888`)
2525
- Bug in :meth:`DataFrame.convert_dtypes` ignoring ``convert_*`` keywords set to ``False`` when ``dtype_backend="pyarrow"`` (:issue:`52872`)
26+
- Bug in :meth:`Series.describe` treating pyarrow-backed timestamps and timedeltas as categorical data (:issue:`53001`)
2627
- Bug in :meth:`pd.array` raising for ``NumPy`` array and ``pa.large_string`` or ``pa.large_binary`` (:issue:`52590`)
2728
-
2829

pandas/core/methods/describe.py

+9-6
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,10 @@
3131
from pandas.core.dtypes.common import (
3232
is_bool_dtype,
3333
is_complex_dtype,
34-
is_datetime64_any_dtype,
3534
is_extension_array_dtype,
3635
is_numeric_dtype,
37-
is_timedelta64_dtype,
3836
)
37+
from pandas.core.dtypes.dtypes import DatetimeTZDtype
3938

4039
from pandas.core.arrays.arrow.dtype import ArrowDtype
4140
from pandas.core.arrays.floating import Float64Dtype
@@ -232,9 +231,13 @@ def describe_numeric_1d(series: Series, percentiles: Sequence[float]) -> Series:
232231
dtype: DtypeObj | None
233232
if is_extension_array_dtype(series):
234233
if isinstance(series.dtype, ArrowDtype):
235-
import pyarrow as pa
234+
if series.dtype.kind == "m":
235+
# GH53001: describe timedeltas with object dtype
236+
dtype = None
237+
else:
238+
import pyarrow as pa
236239

237-
dtype = ArrowDtype(pa.float64())
240+
dtype = ArrowDtype(pa.float64())
238241
else:
239242
dtype = Float64Dtype()
240243
elif is_numeric_dtype(series) and not is_complex_dtype(series):
@@ -362,9 +365,9 @@ def select_describe_func(
362365
return describe_categorical_1d
363366
elif is_numeric_dtype(data):
364367
return describe_numeric_1d
365-
elif is_datetime64_any_dtype(data.dtype):
368+
elif data.dtype.kind == "M" or isinstance(data.dtype, DatetimeTZDtype):
366369
return describe_timestamp_1d
367-
elif is_timedelta64_dtype(data.dtype):
370+
elif data.dtype.kind == "m":
368371
return describe_numeric_1d
369372
else:
370373
return describe_categorical_1d

pandas/tests/extension/test_arrow.py

+49
Original file line numberDiff line numberDiff line change
@@ -2658,6 +2658,55 @@ def test_describe_numeric_data(pa_type):
26582658
tm.assert_series_equal(result, expected)
26592659

26602660

2661+
@pytest.mark.parametrize("pa_type", tm.TIMEDELTA_PYARROW_DTYPES)
2662+
def test_describe_timedelta_data(pa_type):
2663+
# GH53001
2664+
data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
2665+
result = data.describe()
2666+
expected = pd.Series(
2667+
[9] + pd.to_timedelta([5, 2, 1, 3, 5, 7, 9], unit=pa_type.unit).tolist(),
2668+
dtype=object,
2669+
index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
2670+
)
2671+
tm.assert_series_equal(result, expected)
2672+
2673+
2674+
@pytest.mark.parametrize("pa_type", tm.DATETIME_PYARROW_DTYPES)
2675+
def test_describe_datetime_data(pa_type):
2676+
# GH53001
2677+
data = pd.Series(range(1, 10), dtype=ArrowDtype(pa_type))
2678+
result = data.describe()
2679+
expected = pd.Series(
2680+
[9]
2681+
+ [
2682+
pd.Timestamp(v, tz=pa_type.tz, unit=pa_type.unit)
2683+
for v in [5, 1, 3, 5, 7, 9]
2684+
],
2685+
dtype=object,
2686+
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
2687+
)
2688+
tm.assert_series_equal(result, expected)
2689+
2690+
2691+
@pytest.mark.parametrize(
2692+
"pa_type", tm.DATETIME_PYARROW_DTYPES + tm.TIMEDELTA_PYARROW_DTYPES
2693+
)
2694+
def test_quantile_temporal(pa_type):
2695+
# GH52678
2696+
data = [1, 2, 3]
2697+
ser = pd.Series(data, dtype=ArrowDtype(pa_type))
2698+
result = ser.quantile(0.1)
2699+
expected = ser[0]
2700+
assert result == expected
2701+
2702+
2703+
def test_date32_repr():
2704+
# GH48238
2705+
arrow_dt = pa.array([date.fromisoformat("2020-01-01")], type=pa.date32())
2706+
ser = pd.Series(arrow_dt, dtype=ArrowDtype(arrow_dt.type))
2707+
assert repr(ser) == "0 2020-01-01\ndtype: date32[day][pyarrow]"
2708+
2709+
26612710
@pytest.mark.xfail(
26622711
pa_version_under8p0,
26632712
reason="Function 'add_checked' has no kernel matching input types",

0 commit comments

Comments
 (0)