diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 8133e54c934ad..905348fd6db42 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -21,6 +21,17 @@ Other enhancements - - +.. --------------------------------------------------------------------------- + +.. _whatsnew_110.api.other: + +Other API changes +^^^^^^^^^^^^^^^^^ + +- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes. The statistics ``first`` and ``last`` + are now named ``min`` and ``max`` to match numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) +- +- .. --------------------------------------------------------------------------- diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 6332ff45c59d0..7bf7c8b7ae75f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9626,26 +9626,8 @@ def describe_categorical_1d(data): dtype = None if result[1] > 0: top, freq = objcounts.index[0], objcounts.iloc[0] - - if is_datetime64_any_dtype(data): - tz = data.dt.tz - asint = data.dropna().values.view("i8") - top = Timestamp(top) - if top.tzinfo is not None and tz is not None: - # Don't tz_localize(None) if key is already tz-aware - top = top.tz_convert(tz) - else: - top = top.tz_localize(tz) - names += ["top", "freq", "first", "last"] - result += [ - top, - freq, - Timestamp(asint.min(), tz=tz), - Timestamp(asint.max(), tz=tz), - ] - else: - names += ["top", "freq"] - result += [top, freq] + names += ["top", "freq"] + result += [top, freq] # If the DataFrame is empty, set 'top' and 'freq' to None # to maintain output shape consistency @@ -9656,11 +9638,23 @@ def describe_categorical_1d(data): return pd.Series(result, index=names, name=data.name, dtype=dtype) + def describe_timestamp_1d(data): + # GH-30164 + stat_index = ["count", "mean", "min"] + formatted_percentiles + ["max"] + d = ( + [data.count(), data.mean(), data.min()] + + data.quantile(percentiles).tolist() + + [data.max()] 
+ ) + return pd.Series(d, index=stat_index, name=data.name) + def describe_1d(data): if is_bool_dtype(data): return describe_categorical_1d(data) elif is_numeric_dtype(data): return describe_numeric_1d(data) + elif is_datetime64_any_dtype(data): + return describe_timestamp_1d(data) elif is_timedelta64_dtype(data): return describe_numeric_1d(data) else: diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 251563e51e15a..127233ed2713e 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -253,52 +253,19 @@ def test_describe_tz_values(self, tz_naive_fixture): expected = DataFrame( { - "s1": [ - 5, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - 2, - 1.581139, - 0, - 1, - 2, - 3, - 4, - ], + "s1": [5, 2, 0, 1, 2, 3, 4, 1.581139], "s2": [ 5, - 5, - s2.value_counts().index[0], - 1, + Timestamp(2018, 1, 3).tz_localize(tz), start.tz_localize(tz), + s2[1], + s2[2], + s2[3], end.tz_localize(tz), np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, - np.nan, ], }, - index=[ - "count", - "unique", - "top", - "freq", - "first", - "last", - "mean", - "std", - "min", - "25%", - "50%", - "75%", - "max", - ], + index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"], ) result = df.describe(include="all") tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index b147a04b11090..4e59c6995f4f2 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -1,6 +1,6 @@ import numpy as np -from pandas import Series, Timestamp, date_range +from pandas import Period, Series, Timedelta, Timestamp, date_range import pandas._testing as tm @@ -29,6 +29,36 @@ def test_describe(self): ) tm.assert_series_equal(result, expected) + s = Series( + [ + Timedelta("1 days"), + Timedelta("2 days"), + Timedelta("3 days"), + Timedelta("4 
days"), + Timedelta("5 days"), + ], + name="timedelta_data", + ) + result = s.describe() + expected = Series( + [5, s[2], s.std(), s[0], s[1], s[2], s[3], s[4]], + name="timedelta_data", + index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) + + s = Series( + [Period("2020-01", "M"), Period("2020-01", "M"), Period("2019-12", "M")], + name="period_data", + ) + result = s.describe() + expected = Series( + [3, 2, s[0], 2], + name="period_data", + index=["count", "unique", "top", "freq"], + ) + tm.assert_series_equal(result, expected) + def test_describe_empty_object(self): # https://github.com/pandas-dev/pandas/issues/27183 s = Series([None, None], dtype=object) @@ -57,13 +87,14 @@ def test_describe_with_tz(self, tz_naive_fixture): expected = Series( [ 5, - 5, - s.value_counts().index[0], - 1, + Timestamp(2018, 1, 3).tz_localize(tz), start.tz_localize(tz), + s[1], + s[2], + s[3], end.tz_localize(tz), ], name=name, - index=["count", "unique", "top", "freq", "first", "last"], + index=["count", "mean", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected)