Skip to content

Commit b018691

Browse files
API: Make describe changes backwards compatible (#34798)
1 parent e5dcdd1 commit b018691

File tree

4 files changed: +153 -17 lines changed

doc/source/whatsnew/v1.1.0.rst

+1-9
Original file line number | Diff line number | Diff line change
@@ -280,6 +280,7 @@ Other enhancements
280280
- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
281281
- Added a :class:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
282282
- Added a :class:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`)
283+
- :meth:`~DataFrame.describe` now includes a ``datetime_is_numeric`` keyword to control how datetime columns are summarized (:issue:`30164`, :issue:`34798`)
283284
- :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`)
284285
- :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`)
285286
- When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`)
@@ -675,15 +676,6 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once
675676
676677
df.apply(func, axis=1)
677678
678-
.. _whatsnew_110.api.other:
679-
680-
Other API changes
681-
^^^^^^^^^^^^^^^^^
682-
683-
- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
684-
will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
685-
686-
687679
Increased minimum versions for dependencies
688680
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
689681

pandas/core/generic.py

+48-6
Original file line number | Diff line number | Diff line change
@@ -9711,7 +9711,11 @@ def abs(self: FrameOrSeries) -> FrameOrSeries:
97119711
return np.abs(self)
97129712

97139713
def describe(
9714-
self: FrameOrSeries, percentiles=None, include=None, exclude=None
9714+
self: FrameOrSeries,
9715+
percentiles=None,
9716+
include=None,
9717+
exclude=None,
9718+
datetime_is_numeric=False,
97159719
) -> FrameOrSeries:
97169720
"""
97179721
Generate descriptive statistics.
@@ -9757,6 +9761,12 @@ def describe(
97579761
``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To
97589762
exclude pandas categorical columns, use ``'category'``
97599763
- None (default) : The result will exclude nothing.
9764+
datetime_is_numeric : bool, default False
9765+
Whether to treat datetime dtypes as numeric. This affects statistics
9766+
calculated for the column. For DataFrame input, this also
9767+
controls whether datetime columns are included by default.
9768+
9769+
.. versionadded:: 1.1.0
97609770
97619771
Returns
97629772
-------
@@ -9834,7 +9844,7 @@ def describe(
98349844
... np.datetime64("2010-01-01"),
98359845
... np.datetime64("2010-01-01")
98369846
... ])
9837-
>>> s.describe()
9847+
>>> s.describe(datetime_is_numeric=True)
98389848
count 3
98399849
mean 2006-09-01 08:00:00
98409850
min 2000-01-01 00:00:00
@@ -9992,8 +10002,37 @@ def describe_categorical_1d(data):
999210002
dtype = None
999310003
if result[1] > 0:
999410004
top, freq = objcounts.index[0], objcounts.iloc[0]
9995-
names += ["top", "freq"]
9996-
result += [top, freq]
10005+
if is_datetime64_any_dtype(data.dtype):
10006+
if self.ndim == 1:
10007+
stacklevel = 4
10008+
else:
10009+
stacklevel = 5
10010+
warnings.warn(
10011+
"Treating datetime data as categorical rather than numeric in "
10012+
"`.describe` is deprecated and will be removed in a future "
10013+
"version of pandas. Specify `datetime_is_numeric=True` to "
10014+
"silence this warning and adopt the future behavior now.",
10015+
FutureWarning,
10016+
stacklevel=stacklevel,
10017+
)
10018+
tz = data.dt.tz
10019+
asint = data.dropna().values.view("i8")
10020+
top = Timestamp(top)
10021+
if top.tzinfo is not None and tz is not None:
10022+
# Don't tz_localize(None) if key is already tz-aware
10023+
top = top.tz_convert(tz)
10024+
else:
10025+
top = top.tz_localize(tz)
10026+
names += ["top", "freq", "first", "last"]
10027+
result += [
10028+
top,
10029+
freq,
10030+
Timestamp(asint.min(), tz=tz),
10031+
Timestamp(asint.max(), tz=tz),
10032+
]
10033+
else:
10034+
names += ["top", "freq"]
10035+
result += [top, freq]
999710036

999810037
# If the DataFrame is empty, set 'top' and 'freq' to None
999910038
# to maintain output shape consistency
@@ -10019,7 +10058,7 @@ def describe_1d(data):
1001910058
return describe_categorical_1d(data)
1002010059
elif is_numeric_dtype(data):
1002110060
return describe_numeric_1d(data)
10022-
elif is_datetime64_any_dtype(data.dtype):
10061+
elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric:
1002310062
return describe_timestamp_1d(data)
1002410063
elif is_timedelta64_dtype(data.dtype):
1002510064
return describe_numeric_1d(data)
@@ -10030,7 +10069,10 @@ def describe_1d(data):
1003010069
return describe_1d(self)
1003110070
elif (include is None) and (exclude is None):
1003210071
# when some numerics are found, keep only numerics
10033-
data = self.select_dtypes(include=[np.number])
10072+
default_include = [np.number]
10073+
if datetime_is_numeric:
10074+
default_include.append("datetime")
10075+
data = self.select_dtypes(include=default_include)
1003410076
if len(data.columns) == 0:
1003510077
data = self
1003610078
elif include == "all":

pandas/tests/frame/methods/test_describe.py

+63-1
Original file line number | Diff line number | Diff line change
@@ -267,7 +267,69 @@ def test_describe_tz_values(self, tz_naive_fixture):
267267
},
268268
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
269269
)
270-
result = df.describe(include="all")
270+
result = df.describe(include="all", datetime_is_numeric=True)
271+
tm.assert_frame_equal(result, expected)
272+
273+
def test_datetime_is_numeric_includes_datetime(self):
274+
df = pd.DataFrame({"a": pd.date_range("2012", periods=3), "b": [1, 2, 3]})
275+
result = df.describe(datetime_is_numeric=True)
276+
expected = pd.DataFrame(
277+
{
278+
"a": [
279+
3,
280+
pd.Timestamp("2012-01-02"),
281+
pd.Timestamp("2012-01-01"),
282+
pd.Timestamp("2012-01-01T12:00:00"),
283+
pd.Timestamp("2012-01-02"),
284+
pd.Timestamp("2012-01-02T12:00:00"),
285+
pd.Timestamp("2012-01-03"),
286+
np.nan,
287+
],
288+
"b": [3, 2, 1, 1.5, 2, 2.5, 3, 1],
289+
},
290+
index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
291+
)
292+
tm.assert_frame_equal(result, expected)
293+
294+
def test_describe_tz_values2(self):
295+
tz = "CET"
296+
s1 = Series(range(5))
297+
start = Timestamp(2018, 1, 1)
298+
end = Timestamp(2018, 1, 5)
299+
s2 = Series(date_range(start, end, tz=tz))
300+
df = pd.DataFrame({"s1": s1, "s2": s2})
301+
302+
s1_ = s1.describe()
303+
s2_ = pd.Series(
304+
[
305+
5,
306+
5,
307+
s2.value_counts().index[0],
308+
1,
309+
start.tz_localize(tz),
310+
end.tz_localize(tz),
311+
],
312+
index=["count", "unique", "top", "freq", "first", "last"],
313+
)
314+
idx = [
315+
"count",
316+
"unique",
317+
"top",
318+
"freq",
319+
"first",
320+
"last",
321+
"mean",
322+
"std",
323+
"min",
324+
"25%",
325+
"50%",
326+
"75%",
327+
"max",
328+
]
329+
expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]
330+
331+
with tm.assert_produces_warning(FutureWarning):
332+
result = df.describe(include="all")
271333
tm.assert_frame_equal(result, expected)
272334

273335
def test_describe_percentiles_integer_idx(self):

pandas/tests/series/methods/test_describe.py

+41-1
Original file line number | Diff line number | Diff line change
@@ -83,7 +83,7 @@ def test_describe_with_tz(self, tz_naive_fixture):
8383
start = Timestamp(2018, 1, 1)
8484
end = Timestamp(2018, 1, 5)
8585
s = Series(date_range(start, end, tz=tz), name=name)
86-
result = s.describe()
86+
result = s.describe(datetime_is_numeric=True)
8787
expected = Series(
8888
[
8989
5,
@@ -98,3 +98,43 @@ def test_describe_with_tz(self, tz_naive_fixture):
9898
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
9999
)
100100
tm.assert_series_equal(result, expected)
101+
102+
def test_describe_with_tz_warns(self):
103+
name = tz = "CET"
104+
start = Timestamp(2018, 1, 1)
105+
end = Timestamp(2018, 1, 5)
106+
s = Series(date_range(start, end, tz=tz), name=name)
107+
108+
with tm.assert_produces_warning(FutureWarning):
109+
result = s.describe()
110+
111+
expected = Series(
112+
[
113+
5,
114+
5,
115+
s.value_counts().index[0],
116+
1,
117+
start.tz_localize(tz),
118+
end.tz_localize(tz),
119+
],
120+
name=name,
121+
index=["count", "unique", "top", "freq", "first", "last"],
122+
)
123+
tm.assert_series_equal(result, expected)
124+
125+
def test_datetime_is_numeric_includes_datetime(self):
126+
s = Series(date_range("2012", periods=3))
127+
result = s.describe(datetime_is_numeric=True)
128+
expected = Series(
129+
[
130+
3,
131+
Timestamp("2012-01-02"),
132+
Timestamp("2012-01-01"),
133+
Timestamp("2012-01-01T12:00:00"),
134+
Timestamp("2012-01-02"),
135+
Timestamp("2012-01-02T12:00:00"),
136+
Timestamp("2012-01-03"),
137+
],
138+
index=["count", "mean", "min", "25%", "50%", "75%", "max"],
139+
)
140+
tm.assert_series_equal(result, expected)

0 commit comments

Comments
 (0)