From 5a5134cbcf1614c4c79b46dc9da93b9849f76d0d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 27 Oct 2022 22:36:01 -0700 Subject: [PATCH 1/2] DEPR: Remove datetime_is_numeric in describe --- doc/source/whatsnew/v2.0.0.rst | 1 + pandas/core/describe.py | 39 +++----------------- pandas/core/generic.py | 10 +---- pandas/tests/frame/methods/test_describe.py | 30 ++++----------- pandas/tests/series/methods/test_describe.py | 22 +++++------ 5 files changed, 26 insertions(+), 76 deletions(-) diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 252c444b2e60c..c239f049d26fb 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -192,6 +192,7 @@ Removal of prior version deprecations/changes - Removed argument ``sort_columns`` in :meth:`DataFrame.plot` and :meth:`Series.plot` (:issue:`47563`) - Removed argument ``is_copy`` from :meth:`DataFrame.take` and :meth:`Series.take` (:issue:`30615`) - Removed argument ``kind`` from :meth:`Index.get_slice_bound`, :meth:`Index.slice_indexer` and :meth:`Index.slice_locs` (:issue:`41378`) +- Removed argument ``datetime_is_numeric`` from :meth:`DataFrame.describe` and :meth:`Series.describe` as datetime data will always be summarized as numeric data (:issue:`34798`) - Disallow subclass-specific keywords (e.g. "freq", "tz", "names", "closed") in the :class:`Index` constructor (:issue:`38597`) - Removed argument ``inplace`` from :meth:`Categorical.remove_unused_categories` (:issue:`37918`) - Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index ce2fa950e6e62..70ae885678546 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -17,7 +17,6 @@ Sequence, cast, ) -import warnings import numpy as np @@ -27,7 +26,6 @@ NDFrameT, npt, ) -from pandas.util._exceptions import find_stack_level from pandas.util._validators import validate_percentile from pandas.core.dtypes.common import ( @@ -56,7 +54,6 @@ def describe_ndframe( obj: NDFrameT, include: str | Sequence[str] | None, exclude: str | Sequence[str] | None, - datetime_is_numeric: bool, percentiles: Sequence[float] | np.ndarray | None, ) -> NDFrameT: """Describe series or dataframe. @@ -71,8 +68,6 @@ def describe_ndframe( A white list of data types to include in the result. Ignored for ``Series``. exclude : list-like of dtypes or None (default), optional, A black list of data types to omit from the result. Ignored for ``Series``. - datetime_is_numeric : bool, default False - Whether to treat datetime dtypes as numeric. percentiles : list-like of numbers, optional The percentiles to include in the output. All should fall between 0 and 1. The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and @@ -88,14 +83,12 @@ def describe_ndframe( if obj.ndim == 1: describer = SeriesDescriber( obj=cast("Series", obj), - datetime_is_numeric=datetime_is_numeric, ) else: describer = DataFrameDescriber( obj=cast("DataFrame", obj), include=include, exclude=exclude, - datetime_is_numeric=datetime_is_numeric, ) result = describer.describe(percentiles=percentiles) @@ -109,13 +102,10 @@ class NDFrameDescriberAbstract(ABC): ---------- obj : Series or DataFrame Object to be described. - datetime_is_numeric : bool - Whether to treat datetime dtypes as numeric. """ - def __init__(self, obj: DataFrame | Series, datetime_is_numeric: bool) -> None: + def __init__(self, obj: DataFrame | Series) -> None: self.obj = obj - self.datetime_is_numeric = datetime_is_numeric @abstractmethod def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series: @@ -136,7 +126,6 @@ class SeriesDescriber(NDFrameDescriberAbstract): def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series: describe_func = select_describe_func( self.obj, - self.datetime_is_numeric, ) return describe_func(self.obj, percentiles) @@ -152,8 +141,6 @@ class DataFrameDescriber(NDFrameDescriberAbstract): A white list of data types to include in the result. exclude : list-like of dtypes or None A black list of data types to omit from the result. - datetime_is_numeric : bool - Whether to treat datetime dtypes as numeric. """ def __init__( @@ -162,7 +149,6 @@ def __init__( *, include: str | Sequence[str] | None, exclude: str | Sequence[str] | None, - datetime_is_numeric: bool, ) -> None: self.include = include self.exclude = exclude @@ -170,14 +156,14 @@ def __init__( if obj.ndim == 2 and obj.columns.size == 0: raise ValueError("Cannot describe a DataFrame without columns") - super().__init__(obj, datetime_is_numeric=datetime_is_numeric) + super().__init__(obj) def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame: data = self._select_data() ldesc: list[Series] = [] for _, series in data.items(): - describe_func = select_describe_func(series, self.datetime_is_numeric) + describe_func = select_describe_func(series) ldesc.append(describe_func(series, percentiles)) col_names = reorder_columns(ldesc) @@ -194,8 +180,7 @@ def _select_data(self): if (self.include is None) and (self.exclude is None): # when some numerics are found, keep only numerics default_include: list[npt.DTypeLike] = [np.number] - if self.datetime_is_numeric: - default_include.append("datetime") + default_include.append("datetime") data = self.obj.select_dtypes(include=default_include) if len(data.columns) == 0: data = self.obj @@ -360,7 +345,6 @@ def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series: def select_describe_func( data: Series, - datetime_is_numeric: bool, ) -> Callable: """Select proper function for describing series based on data type. @@ -368,26 +352,13 @@ def select_describe_func( ---------- data : Series Series to be described. - datetime_is_numeric : bool - Whether to treat datetime dtypes as numeric. """ if is_bool_dtype(data.dtype): return describe_categorical_1d elif is_numeric_dtype(data): return describe_numeric_1d elif is_datetime64_any_dtype(data.dtype): - if datetime_is_numeric: - return describe_timestamp_1d - else: - warnings.warn( - "Treating datetime data as categorical rather than numeric in " - "`.describe` is deprecated and will be removed in a future " - "version of pandas. Specify `datetime_is_numeric=True` to " - "silence this warning and adopt the future behavior now.", - FutureWarning, - stacklevel=find_stack_level(), - ) - return describe_timestamp_as_categorical_1d + return describe_timestamp_1d elif is_timedelta64_dtype(data.dtype): return describe_numeric_1d else: diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 05494e37256df..a6259cf30a31f 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10588,7 +10588,6 @@ def describe( percentiles=None, include=None, exclude=None, - datetime_is_numeric: bool_t = False, ) -> NDFrameT: """ Generate descriptive statistics. @@ -10634,12 +10633,6 @@ def describe( ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To exclude pandas categorical columns, use ``'category'`` - None (default) : The result will exclude nothing. - datetime_is_numeric : bool, default False - Whether to treat datetime dtypes as numeric. This affects statistics - calculated for the column. For DataFrame input, this also - controls whether datetime columns are included by default. - - .. versionadded:: 1.1.0 Returns ------- @@ -10717,7 +10710,7 @@ def describe( ... np.datetime64("2010-01-01"), ... np.datetime64("2010-01-01") ... ]) - >>> s.describe(datetime_is_numeric=True) + >>> s.describe() count 3 mean 2006-09-01 08:00:00 min 2000-01-01 00:00:00 @@ -10835,7 +10828,6 @@ def describe( obj=self, include=include, exclude=exclude, - datetime_is_numeric=datetime_is_numeric, percentiles=percentiles, ) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 24d327a101143..e2b8a0f63c31a 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -274,12 +274,12 @@ def test_describe_tz_values(self, tz_naive_fixture): }, index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"], ) - result = df.describe(include="all", datetime_is_numeric=True) + result = df.describe(include="all") tm.assert_frame_equal(result, expected) def test_datetime_is_numeric_includes_datetime(self): df = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]}) - result = df.describe(datetime_is_numeric=True) + result = df.describe() expected = DataFrame( { "a": [ @@ -307,36 +307,22 @@ def test_describe_tz_values2(self): df = DataFrame({"s1": s1, "s2": s2}) s1_ = s1.describe() - s2_ = Series( - [ - 5, - 5, - s2.value_counts().index[0], - 1, - start.tz_localize(tz), - end.tz_localize(tz), - ], - index=["count", "unique", "top", "freq", "first", "last"], - ) + s2_ = s2.describe() idx = [ "count", - "unique", - "top", - "freq", - "first", - "last", "mean", - "std", "min", "25%", "50%", "75%", "max", + "std", ] - expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx] + expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex( + idx, copy=False + ) - with tm.assert_produces_warning(FutureWarning): - result = df.describe(include="all") + result = df.describe(include="all") tm.assert_frame_equal(result, expected) def test_describe_percentiles_integer_idx(self): diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index a7cedd580b2d0..3d813268b57be 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -99,7 +99,7 @@ def test_describe_with_tz(self, tz_naive_fixture): start = Timestamp(2018, 1, 1) end = Timestamp(2018, 1, 5) s = Series(date_range(start, end, tz=tz), name=name) - result = s.describe(datetime_is_numeric=True) + result = s.describe() expected = Series( [ 5, @@ -115,32 +115,32 @@ def test_describe_with_tz(self, tz_naive_fixture): ) tm.assert_series_equal(result, expected) - def test_describe_with_tz_warns(self): + def test_describe_with_tz_numeric(self): name = tz = "CET" start = Timestamp(2018, 1, 1) end = Timestamp(2018, 1, 5) s = Series(date_range(start, end, tz=tz), name=name) - with tm.assert_produces_warning(FutureWarning): - result = s.describe() + result = s.describe() expected = Series( [ 5, - 5, - s.value_counts().index[0], - 1, - start.tz_localize(tz), - end.tz_localize(tz), + Timestamp("2018-01-03 00:00:00", tz=tz), + Timestamp("2018-01-01 00:00:00", tz=tz), + Timestamp("2018-01-02 00:00:00", tz=tz), + Timestamp("2018-01-03 00:00:00", tz=tz), + Timestamp("2018-01-04 00:00:00", tz=tz), + Timestamp("2018-01-05 00:00:00", tz=tz), ], name=name, - index=["count", "unique", "top", "freq", "first", "last"], + index=["count", "mean", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected) def test_datetime_is_numeric_includes_datetime(self): s = Series(date_range("2012", periods=3)) - result = s.describe(datetime_is_numeric=True) + result = s.describe() expected = Series( [ 3, From 57ff3542ca6be200dd809ff51388e9e32938e76f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 31 Oct 2022 09:59:44 -0700 Subject: [PATCH 2/2] Simplify --- pandas/core/describe.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/core/describe.py b/pandas/core/describe.py index 70ae885678546..33afbfe6489a6 100644 --- a/pandas/core/describe.py +++ b/pandas/core/describe.py @@ -179,8 +179,7 @@ def _select_data(self): """Select columns to be described.""" if (self.include is None) and (self.exclude is None): # when some numerics are found, keep only numerics - default_include: list[npt.DTypeLike] = [np.number] - default_include.append("datetime") + default_include: list[npt.DTypeLike] = [np.number, "datetime"] data = self.obj.select_dtypes(include=default_include) if len(data.columns) == 0: data = self.obj