DEPR: Remove datetime_is_numeric in describe (#49368)

mroeschke · web-flow · commit ead5c756da23 · 2022-10-31T12:17:59.000-07:00
* DEPR: Remove datetime_is_numeric in describe

* Simplify
diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
@@ -202,6 +202,7 @@ Removal of prior version deprecations/changes
 - Removed argument ``sort_columns`` in :meth:`DataFrame.plot` and :meth:`Series.plot` (:issue:`47563`)
 - Removed argument ``is_copy`` from :meth:`DataFrame.take` and :meth:`Series.take` (:issue:`30615`)
 - Removed argument ``kind`` from :meth:`Index.get_slice_bound`, :meth:`Index.slice_indexer` and :meth:`Index.slice_locs` (:issue:`41378`)
+- Removed argument ``datetime_is_numeric`` from :meth:`DataFrame.describe` and :meth:`Series.describe`, as datetime data will now always be summarized as numeric data (:issue:`34798`)
 - Disallow subclass-specific keywords (e.g. "freq", "tz", "names", "closed") in the :class:`Index` constructor (:issue:`38597`)
 - Removed argument ``inplace`` from :meth:`Categorical.remove_unused_categories` (:issue:`37918`)
 - Disallow passing non-round floats to :class:`Timestamp` with ``unit="M"`` or ``unit="Y"`` (:issue:`47266`)
diff --git a/pandas/core/describe.py b/pandas/core/describe.py
@@ -17,7 +17,6 @@
     Sequence,
     cast,
 )
-import warnings
 
 import numpy as np
 
@@ -27,7 +26,6 @@
     NDFrameT,
     npt,
 )
-from pandas.util._exceptions import find_stack_level
 from pandas.util._validators import validate_percentile
 
 from pandas.core.dtypes.common import (
@@ -56,7 +54,6 @@ def describe_ndframe(
     obj: NDFrameT,
     include: str | Sequence[str] | None,
     exclude: str | Sequence[str] | None,
-    datetime_is_numeric: bool,
     percentiles: Sequence[float] | np.ndarray | None,
 ) -> NDFrameT:
     """Describe series or dataframe.
@@ -71,8 +68,6 @@ def describe_ndframe(
         A white list of data types to include in the result. Ignored for ``Series``.
     exclude : list-like of dtypes or None (default), optional,
         A black list of data types to omit from the result. Ignored for ``Series``.
-    datetime_is_numeric : bool, default False
-        Whether to treat datetime dtypes as numeric.
     percentiles : list-like of numbers, optional
         The percentiles to include in the output. All should fall between 0 and 1.
         The default is ``[.25, .5, .75]``, which returns the 25th, 50th, and
@@ -88,14 +83,12 @@ def describe_ndframe(
     if obj.ndim == 1:
         describer = SeriesDescriber(
             obj=cast("Series", obj),
-            datetime_is_numeric=datetime_is_numeric,
         )
     else:
         describer = DataFrameDescriber(
             obj=cast("DataFrame", obj),
             include=include,
             exclude=exclude,
-            datetime_is_numeric=datetime_is_numeric,
         )
 
     result = describer.describe(percentiles=percentiles)
@@ -109,13 +102,10 @@ class NDFrameDescriberAbstract(ABC):
     ----------
     obj : Series or DataFrame
         Object to be described.
-    datetime_is_numeric : bool
-        Whether to treat datetime dtypes as numeric.
     """
 
-    def __init__(self, obj: DataFrame | Series, datetime_is_numeric: bool) -> None:
+    def __init__(self, obj: DataFrame | Series) -> None:
         self.obj = obj
-        self.datetime_is_numeric = datetime_is_numeric
 
     @abstractmethod
     def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame | Series:
@@ -136,7 +126,6 @@ class SeriesDescriber(NDFrameDescriberAbstract):
     def describe(self, percentiles: Sequence[float] | np.ndarray) -> Series:
         describe_func = select_describe_func(
             self.obj,
-            self.datetime_is_numeric,
         )
         return describe_func(self.obj, percentiles)
 
@@ -152,8 +141,6 @@ class DataFrameDescriber(NDFrameDescriberAbstract):
         A white list of data types to include in the result.
     exclude : list-like of dtypes or None
         A black list of data types to omit from the result.
-    datetime_is_numeric : bool
-        Whether to treat datetime dtypes as numeric.
     """
 
     def __init__(
@@ -162,22 +149,21 @@ def __init__(
         *,
         include: str | Sequence[str] | None,
         exclude: str | Sequence[str] | None,
-        datetime_is_numeric: bool,
     ) -> None:
         self.include = include
         self.exclude = exclude
 
         if obj.ndim == 2 and obj.columns.size == 0:
             raise ValueError("Cannot describe a DataFrame without columns")
 
-        super().__init__(obj, datetime_is_numeric=datetime_is_numeric)
+        super().__init__(obj)
 
     def describe(self, percentiles: Sequence[float] | np.ndarray) -> DataFrame:
         data = self._select_data()
 
         ldesc: list[Series] = []
         for _, series in data.items():
-            describe_func = select_describe_func(series, self.datetime_is_numeric)
+            describe_func = select_describe_func(series)
             ldesc.append(describe_func(series, percentiles))
 
         col_names = reorder_columns(ldesc)
@@ -193,9 +179,7 @@ def _select_data(self):
         """Select columns to be described."""
         if (self.include is None) and (self.exclude is None):
             # when some numerics are found, keep only numerics
-            default_include: list[npt.DTypeLike] = [np.number]
-            if self.datetime_is_numeric:
-                default_include.append("datetime")
+            default_include: list[npt.DTypeLike] = [np.number, "datetime"]
             data = self.obj.select_dtypes(include=default_include)
             if len(data.columns) == 0:
                 data = self.obj
@@ -360,34 +344,20 @@ def describe_timestamp_1d(data: Series, percentiles: Sequence[float]) -> Series:
 
 def select_describe_func(
     data: Series,
-    datetime_is_numeric: bool,
 ) -> Callable:
     """Select proper function for describing series based on data type.
 
     Parameters
     ----------
     data : Series
         Series to be described.
-    datetime_is_numeric : bool
-        Whether to treat datetime dtypes as numeric.
     """
     if is_bool_dtype(data.dtype):
         return describe_categorical_1d
     elif is_numeric_dtype(data):
         return describe_numeric_1d
     elif is_datetime64_any_dtype(data.dtype):
-        if datetime_is_numeric:
-            return describe_timestamp_1d
-        else:
-            warnings.warn(
-                "Treating datetime data as categorical rather than numeric in "
-                "`.describe` is deprecated and will be removed in a future "
-                "version of pandas. Specify `datetime_is_numeric=True` to "
-                "silence this warning and adopt the future behavior now.",
-                FutureWarning,
-                stacklevel=find_stack_level(),
-            )
-            return describe_timestamp_as_categorical_1d
+        return describe_timestamp_1d
     elif is_timedelta64_dtype(data.dtype):
         return describe_numeric_1d
     else:
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
@@ -10545,7 +10545,6 @@ def describe(
         percentiles=None,
         include=None,
         exclude=None,
-        datetime_is_numeric: bool_t = False,
     ) -> NDFrameT:
         """
         Generate descriptive statistics.
@@ -10591,12 +10590,6 @@ def describe(
               ``select_dtypes`` (e.g. ``df.describe(exclude=['O'])``). To
               exclude pandas categorical columns, use ``'category'``
             - None (default) : The result will exclude nothing.
-        datetime_is_numeric : bool, default False
-            Whether to treat datetime dtypes as numeric. This affects statistics
-            calculated for the column. For DataFrame input, this also
-            controls whether datetime columns are included by default.
-
-            .. versionadded:: 1.1.0
 
         Returns
         -------
@@ -10674,7 +10667,7 @@ def describe(
         ...   np.datetime64("2010-01-01"),
         ...   np.datetime64("2010-01-01")
         ... ])
-        >>> s.describe(datetime_is_numeric=True)
+        >>> s.describe()
         count                      3
         mean     2006-09-01 08:00:00
         min      2000-01-01 00:00:00
@@ -10792,7 +10785,6 @@ def describe(
             obj=self,
             include=include,
             exclude=exclude,
-            datetime_is_numeric=datetime_is_numeric,
             percentiles=percentiles,
         )
 
diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py
@@ -274,12 +274,12 @@ def test_describe_tz_values(self, tz_naive_fixture):
             },
             index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"],
         )
-        result = df.describe(include="all", datetime_is_numeric=True)
+        result = df.describe(include="all")
         tm.assert_frame_equal(result, expected)
 
     def test_datetime_is_numeric_includes_datetime(self):
         df = DataFrame({"a": date_range("2012", periods=3), "b": [1, 2, 3]})
-        result = df.describe(datetime_is_numeric=True)
+        result = df.describe()
         expected = DataFrame(
             {
                 "a": [
@@ -307,36 +307,22 @@ def test_describe_tz_values2(self):
         df = DataFrame({"s1": s1, "s2": s2})
 
         s1_ = s1.describe()
-        s2_ = Series(
-            [
-                5,
-                5,
-                s2.value_counts().index[0],
-                1,
-                start.tz_localize(tz),
-                end.tz_localize(tz),
-            ],
-            index=["count", "unique", "top", "freq", "first", "last"],
-        )
+        s2_ = s2.describe()
         idx = [
             "count",
-            "unique",
-            "top",
-            "freq",
-            "first",
-            "last",
             "mean",
-            "std",
             "min",
             "25%",
             "50%",
             "75%",
             "max",
+            "std",
         ]
-        expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx]
+        expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).reindex(
+            idx, copy=False
+        )
 
-        with tm.assert_produces_warning(FutureWarning):
-            result = df.describe(include="all")
+        result = df.describe(include="all")
         tm.assert_frame_equal(result, expected)
 
     def test_describe_percentiles_integer_idx(self):
diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py
@@ -99,7 +99,7 @@ def test_describe_with_tz(self, tz_naive_fixture):
         start = Timestamp(2018, 1, 1)
         end = Timestamp(2018, 1, 5)
         s = Series(date_range(start, end, tz=tz), name=name)
-        result = s.describe(datetime_is_numeric=True)
+        result = s.describe()
         expected = Series(
             [
                 5,
@@ -115,32 +115,32 @@ def test_describe_with_tz(self, tz_naive_fixture):
         )
         tm.assert_series_equal(result, expected)
 
-    def test_describe_with_tz_warns(self):
+    def test_describe_with_tz_numeric(self):
         name = tz = "CET"
         start = Timestamp(2018, 1, 1)
         end = Timestamp(2018, 1, 5)
         s = Series(date_range(start, end, tz=tz), name=name)
 
-        with tm.assert_produces_warning(FutureWarning):
-            result = s.describe()
+        result = s.describe()
 
         expected = Series(
             [
                 5,
-                5,
-                s.value_counts().index[0],
-                1,
-                start.tz_localize(tz),
-                end.tz_localize(tz),
+                Timestamp("2018-01-03 00:00:00", tz=tz),
+                Timestamp("2018-01-01 00:00:00", tz=tz),
+                Timestamp("2018-01-02 00:00:00", tz=tz),
+                Timestamp("2018-01-03 00:00:00", tz=tz),
+                Timestamp("2018-01-04 00:00:00", tz=tz),
+                Timestamp("2018-01-05 00:00:00", tz=tz),
             ],
             name=name,
-            index=["count", "unique", "top", "freq", "first", "last"],
+            index=["count", "mean", "min", "25%", "50%", "75%", "max"],
         )
         tm.assert_series_equal(result, expected)
 
     def test_datetime_is_numeric_includes_datetime(self):
         s = Series(date_range("2012", periods=3))
-        result = s.describe(datetime_is_numeric=True)
+        result = s.describe()
         expected = Series(
             [
                 3,