From 05bd2245250a1708c366a0507eb16de8899afb4b Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Jun 2020 08:45:19 -0500 Subject: [PATCH 1/7] API: Make describe changes backwards compatible Adds the new behavior as a feature flag / deprecation. Closes https://github.com/pandas-dev/pandas/issues/33903 --- doc/source/whatsnew/v1.1.0.rst | 3 +- pandas/core/generic.py | 45 ++++++++++++++++++-- pandas/tests/frame/methods/test_describe.py | 35 ++++++++++++++- pandas/tests/series/methods/test_describe.py | 18 +++++++- 4 files changed, 93 insertions(+), 8 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f7e36de059e84..dd551a1ab39ce 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -375,8 +375,6 @@ Development Changes Other API changes ^^^^^^^^^^^^^^^^^ -- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` - will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) - Added :meth:`DataFrame.value_counts` (:issue:`5377`) - :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) - ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) @@ -741,6 +739,7 @@ Deprecations - Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`) +- :meth:`Series.describe` and :meth:`DataFrame.describe` treating datetime dtypes as categorical rather than numeric is deprecated. Specify ``datetime_is_numeric=True`` to show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`, :issue:`33903`) - :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`) - Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`) - :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9dcdcaca2f689..009849df1b7ce 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9792,7 +9792,11 @@ def abs(self: FrameOrSeries) -> FrameOrSeries: return np.abs(self) def describe( - self: FrameOrSeries, percentiles=None, include=None, exclude=None + self: FrameOrSeries, + percentiles=None, + include=None, + exclude=None, + datetime_is_numeric=False, ) -> FrameOrSeries: """ Generate descriptive statistics. @@ -9838,6 +9842,10 @@ def describe( ``select_dtypes`` (e.g. ``df.describe(include=['O'])``). To exclude pandas categorical columns, use ``'category'`` - None (default) : The result will exclude nothing. + datetime_is_numeric : bool, default False + Whether to treat datetime dtypes as numeric. + + .. versionadded:: 1.1.0 Returns ------- @@ -10073,8 +10081,37 @@ def describe_categorical_1d(data): dtype = None if result[1] > 0: top, freq = objcounts.index[0], objcounts.iloc[0] - names += ["top", "freq"] - result += [top, freq] + if is_datetime64_any_dtype(data.dtype): + if self.ndim == 1: + stacklevel = 4 + else: + stacklevel = 5 + warnings.warn( + "Treating datetime data as categorical rather than numeric in " + "`.describe` is deprecated and will be removed in a future " + "version of pandas. Specify `datetime_is_numeric=True` to " + "silence this warning and adopt the future behavior now.", + FutureWarning, + stacklevel=stacklevel, + ) + tz = data.dt.tz + asint = data.dropna().values.view("i8") + top = Timestamp(top) + if top.tzinfo is not None and tz is not None: + # Don't tz_localize(None) if key is already tz-aware + top = top.tz_convert(tz) + else: + top = top.tz_localize(tz) + names += ["top", "freq", "first", "last"] + result += [ + top, + freq, + Timestamp(asint.min(), tz=tz), + Timestamp(asint.max(), tz=tz), + ] + else: + names += ["top", "freq"] + result += [top, freq] # If the DataFrame is empty, set 'top' and 'freq' to None # to maintain output shape consistency @@ -10100,7 +10137,7 @@ def describe_1d(data): return describe_categorical_1d(data) elif is_numeric_dtype(data): return describe_numeric_1d(data) - elif is_datetime64_any_dtype(data.dtype): + elif is_datetime64_any_dtype(data.dtype) and datetime_is_numeric: return describe_timestamp_1d(data) elif is_timedelta64_dtype(data.dtype): return describe_numeric_1d(data) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index b61d0d28e2fba..3c8b7100a5e07 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -267,7 +267,40 @@ def test_describe_tz_values(self, tz_naive_fixture): }, index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"], ) - result = df.describe(include="all") + result = df.describe(include="all", datetime_is_numeric=True) + tm.assert_frame_equal(result, expected) + + s1_ = s1.describe() + s2_ = pd.Series( + [ + 5, + 5, + s2.value_counts().index[0], + 1, + start.tz_localize(tz), + end.tz_localize(tz), + ], + index=["count", "unique", "top", "freq", "first", "last"], + ) + idx = [ + "count", + "unique", + "top", + "freq", + "first", + "last", + "mean", + "std", + "min", + "25%", + "50%", + "75%", + "max", + ] + expected = pd.concat([s1_, s2_], axis=1, keys=["s1", "s2"]).loc[idx] + + with tm.assert_produces_warning(FutureWarning): + result = df.describe(include="all") tm.assert_frame_equal(result, expected) def test_describe_percentiles_integer_idx(self): diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index 4e59c6995f4f2..39c0a33a9d002 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -83,7 +83,7 @@ def test_describe_with_tz(self, tz_naive_fixture): start = Timestamp(2018, 1, 1) end = Timestamp(2018, 1, 5) s = Series(date_range(start, end, tz=tz), name=name) - result = s.describe() + result = s.describe(datetime_is_numeric=True) expected = Series( [ 5, @@ -98,3 +98,19 @@ def test_describe_with_tz(self, tz_naive_fixture): index=["count", "mean", "min", "25%", "50%", "75%", "max"], ) tm.assert_series_equal(result, expected) + + with tm.assert_produces_warning(FutureWarning): + result = s.describe() + expected = Series( + [ + 5, + 5, + s.value_counts().index[0], + 1, + start.tz_localize(tz), + end.tz_localize(tz), + ], + name=name, + index=["count", "unique", "top", "freq", "first", "last"], + ) + tm.assert_series_equal(result, expected) From 27e9768ecc61f4f7186ef4908bdd0930a61869bf Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Mon, 15 Jun 2020 09:16:50 -0500 Subject: [PATCH 2/7] doctest --- pandas/core/generic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 009849df1b7ce..bf5fcd52b78d6 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9923,7 +9923,7 @@ def describe( ... np.datetime64("2010-01-01"), ... np.datetime64("2010-01-01") ... ]) - >>> s.describe() + >>> s.describe(datetime_is_numeric=True) count 3 mean 2006-09-01 08:00:00 min 2000-01-01 00:00:00 From 53cfee8e24a56ad1cc0b3f2fa253fd36101b1875 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 7 Jul 2020 09:09:50 -0500 Subject: [PATCH 3/7] whatsnew --- doc/source/whatsnew/v1.1.0.rst | 2 -- 1 file changed, 2 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index a2b1ca3cd87c2..d072404f26eb8 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -668,8 +668,6 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once Other API changes ^^^^^^^^^^^^^^^^^ -- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` - will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) - Added :meth:`DataFrame.value_counts` (:issue:`5377`) - :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) - ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) From 65862806f73481c226561d615a22eb9e2b1f494c Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 7 Jul 2020 09:12:59 -0500 Subject: [PATCH 4/7] fixups --- pandas/tests/frame/methods/test_describe.py | 8 ++++++++ pandas/tests/series/methods/test_describe.py | 7 +++++++ 2 files changed, 15 insertions(+) diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index 3c8b7100a5e07..cfa9f9295d133 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -270,6 +270,14 @@ def test_describe_tz_values(self, tz_naive_fixture): result = df.describe(include="all", datetime_is_numeric=True) tm.assert_frame_equal(result, expected) + def test_describe_tz_values2(self): + tz = "CET" + s1 = Series(range(5)) + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s2 = Series(date_range(start, end, tz=tz)) + df = pd.DataFrame({"s1": s1, "s2": s2}) + s1_ = s1.describe() s2_ = pd.Series( [ diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index 39c0a33a9d002..7997cd2f638fa 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -99,8 +99,15 @@ def test_describe_with_tz(self, tz_naive_fixture): ) tm.assert_series_equal(result, expected) + def test_describe_with_tz_warns(self): + name = tz = "CET" + start = Timestamp(2018, 1, 1) + end = Timestamp(2018, 1, 5) + s = Series(date_range(start, end, tz=tz), name=name) + with tm.assert_produces_warning(FutureWarning): result = s.describe() + expected = Series( [ 5, From 0222e760041a67e8af0fb31f5d9008d545999f43 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jul 2020 07:48:56 -0500 Subject: [PATCH 5/7] fixup --- pandas/core/generic.py | 5 ++++- pandas/tests/frame/methods/test_describe.py | 21 ++++++++++++++++++++ pandas/tests/series/methods/test_describe.py | 17 ++++++++++++++++ 3 files changed, 42 insertions(+), 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3686d3992be80..db6b04c71a412 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -10067,7 +10067,10 @@ def describe_1d(data): return describe_1d(self) elif (include is None) and (exclude is None): # when some numerics are found, keep only numerics - data = self.select_dtypes(include=[np.number]) + default_include = [np.number] + if datetime_is_numeric: + default_include.append("datetime") + data = self.select_dtypes(include=default_include) if len(data.columns) == 0: data = self elif include == "all": diff --git a/pandas/tests/frame/methods/test_describe.py b/pandas/tests/frame/methods/test_describe.py index cfa9f9295d133..0b70bead375da 100644 --- a/pandas/tests/frame/methods/test_describe.py +++ b/pandas/tests/frame/methods/test_describe.py @@ -270,6 +270,27 @@ def test_describe_tz_values(self, tz_naive_fixture): result = df.describe(include="all", datetime_is_numeric=True) tm.assert_frame_equal(result, expected) + def test_datetime_is_numeric_includes_datetime(self): + df = pd.DataFrame({"a": pd.date_range("2012", periods=3), "b": [1, 2, 3]}) + result = df.describe(datetime_is_numeric=True) + expected = pd.DataFrame( + { + "a": [ + 3, + pd.Timestamp("2012-01-02"), + pd.Timestamp("2012-01-01"), + pd.Timestamp("2012-01-01T12:00:00"), + pd.Timestamp("2012-01-02"), + pd.Timestamp("2012-01-02T12:00:00"), + pd.Timestamp("2012-01-03"), + np.nan, + ], + "b": [3, 2, 1, 1.5, 2, 2.5, 3, 1], + }, + index=["count", "mean", "min", "25%", "50%", "75%", "max", "std"], + ) + tm.assert_frame_equal(result, expected) + def test_describe_tz_values2(self): tz = "CET" s1 = Series(range(5)) diff --git a/pandas/tests/series/methods/test_describe.py b/pandas/tests/series/methods/test_describe.py index 7997cd2f638fa..a15dc0751aa7d 100644 --- a/pandas/tests/series/methods/test_describe.py +++ b/pandas/tests/series/methods/test_describe.py @@ -121,3 +121,20 @@ def test_describe_with_tz_warns(self): index=["count", "unique", "top", "freq", "first", "last"], ) tm.assert_series_equal(result, expected) + + def test_datetime_is_numeric_includes_datetime(self): + s = Series(date_range("2012", periods=3)) + result = s.describe(datetime_is_numeric=True) + expected = Series( + [ + 3, + Timestamp("2012-01-02"), + Timestamp("2012-01-01"), + Timestamp("2012-01-01T12:00:00"), + Timestamp("2012-01-02"), + Timestamp("2012-01-02T12:00:00"), + Timestamp("2012-01-03"), + ], + index=["count", "mean", "min", "25%", "50%", "75%", "max"], + ) + tm.assert_series_equal(result, expected) From 166f6f460a9995ed9c2b9ed9d0ce7a13df770ee8 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jul 2020 14:10:30 -0500 Subject: [PATCH 6/7] doc fixup --- doc/source/whatsnew/v1.1.0.rst | 39 +--------------------------------- pandas/core/generic.py | 5 ++++- 2 files changed, 5 insertions(+), 39 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index f2afedc8be619..cfac916157649 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -280,6 +280,7 @@ Other enhancements - Added :meth:`DataFrame.value_counts` (:issue:`5377`) - Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. - Added a :func:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`) +- :meth:`~DataFrame.describe` now includes a ``datetime_is_numeric`` keyword to control how datetime columns are summarized (:issue:`30164`, :issue:`34798`) - :class:`Styler` may now render CSS more efficiently where multiple cells have the same styling (:issue:`30876`) - :meth:`Styler.highlight_null` now accepts ``subset`` argument (:issue:`31345`) - When writing directly to a sqlite connection :func:`to_sql` now supports the ``multi`` method (:issue:`29921`) @@ -675,43 +676,6 @@ apply and applymap on ``DataFrame`` evaluates first row/column only once df.apply(func, axis=1) -.. _whatsnew_110.api.other: - -Other API changes -^^^^^^^^^^^^^^^^^ - -- Added :meth:`DataFrame.value_counts` (:issue:`5377`) -- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`) -- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`) -- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`) -- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations. -- Added a :func:`pandas.api.indexers.VariableOffsetWindowIndexer` class to support ``rolling`` operations with non-fixed offsets (:issue:`34994`) -- Added :class:`pandas.errors.InvalidIndexError` (:issue:`34570`). -- :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`. - Previously an ``AttributeError`` was raised (:issue:`31126`) -- :meth:`DataFrame.xs` now raises a ``TypeError`` if a ``level`` keyword is supplied and the axis is not a :class:`MultiIndex`. - Previously an ``AttributeError`` was raised (:issue:`33610`) -- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`) - now raise a ``TypeError`` if a not-accepted keyword argument is passed into it. - Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`) -- :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`) -- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`) -- Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`) -- Combining a ``Categorical`` with integer categories and which contains missing values - with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append` - will now result in a float column instead of an object dtyped column (:issue:`33607`) -- :meth:`Series.to_timestamp` now raises a ``TypeError`` if the axis is not a :class:`PeriodIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) -- :meth:`Series.to_period` now raises a ``TypeError`` if the axis is not a :class:`DatetimeIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`) -- :func: `pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string. -- :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in keyword ``chunksize`` now raises a ``TypeError`` - (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`) -- :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`) -- :meth:`Series.interpolate` and :meth:`DataFrame.interpolate` now raises ValueError if ``limit_direction`` is 'forward' or 'both' and ``method`` is 'backfill' or 'bfill' or ``limit_direction`` is 'backward' or 'both' and ``method`` is 'pad' or 'ffill' (:issue:`34746`) -- The :class:`DataFrame` constructor no longer accepts a list of ``DataFrame`` objects. Because of changes to NumPy, ``DataFrame`` objects are now consistently treated as 2D objects, so a list of ``DataFrames`` is considered 3D, and no longer acceptible for the ``DataFrame`` constructor (:issue:`32289`). -- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` - will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`) - - Increased minimum versions for dependencies ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -795,7 +759,6 @@ Deprecations - Lookups on a :class:`Series` with a single-item list containing a slice (e.g. ``ser[[slice(0, 4)]]``) are deprecated, will raise in a future version. Either convert the list to tuple, or pass the slice directly instead (:issue:`31333`) -- :meth:`Series.describe` and :meth:`DataFrame.describe` treating datetime dtypes as categorical rather than numeric is deprecated. Specify ``datetime_is_numeric=True`` to show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last`` will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`, :issue:`33903`) - :meth:`DataFrame.mean` and :meth:`DataFrame.median` with ``numeric_only=None`` will include datetime64 and datetime64tz columns in a future version (:issue:`29941`) - Setting values with ``.loc`` using a positional slice is deprecated and will raise in a future version. Use ``.loc`` with labels or ``.iloc`` with positions instead (:issue:`31840`) - :meth:`DataFrame.to_dict` has deprecated accepting short names for ``orient`` in future versions (:issue:`32515`) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 200d7935b66ad..a27e9500fa586 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9762,7 +9762,10 @@ def describe( exclude pandas categorical columns, use ``'category'`` - None (default) : The result will exclude nothing. datetime_is_numeric : bool, default False - Whether to treat datetime dtypes as numeric. + Whether to treat datetime dtypes as numeric. This affects statistics + calculated for the column. For DataFrame input, this also + controls whether datetime columns are included by default. + .. versionadded:: 1.1.0 From 122e2f58fb351b4ffe87ebdf5541ce1692f12ce6 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 14 Jul 2020 14:40:49 -0500 Subject: [PATCH 7/7] newline --- pandas/core/generic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a27e9500fa586..eb55369d83593 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -9766,7 +9766,6 @@ def describe( calculated for the column. For DataFrame input, this also controls whether datetime columns are included by default. - .. versionadded:: 1.1.0 Returns