From dd4c6fc0418501b5248d1008742b0ebb3afcb76f Mon Sep 17 00:00:00 2001
From: Brock
Date: Sat, 11 Feb 2023 13:03:01 -0800
Subject: [PATCH 1/6] ENH: support td64/dt64 in GroupBy.std

---
 doc/source/whatsnew/v2.0.0.rst             |  1 +
 pandas/_libs/groupby.pyi                   |  1 +
 pandas/_libs/groupby.pyx                   |  8 ++++++-
 pandas/core/groupby/groupby.py             | 14 ++++++++++-
 pandas/tests/groupby/test_groupby.py       | 27 ++++++++++++++++++++++
 pandas/tests/groupby/test_raises.py        |  4 ++--
 pandas/tests/resample/test_resample_api.py | 14 +++++++----
 7 files changed, 60 insertions(+), 9 deletions(-)

diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst
index 29f360e050548..62762338fa19a 100644
--- a/doc/source/whatsnew/v2.0.0.rst
+++ b/doc/source/whatsnew/v2.0.0.rst
@@ -284,6 +284,7 @@ Other enhancements
 - Added support for ``dt`` accessor methods when using :class:`ArrowDtype` with a ``pyarrow.timestamp`` type (:issue:`50954`)
 - :func:`read_sas` now supports using ``encoding='infer'`` to correctly read and use the encoding specified by the sas file. (:issue:`48048`)
 - :meth:`.DataFrameGroupBy.quantile`, :meth:`.SeriesGroupBy.quantile` and :meth:`.DataFrameGroupBy.std` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
+- :meth:`.DataFrameGroupBy.std`, :meth:`.SeriesGroupBy.std` now support datetime64, timedelta64, and :class:`DatetimeTZDtype` dtypes (:issue:`48481`)
 - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
 - :func:`.testing.assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
 - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi
index 09f4fbec5176e..e3ca9c44d5664 100644
--- a/pandas/_libs/groupby.pyi
+++ b/pandas/_libs/groupby.pyi
@@ -85,6 +85,7 @@ def group_var(
     ddof: int = ...,  # int64_t
     mask: np.ndarray | None = ...,
     result_mask: np.ndarray | None = ...,
+    is_datetimelike: bool = ...,
 ) -> None: ...
 def group_mean(
     out: np.ndarray,  # floating[:, ::1]
diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx
index dd2bdadce31c5..0c378acbc6dc3 100644
--- a/pandas/_libs/groupby.pyx
+++ b/pandas/_libs/groupby.pyx
@@ -818,6 +818,7 @@ def group_var(
     int64_t ddof=1,
     const uint8_t[:, ::1] mask=None,
     uint8_t[:, ::1] result_mask=None,
+    bint is_datetimelike=False,
 ) -> None:
     cdef:
         Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -852,8 +853,13 @@ def group_var(

                 if uses_mask:
                     isna_entry = mask[i, j]
+                elif is_datetimelike:
+                    # With group_var, we cannot just use _treat_as_na bc
+                    # datetimelike dtypes get cast to float64 instead of
+                    # to int64.
+                    isna_entry = val == NPY_NAT
                 else:
-                    isna_entry = _treat_as_na(val, False)
+                    isna_entry = _treat_as_na(val, is_datetimelike)

                 if not isna_entry:
                     nobs[lab, j] += 1
diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index e42566bfa11a0..5ba5d20e4a74b 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -3724,7 +3724,10 @@ def blk_func(values: ArrayLike) -> ArrayLike:
                 counts = np.zeros(ngroups, dtype=np.int64)
                 func = partial(func, counts=counts)

+            is_datetimelike = values.dtype.kind in ["m", "M"]
             vals = values
+            if is_datetimelike and how == "std":
+                vals = vals.view("i8")
             if pre_processing:
                 vals, inferences = pre_processing(vals)

@@ -3747,7 +3750,11 @@ def blk_func(values: ArrayLike) -> ArrayLike:
                 result_mask = np.zeros(result.shape, dtype=np.bool_)
                 func = partial(func, result_mask=result_mask)

-            func(**kwargs)  # Call func to modify result in place
+            # Call func to modify result in place
+            if how == "std":
+                func(**kwargs, is_datetimelike=is_datetimelike)
+            else:
+                func(**kwargs)

             if values.ndim == 1:
                 assert result.shape[1] == 1, result.shape
@@ -3761,6 +3768,11 @@ def blk_func(values: ArrayLike) -> ArrayLike:

             result = post_processing(result, inferences, **pp_kwargs)

+            if how == "std" and is_datetimelike:
+                unit = values.unit
+                result = result.astype(np.int64, copy=False)
+                result = result.view(f"m8[{unit}]")
+
             return result.T

         # Operate block-wise instead of column-by-column
diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index e175f6dda980f..024966c215704 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -37,6 +37,33 @@ def test_repr():
     assert result == expected


+def test_groupby_std_datetimelike():
+    # GH#48481
+    tdi = pd.timedelta_range("1 Day", periods=10000)
+    ser = Series(tdi)
+    ser[::5] *= 2  # get different std for different groups
+
+    df = ser.to_frame("A")
+
+    df["B"] = ser + Timestamp(0)
+    df["C"] = ser + Timestamp(0, tz="UTC")
+    df.iloc[-1] = pd.NaT  # last group includes NaTs
+
+    gb = df.groupby(list(range(5)) * 2000)
+
+    result = gb.std()
+
+    # Note: this does not _exactly_ match what we would get if we did
+    #  [gb.get_group(i).std() for i in gb.groups]
+    #  but it _does_ match the floating point error we get doing the
+    #  same operation on int64 data xref GH#51332
+    td1 = Timedelta("2887 days 11:21:02.326710176")
+    td4 = Timedelta("2886 days 00:42:34.664668096")
+    exp_ser = Series([td1 * 2, td1, td1, td1, td4])
+    expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser})
+    tm.assert_frame_equal(result, expected)
+
+
 @pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
 def test_basic(dtype):

diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py
index 6ceb23a3c44b6..76ba4c974b3fd 100644
--- a/pandas/tests/groupby/test_raises.py
+++ b/pandas/tests/groupby/test_raises.py
@@ -224,11 +224,11 @@ def test_groupby_raises_datetime(how, by, groupby_series, groupby_func):
         "prod": (TypeError, "datetime64 type does not support prod"),
         "quantile": (None, ""),
         "rank": (None, ""),
-        "sem": (TypeError, "Cannot cast DatetimeArray to dtype float64"),
+        "sem": (None, ""),
         "shift": (None, ""),
         "size": (None, ""),
         "skew": (TypeError, r"dtype datetime64\[ns\] does not support reduction"),
-        "std": (TypeError, "Cannot cast DatetimeArray to dtype float64"),
+        "std": (None, ""),
         "sum": (TypeError, "datetime64 type does not support sum operations"),
         "var": (None, ""),
     }[groupby_func]
diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py
index e6e924793389d..19fc849c2e388 100644
--- a/pandas/tests/resample/test_resample_api.py
+++ b/pandas/tests/resample/test_resample_api.py
@@ -405,12 +405,16 @@ def test_agg():
     expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
     expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]])
     for t in cases:
-        # In case 2, "date" is an index and a column, so agg still tries to agg
+        # In case 2, "date" is an index and a column, so it gets included in the agg
         if t == cases[2]:
-            # .var on dt64 column raises
-            msg = "Cannot cast DatetimeArray to dtype float64"
-            with pytest.raises(TypeError, match=msg):
-                t.aggregate([np.mean, np.std])
+            date_mean = t["date"].mean()
+            date_std = t["date"].std()
+            exp = pd.concat([date_mean, date_std, expected], axis=1)
+            exp.columns = pd.MultiIndex.from_product(
+                [["date", "A", "B"], ["mean", "std"]]
+            )
+            result = t.aggregate([np.mean, np.std])
+            tm.assert_frame_equal(result, exp)
         else:
             result = t.aggregate([np.mean, np.std])
             tm.assert_frame_equal(result, expected)

From dfd0be80366676f9164f0fbf4c54b68ba447279b Mon Sep 17 00:00:00 2001
From: Brock
Date: Sat, 11 Feb 2023 17:09:13 -0800
Subject: [PATCH 2/6] troubleshoot 32bit builds

---
 pandas/tests/groupby/test_groupby.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 024966c215704..acaad1df46c7c 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -59,7 +59,7 @@ def test_groupby_std_datetimelike():
     #  same operation on int64 data xref GH#51332
     td1 = Timedelta("2887 days 11:21:02.326710176")
     td4 = Timedelta("2886 days 00:42:34.664668096")
-    exp_ser = Series([td1 * 2, td1, td1, td1, td4])
+    exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=[0, 1, 2, 3])
     expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser})
     tm.assert_frame_equal(result, expected)


From 6eb54810aed5f8cabb201c5eab5e57d872444a9f Mon Sep 17 00:00:00 2001
From: Brock
Date: Sun, 12 Feb 2023 07:39:07 -0800
Subject: [PATCH 3/6] fix test

---
 pandas/tests/groupby/test_groupby.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index acaad1df46c7c..94cf0c0adcf64 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -59,7 +59,7 @@ def test_groupby_std_datetimelike():
     #  same operation on int64 data xref GH#51332
     td1 = Timedelta("2887 days 11:21:02.326710176")
     td4 = Timedelta("2886 days 00:42:34.664668096")
-    exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=[0, 1, 2, 3])
+    exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=[0, 1, 2, 3, 4])
     expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser})
     tm.assert_frame_equal(result, expected)


From aba342ee4df5cb598c12552c4241c33fd94a3b5b Mon Sep 17 00:00:00 2001
From: Brock
Date: Sun, 12 Feb 2023 09:04:39 -0800
Subject: [PATCH 4/6] troubleshoot 32bit builds

---
 pandas/tests/groupby/test_groupby.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py
index 94cf0c0adcf64..a0b129b65d293 100644
--- a/pandas/tests/groupby/test_groupby.py
+++ b/pandas/tests/groupby/test_groupby.py
@@ -59,7 +59,7 @@ def test_groupby_std_datetimelike():
     #  same operation on int64 data xref GH#51332
     td1 = Timedelta("2887 days 11:21:02.326710176")
td4 = Timedelta("2886 days 00:42:34.664668096") - exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=[0, 1, 2, 3, 4]) + exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5)) expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser}) tm.assert_frame_equal(result, expected) From 0992a8678110826fa6026dd66561e30f4c1cd703 Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 12 Feb 2023 12:18:39 -0800 Subject: [PATCH 5/6] mypy fixup --- pandas/core/groupby/groupby.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 5ba5d20e4a74b..499d586b0be46 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -97,8 +97,10 @@ class providing the base-class of operations. BaseMaskedArray, BooleanArray, Categorical, + DatetimeArray, ExtensionArray, FloatingArray, + TimedeltaArray, ) from pandas.core.base import ( PandasObject, @@ -3769,6 +3771,7 @@ def blk_func(values: ArrayLike) -> ArrayLike: result = post_processing(result, inferences, **pp_kwargs) if how == "std" and is_datetimelike: + values = cast("DatetimeArray | TimedeltaArray", values) unit = values.unit result = result.astype(np.int64, copy=False) result = result.view(f"m8[{unit}]") From a4d23f353b977828fe20e011be618e6ab5311b4e Mon Sep 17 00:00:00 2001 From: Brock Date: Sun, 12 Feb 2023 14:34:51 -0800 Subject: [PATCH 6/6] troubleshoot CI --- pandas/core/groupby/groupby.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 499d586b0be46..810bf27ebf788 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -30,6 +30,7 @@ class providing the base-class of operations. cast, final, ) +import warnings import numpy as np @@ -3773,7 +3774,10 @@ def blk_func(values: ArrayLike) -> ArrayLike: if how == "std" and is_datetimelike: values = cast("DatetimeArray | TimedeltaArray", values) unit = values.unit - result = result.astype(np.int64, copy=False) + with warnings.catch_warnings(): + # suppress "RuntimeWarning: invalid value encountered in cast" + warnings.filterwarnings("ignore") + result = result.astype(np.int64, copy=False) result = result.view(f"m8[{unit}]") return result.T