diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index 778169b0dbeb4..aeb9d476a0a87 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -284,6 +284,7 @@ Other enhancements - Added support for ``dt`` accessor methods when using :class:`ArrowDtype` with a ``pyarrow.timestamp`` type (:issue:`50954`) - :func:`read_sas` now supports using ``encoding='infer'`` to correctly read and use the encoding specified by the sas file. (:issue:`48048`) - :meth:`.DataFrameGroupBy.quantile`, :meth:`.SeriesGroupBy.quantile` and :meth:`.DataFrameGroupBy.std` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`) +- :meth:`.DataFrameGroupBy.std`, :meth:`.SeriesGroupBy.std` now support datetime64, timedelta64, and :class:`DatetimeTZDtype` dtypes (:issue:`48481`) - :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`) - :func:`.testing.assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`) - Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`) diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 09f4fbec5176e..e3ca9c44d5664 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -85,6 +85,7 @@ def group_var( ddof: int = ..., # int64_t mask: np.ndarray | None = ..., result_mask: np.ndarray | None = ..., + is_datetimelike: bool = ..., ) -> None: ... 
def group_mean( out: np.ndarray, # floating[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index dd2bdadce31c5..0c378acbc6dc3 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -818,6 +818,7 @@ def group_var( int64_t ddof=1, const uint8_t[:, ::1] mask=None, uint8_t[:, ::1] result_mask=None, + bint is_datetimelike=False, ) -> None: cdef: Py_ssize_t i, j, N, K, lab, ncounts = len(counts) @@ -852,8 +853,13 @@ def group_var( if uses_mask: isna_entry = mask[i, j] + elif is_datetimelike: + # With group_var, we cannot just use _treat_as_na bc + # datetimelike dtypes get cast to float64 instead of + # to int64. + isna_entry = val == NPY_NAT else: - isna_entry = _treat_as_na(val, False) + isna_entry = _treat_as_na(val, is_datetimelike) if not isna_entry: nobs[lab, j] += 1 diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index e42566bfa11a0..810bf27ebf788 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -30,6 +30,7 @@ class providing the base-class of operations. cast, final, ) +import warnings import numpy as np @@ -97,8 +98,10 @@ class providing the base-class of operations. 
BaseMaskedArray, BooleanArray, Categorical, + DatetimeArray, ExtensionArray, FloatingArray, + TimedeltaArray, ) from pandas.core.base import ( PandasObject, @@ -3724,7 +3727,10 @@ def blk_func(values: ArrayLike) -> ArrayLike: counts = np.zeros(ngroups, dtype=np.int64) func = partial(func, counts=counts) + is_datetimelike = values.dtype.kind in ["m", "M"] vals = values + if is_datetimelike and how == "std": + vals = vals.view("i8") if pre_processing: vals, inferences = pre_processing(vals) @@ -3747,7 +3753,11 @@ def blk_func(values: ArrayLike) -> ArrayLike: result_mask = np.zeros(result.shape, dtype=np.bool_) func = partial(func, result_mask=result_mask) - func(**kwargs) # Call func to modify result in place + # Call func to modify result in place + if how == "std": + func(**kwargs, is_datetimelike=is_datetimelike) + else: + func(**kwargs) if values.ndim == 1: assert result.shape[1] == 1, result.shape @@ -3761,6 +3771,15 @@ def blk_func(values: ArrayLike) -> ArrayLike: result = post_processing(result, inferences, **pp_kwargs) + if how == "std" and is_datetimelike: + values = cast("DatetimeArray | TimedeltaArray", values) + unit = values.unit + with warnings.catch_warnings(): + # suppress "RuntimeWarning: invalid value encountered in cast" + warnings.filterwarnings("ignore") + result = result.astype(np.int64, copy=False) + result = result.view(f"m8[{unit}]") + return result.T # Operate block-wise instead of column-by-column diff --git a/pandas/tests/groupby/test_groupby.py b/pandas/tests/groupby/test_groupby.py index e175f6dda980f..a0b129b65d293 100644 --- a/pandas/tests/groupby/test_groupby.py +++ b/pandas/tests/groupby/test_groupby.py @@ -37,6 +37,33 @@ def test_repr(): assert result == expected +def test_groupby_std_datetimelike(): + # GH#48481 + tdi = pd.timedelta_range("1 Day", periods=10000) + ser = Series(tdi) + ser[::5] *= 2 # get different std for different groups + + df = ser.to_frame("A") + + df["B"] = ser + Timestamp(0) + df["C"] = ser + Timestamp(0, 
tz="UTC") + df.iloc[-1] = pd.NaT # last group includes NaTs + + gb = df.groupby(list(range(5)) * 2000) + + result = gb.std() + + # Note: this does not _exactly_ match what we would get if we did + # [gb.get_group(i).std() for i in gb.groups] + # but it _does_ match the floating point error we get doing the + # same operation on int64 data xref GH#51332 + td1 = Timedelta("2887 days 11:21:02.326710176") + td4 = Timedelta("2886 days 00:42:34.664668096") + exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5)) + expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser}) + tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"]) def test_basic(dtype): diff --git a/pandas/tests/groupby/test_raises.py b/pandas/tests/groupby/test_raises.py index 6ceb23a3c44b6..76ba4c974b3fd 100644 --- a/pandas/tests/groupby/test_raises.py +++ b/pandas/tests/groupby/test_raises.py @@ -224,11 +224,11 @@ def test_groupby_raises_datetime(how, by, groupby_series, groupby_func): "prod": (TypeError, "datetime64 type does not support prod"), "quantile": (None, ""), "rank": (None, ""), - "sem": (TypeError, "Cannot cast DatetimeArray to dtype float64"), + "sem": (None, ""), "shift": (None, ""), "size": (None, ""), "skew": (TypeError, r"dtype datetime64\[ns\] does not support reduction"), - "std": (TypeError, "Cannot cast DatetimeArray to dtype float64"), + "std": (None, ""), "sum": (TypeError, "datetime64 type does not support sum operations"), "var": (None, ""), }[groupby_func] diff --git a/pandas/tests/resample/test_resample_api.py b/pandas/tests/resample/test_resample_api.py index 0b8dc8f3e8ac4..1e54a4c03f4fc 100644 --- a/pandas/tests/resample/test_resample_api.py +++ b/pandas/tests/resample/test_resample_api.py @@ -405,12 +405,16 @@ def test_agg(): expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1) expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]]) for t in cases: - # In case 2, 
"date" is an index and a column, so agg still tries to agg + # In case 2, "date" is an index and a column, so gets included in the agg if t == cases[2]: - # .var on dt64 column raises - msg = "Cannot cast DatetimeArray to dtype float64" - with pytest.raises(TypeError, match=msg): - t.aggregate([np.mean, np.std]) + date_mean = t["date"].mean() + date_std = t["date"].std() + exp = pd.concat([date_mean, date_std, expected], axis=1) + exp.columns = pd.MultiIndex.from_product( + [["date", "A", "B"], ["mean", "std"]] + ) + result = t.aggregate([np.mean, np.std]) + tm.assert_frame_equal(result, exp) else: result = t.aggregate([np.mean, np.std]) tm.assert_frame_equal(result, expected)