ENH: support td64/dt64 in GroupBy.std #51333

Merged: 8 commits, Feb 13, 2023
1 change: 1 addition & 0 deletions doc/source/whatsnew/v2.0.0.rst
@@ -284,6 +284,7 @@ Other enhancements
- Added support for ``dt`` accessor methods when using :class:`ArrowDtype` with a ``pyarrow.timestamp`` type (:issue:`50954`)
- :func:`read_sas` now supports using ``encoding='infer'`` to correctly read and use the encoding specified by the sas file. (:issue:`48048`)
- :meth:`.DataFrameGroupBy.quantile`, :meth:`.SeriesGroupBy.quantile` and :meth:`.DataFrameGroupBy.std` now preserve nullable dtypes instead of casting to numpy dtypes (:issue:`37493`)
- :meth:`.DataFrameGroupBy.std`, :meth:`.SeriesGroupBy.std` now support datetime64, timedelta64, and :class:`DatetimeTZDtype` dtypes (:issue:`48481`)
- :meth:`Series.add_suffix`, :meth:`DataFrame.add_suffix`, :meth:`Series.add_prefix` and :meth:`DataFrame.add_prefix` support an ``axis`` argument. If ``axis`` is set, the default behaviour of which axis to consider can be overwritten (:issue:`47819`)
- :func:`.testing.assert_frame_equal` now shows the first element where the DataFrames differ, analogously to ``pytest``'s output (:issue:`47910`)
- Added ``index`` parameter to :meth:`DataFrame.to_dict` (:issue:`46398`)
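
A minimal usage sketch of the :meth:`.DataFrameGroupBy.std` entry above; the frame, column names, and values are illustrative and not part of the PR:

import pandas as pd

# GroupBy.std on datetime64/timedelta64 columns now returns timedelta64
# results instead of raising "Cannot cast DatetimeArray to dtype float64".
df = pd.DataFrame(
    {
        "key": [1, 1, 2, 2],
        "ts": pd.to_datetime(["2023-01-01", "2023-01-05", "2023-02-01", "2023-02-02"]),
        "td": pd.to_timedelta(["1 day", "3 days", "2 days", "6 days"]),
    }
)
result = df.groupby("key").std()
print(result.dtypes)  # both "ts" and "td" come back as timedelta64[ns]
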
1 change: 1 addition & 0 deletions pandas/_libs/groupby.pyi
@@ -85,6 +85,7 @@ def group_var(
ddof: int = ..., # int64_t
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
is_datetimelike: bool = ...,
) -> None: ...
def group_mean(
out: np.ndarray, # floating[:, ::1]
8 changes: 7 additions & 1 deletion pandas/_libs/groupby.pyx
@@ -818,6 +818,7 @@ def group_var(
int64_t ddof=1,
const uint8_t[:, ::1] mask=None,
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
) -> None:
cdef:
Py_ssize_t i, j, N, K, lab, ncounts = len(counts)
@@ -852,8 +853,13 @@

if uses_mask:
isna_entry = mask[i, j]
elif is_datetimelike:
# With group_var, we cannot just use _treat_as_na bc
# datetimelike dtypes get cast to float64 instead of
# to int64.
isna_entry = val == NPY_NAT
else:
isna_entry = _treat_as_na(val, False)
isna_entry = _treat_as_na(val, is_datetimelike)

if not isna_entry:
nobs[lab, j] += 1
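
The inline comment above says _treat_as_na cannot be used directly because group_var receives the datetimelike values already cast to float64. A small standalone illustration of that reasoning, in plain NumPy and assuming NPY_NAT is the usual int64 NaT sentinel:

import numpy as np

# NPY_NAT is the most negative int64 value; NaT entries carry this sentinel.
NPY_NAT = np.iinfo(np.int64).min

# Inside group_var the values have been cast to float64, so a NaT entry
# arrives as float(-2**63) rather than as NaN.
val = np.float64(NPY_NAT)

print(np.isnan(val))   # False: the float-path NaN check would not flag NaT
print(val == NPY_NAT)  # True: -2**63 is exactly representable in float64,
                       # so the explicit `val == NPY_NAT` check still works
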
21 changes: 20 additions & 1 deletion pandas/core/groupby/groupby.py
@@ -30,6 +30,7 @@ class providing the base-class of operations.
cast,
final,
)
import warnings

import numpy as np

@@ -97,8 +98,10 @@ class providing the base-class of operations.
BaseMaskedArray,
BooleanArray,
Categorical,
DatetimeArray,
ExtensionArray,
FloatingArray,
TimedeltaArray,
)
from pandas.core.base import (
PandasObject,
@@ -3724,7 +3727,10 @@ def blk_func(values: ArrayLike) -> ArrayLike:
counts = np.zeros(ngroups, dtype=np.int64)
func = partial(func, counts=counts)

is_datetimelike = values.dtype.kind in ["m", "M"]
vals = values
if is_datetimelike and how == "std":
vals = vals.view("i8")
if pre_processing:
vals, inferences = pre_processing(vals)

@@ -3747,7 +3753,11 @@ def blk_func(values: ArrayLike) -> ArrayLike:
result_mask = np.zeros(result.shape, dtype=np.bool_)
func = partial(func, result_mask=result_mask)

func(**kwargs) # Call func to modify result in place
# Call func to modify result in place
if how == "std":
func(**kwargs, is_datetimelike=is_datetimelike)
else:
func(**kwargs)

if values.ndim == 1:
assert result.shape[1] == 1, result.shape
@@ -3761,6 +3771,15 @@ def blk_func(values: ArrayLike) -> ArrayLike:

result = post_processing(result, inferences, **pp_kwargs)

if how == "std" and is_datetimelike:
values = cast("DatetimeArray | TimedeltaArray", values)
unit = values.unit
with warnings.catch_warnings():
# suppress "RuntimeWarning: invalid value encountered in cast"
warnings.filterwarnings("ignore")
result = result.astype(np.int64, copy=False)
result = result.view(f"m8[{unit}]")

return result.T

# Operate block-wise instead of column-by-column
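
Outside the groupby machinery, the dtype round-trip that blk_func now performs for "std" on datetimelike input looks roughly like the sketch below. This is simplified: the real path runs the group_var kernel per group, reads values.unit so non-nanosecond data keeps its resolution, and suppresses the cast warning for NaN results.

import numpy as np

values = np.array(["2023-01-01", "2023-01-03", "2023-01-09"], dtype="M8[ns]")

ints = values.view("i8")                  # reinterpret the datetimes as int64 nanoseconds
std_float = np.array([ints.std(ddof=1)])  # the kernel produces float64 output
std_td = std_float.astype(np.int64).view("m8[ns]")  # cast back to int64, then view
print(std_td)                                       # as timedelta64 in the same unit
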
27 changes: 27 additions & 0 deletions pandas/tests/groupby/test_groupby.py
@@ -37,6 +37,33 @@ def test_repr():
assert result == expected


def test_groupby_std_datetimelike():
# GH#48481
tdi = pd.timedelta_range("1 Day", periods=10000)
ser = Series(tdi)
ser[::5] *= 2 # get different std for different groups

df = ser.to_frame("A")

df["B"] = ser + Timestamp(0)
df["C"] = ser + Timestamp(0, tz="UTC")
df.iloc[-1] = pd.NaT # last group includes NaTs

gb = df.groupby(list(range(5)) * 2000)

result = gb.std()

# Note: this does not _exactly_ match what we would get if we did
# [gb.get_group(i).std() for i in gb.groups]
# but it _does_ match the floating point error we get doing the
# same operation on int64 data xref GH#51332
td1 = Timedelta("2887 days 11:21:02.326710176")
td4 = Timedelta("2886 days 00:42:34.664668096")
exp_ser = Series([td1 * 2, td1, td1, td1, td4], index=np.arange(5))
expected = DataFrame({"A": exp_ser, "B": exp_ser, "C": exp_ser})
tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("dtype", ["int64", "int32", "float64", "float32"])
def test_basic(dtype):

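
The comment inside test_groupby_std_datetimelike notes that the grouped result does not exactly match per-group DataFrame.std because of floating-point error (xref GH#51332). A smaller, self-contained version of that cross-check, with illustrative sizes rather than the test's 10000 rows:

import pandas as pd

tdi = pd.timedelta_range("1 Day", periods=100)
ser = pd.Series(tdi)
ser[::5] *= 2  # different std per group, as in the test
df = ser.to_frame("A")
df["B"] = ser + pd.Timestamp(0)

gb = df.groupby(list(range(5)) * 20)

kernel = gb.std()  # grouped cython kernel
per_group = pd.DataFrame({k: gb.get_group(k).std() for k in gb.groups}).T

# Any differences are tiny floating-point artifacts of the int64-based
# computation, on the order of nanoseconds.
print((kernel - per_group).abs().max())
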
4 changes: 2 additions & 2 deletions pandas/tests/groupby/test_raises.py
@@ -224,11 +224,11 @@ def test_groupby_raises_datetime(how, by, groupby_series, groupby_func):
"prod": (TypeError, "datetime64 type does not support prod"),
"quantile": (None, ""),
"rank": (None, ""),
"sem": (TypeError, "Cannot cast DatetimeArray to dtype float64"),
"sem": (None, ""),
"shift": (None, ""),
"size": (None, ""),
"skew": (TypeError, r"dtype datetime64\[ns\] does not support reduction"),
"std": (TypeError, "Cannot cast DatetimeArray to dtype float64"),
"std": (None, ""),
"sum": (TypeError, "datetime64 type does not support sum operations"),
"var": (None, ""),
}[groupby_func]
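
The two expectations relaxed above (sem and std on datetime groups) can be exercised directly; a short illustrative check, with made-up data rather than the test fixtures:

import pandas as pd

df = pd.DataFrame(
    {
        "key": [1, 1, 2, 2],
        "when": pd.to_datetime(
            ["2023-01-01", "2023-01-03", "2023-02-01", "2023-02-09"]
        ),
    }
)
gb = df.groupby("key")["when"]

# Neither call raises "Cannot cast DatetimeArray to dtype float64" anymore.
print(gb.std())  # timedelta64[ns] values
print(gb.sem())  # sem is std scaled by 1/sqrt(n), so also a timedelta result
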
14 changes: 9 additions & 5 deletions pandas/tests/resample/test_resample_api.py
@@ -405,12 +405,16 @@ def test_agg():
expected = pd.concat([a_mean, a_std, b_mean, b_std], axis=1)
expected.columns = pd.MultiIndex.from_product([["A", "B"], ["mean", "std"]])
for t in cases:
# In case 2, "date" is an index and a column, so agg still tries to agg
# In case 2, "date" is an index and a column, so get included in the agg
if t == cases[2]:
# .var on dt64 column raises
msg = "Cannot cast DatetimeArray to dtype float64"
with pytest.raises(TypeError, match=msg):
t.aggregate([np.mean, np.std])
date_mean = t["date"].mean()
date_std = t["date"].std()
exp = pd.concat([date_mean, date_std, expected], axis=1)
exp.columns = pd.MultiIndex.from_product(
[["date", "A", "B"], ["mean", "std"]]
)
result = t.aggregate([np.mean, np.std])
tm.assert_frame_equal(result, exp)
else:
result = t.aggregate([np.mean, np.std])
tm.assert_frame_equal(result, expected)
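
A standalone sketch of what the updated expectation exercises: aggregating mean and std over a frame that contains a datetime "date" column now succeeds, with the ("date", "std") column coming back as timedelta64. The index, frequency, and data below are illustrative, not the test's fixture:

import numpy as np
import pandas as pd

idx = pd.date_range("2005-01-01", periods=10, freq="D")
df = pd.DataFrame({"A": np.arange(10.0), "date": idx}, index=idx)

result = df.resample("5D").aggregate([np.mean, np.std])
# Previously the dt64 "date" column made this raise TypeError; now the
# ("date", "std") column holds timedelta64 values.
print(result[("date", "std")])
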