BUG: GroupBy.cumsum td64, skipna=False #46216

Merged · 1 commit · Mar 4, 2022
3 changes: 3 additions & 0 deletions doc/source/whatsnew/v1.5.0.rst
@@ -429,6 +429,9 @@ Groupby/resample/rolling
 - Bug in :meth:`.ExponentialMovingWindow.mean` with ``axis=1`` and ``engine='numba'`` when the :class:`DataFrame` has more columns than rows (:issue:`46086`)
 - Bug when using ``engine="numba"`` would return the same jitted function when modifying ``engine_kwargs`` (:issue:`46086`)
 - Bug in :meth:`.DataFrameGroupby.transform` fails when ``axis=1`` and ``func`` is ``"first"`` or ``"last"`` (:issue:`45986`)
+- Bug in :meth:`DataFrameGroupby.cumsum` with ``skipna=False`` giving incorrect results (:issue:`??`)
[Inline review comment, Member Author]: forgot to update these, will add this to next "assorted" PR

+- Bug in :meth:`.GroupBy.cumsum` with ``timedelta64[ns]`` dtype failing to recognize ``NaT`` as a null value (:issue:`??`)
 -

Reshaping
^^^^^^^^^
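For context, a minimal reproduction of the two bugs these whatsnew entries describe. This snippet is illustrative, not part of the PR; it assumes a pandas build where ``GroupBy.cumsum`` accepts the ``skipna`` and ``numeric_only`` keywords, as the tests below do.

import numpy as np
import pandas as pd

# A timedelta64[ns] column with a missing value in the middle.
ser = pd.Series(pd.to_timedelta([0, 1, np.nan, 3, 4], unit="D"))
df = pd.DataFrame({"A": 1, "B": ser})

# Before this fix, the int64 NaT sentinel was summed like an ordinary
# integer; with it, NaT is recognized as null and handled per skipna.
print(df.groupby("A").cumsum(numeric_only=False, skipna=True))
print(df.groupby("A").cumsum(numeric_only=False, skipna=False))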
60 changes: 39 additions & 21 deletions pandas/_libs/groupby.pyx
@@ -43,6 +43,7 @@ from pandas._libs.algos import (
 from pandas._libs.dtypes cimport (
     iu_64_floating_obj_t,
     iu_64_floating_t,
+    numeric_object_t,
     numeric_t,
 )
 from pandas._libs.missing cimport checknull
@@ -211,7 +212,7 @@ def group_cumsum(
     ndarray[numeric_t, ndim=2] values,
     const intp_t[::1] labels,
     int ngroups,
-    is_datetimelike,
+    bint is_datetimelike,
     bint skipna=True,
 ) -> None:
     """
@@ -238,14 +239,23 @@
"""
cdef:
Py_ssize_t i, j, N, K, size
numeric_t val, y, t
numeric_t val, y, t, na_val
numeric_t[:, ::1] accum, compensation
intp_t lab
bint isna_entry, isna_prev = False

N, K = (<object>values).shape
accum = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)
compensation = np.zeros((ngroups, K), dtype=np.asarray(values).dtype)

if numeric_t == float32_t or numeric_t == float64_t:
na_val = NaN
elif numeric_t is int64_t and is_datetimelike:
na_val = NPY_NAT
else:
# Will not be used, but define to avoid unitialized warning.
na_val = 0

with nogil:
for i in range(N):
lab = labels[i]
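The ``na_val`` setup added above runs once per compiled specialization: it picks the sentinel that represents "missing" for the dtype being instantiated. A rough NumPy-level analogue, with a hypothetical helper name:

import numpy as np

def pick_na_val(dtype, is_datetimelike):
    # Floats use NaN; datetime-like data is int64 under the hood and
    # uses the NPY_NAT sentinel; other integers have no NA value, so
    # the returned value is a never-read placeholder.
    dtype = np.dtype(dtype)
    if dtype.kind == "f":
        return np.nan
    if dtype == np.dtype("int64") and is_datetimelike:
        return np.iinfo(np.int64).min  # NPY_NAT
    return 0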
@@ -255,22 +265,30 @@
             for j in range(K):
                 val = values[i, j]

-                # For floats, use Kahan summation to reduce floating-point
-                # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
-                if numeric_t == float32_t or numeric_t == float64_t:
-                    if val == val:
+                isna_entry = _treat_as_na(val, is_datetimelike)
+
+                if not skipna:
+                    isna_prev = _treat_as_na(accum[lab, j], is_datetimelike)
+                    if isna_prev:
+                        out[i, j] = na_val
+                        continue
+
+                if isna_entry:
+                    out[i, j] = na_val
+                    if not skipna:
+                        accum[lab, j] = na_val
+
+                else:
+                    # For floats, use Kahan summation to reduce floating-point
+                    # error (https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
+                    if numeric_t == float32_t or numeric_t == float64_t:
                         y = val - compensation[lab, j]
                         t = accum[lab, j] + y
                         compensation[lab, j] = t - accum[lab, j] - y
-                        accum[lab, j] = t
-                        out[i, j] = t
-                    else:
-                        out[i, j] = NaN
-                        if not skipna:
-                            accum[lab, j] = NaN
-                            break
-                else:
-                    t = val + accum[lab, j]
+                    else:
+                        t = val + accum[lab, j]
+
                     accum[lab, j] = t
                     out[i, j] = t
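The new control flow above: under ``skipna=False``, once a group's accumulator has gone null, every later row of that group outputs ``na_val``; a null entry itself writes ``na_val`` and poisons the accumulator. A pure-Python sketch of that logic for a single float column (hypothetical function name, not part of the PR):

import numpy as np

def group_cumsum_sketch(values, labels, ngroups, na_val=np.nan, skipna=True):
    out = np.empty(len(values))
    accum = np.zeros(ngroups)

    def isna(x):
        return x != x  # NaN self-inequality; the real code also checks NPY_NAT

    for i, (lab, val) in enumerate(zip(labels, values)):
        if lab < 0:
            continue  # row belongs to no group
        if not skipna and isna(accum[lab]):
            out[i] = na_val  # group already poisoned: keep emitting NA
            continue
        if isna(val):
            out[i] = na_val
            if not skipna:
                accum[lab] = na_val  # poison the running sum
        else:
            accum[lab] += val
            out[i] = accum[lab]
    return out

For example, ``group_cumsum_sketch([1.0, np.nan, 2.0], [0, 0, 0], 1, skipna=False)`` gives ``[1.0, nan, nan]``, matching the new test below, while ``skipna=True`` gives ``[1.0, nan, 3.0]``.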

@@ -962,19 +980,19 @@ def group_quantile(
# group_nth, group_last, group_rank
# ----------------------------------------------------------------------

-cdef inline bint _treat_as_na(iu_64_floating_obj_t val, bint is_datetimelike) nogil:
-    if iu_64_floating_obj_t is object:
+cdef inline bint _treat_as_na(numeric_object_t val, bint is_datetimelike) nogil:
+    if numeric_object_t is object:
         # Should never be used, but we need to avoid the `val != val` below
         # or else cython will raise about gil acquisition.
         raise NotImplementedError

-    elif iu_64_floating_obj_t is int64_t:
+    elif numeric_object_t is int64_t:
         return is_datetimelike and val == NPY_NAT
-    elif iu_64_floating_obj_t is uint64_t:
-        # There is no NA value for uint64
-        return False
-    else:
+    elif numeric_object_t is float32_t or numeric_object_t is float64_t:
         return val != val
+    else:
+        # non-datetimelike integer
+        return False


# TODO(cython3): GH#31710 use memoryviews once cython 0.30 is released so we can
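The refactor above widens ``_treat_as_na`` from ``iu_64_floating_obj_t`` to the broader ``numeric_object_t`` fused type, so every integer specialization now hits the explicit "no NA unless datetimelike int64" branch instead of falling through to the float check. A plain-Python analogue of the dispatch (illustrative only; the Cython version resolves these branches at compile time):

import numpy as np

NPY_NAT = np.iinfo(np.int64).min  # the int64 sentinel backing NaT

def treat_as_na(val, is_datetimelike):
    if isinstance(val, float):
        return val != val  # only NaN is unequal to itself
    if isinstance(val, (int, np.integer)):
        # Datetime/timedelta data is int64 under the hood, with NaT
        # stored as NPY_NAT; plain integers have no NA value.
        return is_datetimelike and val == NPY_NAT
    raise NotImplementedError("object dtype is handled elsewhere")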
34 changes: 34 additions & 0 deletions pandas/tests/groupby/test_groupby.py
@@ -2468,6 +2468,40 @@ def test_groupby_numerical_stability_cumsum():
     tm.assert_frame_equal(result, expected, check_exact=True)
+
+
+def test_groupby_cumsum_skipna_false():
+    # don't propagate np.nan above the diagonal
+    arr = np.random.randn(5, 5)
+    df = DataFrame(arr)
+    for i in range(5):
+        df.iloc[i, i] = np.nan
+
+    df["A"] = 1
+    gb = df.groupby("A")
+
+    res = gb.cumsum(skipna=False)
+
+    expected = df[[0, 1, 2, 3, 4]].cumsum(skipna=False)
+    tm.assert_frame_equal(res, expected)
+
+
+def test_groupby_cumsum_timedelta64():
+    # don't ignore is_datetimelike in libgroupby.group_cumsum
+    dti = date_range("2016-01-01", periods=5)
+    ser = Series(dti) - dti[0]
+    ser[2] = pd.NaT
+
+    df = DataFrame({"A": 1, "B": ser})
+    gb = df.groupby("A")
+
+    res = gb.cumsum(numeric_only=False, skipna=True)
+    exp = DataFrame({"B": [ser[0], ser[1], pd.NaT, ser[4], ser[4] * 2]})
+    tm.assert_frame_equal(res, exp)
+
+    res = gb.cumsum(numeric_only=False, skipna=False)
+    exp = DataFrame({"B": [ser[0], ser[1], pd.NaT, pd.NaT, pd.NaT]})
+    tm.assert_frame_equal(res, exp)


 def test_groupby_mean_duplicate_index(rand_series_with_duplicate_datetimeindex):
     dups = rand_series_with_duplicate_datetimeindex
     result = dups.groupby(level=0).mean()
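A note on the timedelta test's expected values, which look cryptic at first: with ``ser`` equal to ``[0 days, 1 day, NaT, 3 days, 4 days]``, the ``skipna=True`` running sums after the NaT happen to equal ``ser[4]`` (0 + 1 + 3 = 4 days) and ``ser[4] * 2`` (4 + 4 = 8 days). A quick sanity check outside groupby, using plain ``Series.cumsum``:

import pandas as pd

dti = pd.date_range("2016-01-01", periods=5)
ser = pd.Series(dti) - dti[0]  # 0, 1, 2, 3, 4 days
ser[2] = pd.NaT                # knock out the middle value

print(ser.cumsum(skipna=True))   # 0d, 1d, NaT, 4d, 8d
print(ser.cumsum(skipna=False))  # 0d, 1d, NaT, NaT, NaT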