BUG: incorrect rounding in groupby.cummin near int64 implementation bounds #40767


Merged
10 commits merged on Apr 14, 2021
doc/source/whatsnew/v1.3.0.rst (1 addition, 1 deletion)

@@ -791,7 +791,7 @@ Groupby/resample/rolling
- Bug in aggregation functions for :class:`DataFrame` not respecting ``numeric_only`` argument when ``level`` keyword was given (:issue:`40660`)
- Bug in :class:`core.window.RollingGroupby` where ``as_index=False`` argument in ``groupby`` was ignored (:issue:`39433`)
- Bug in :meth:`.GroupBy.any` and :meth:`.GroupBy.all` raising ``ValueError`` when using with nullable type columns holding ``NA`` even with ``skipna=True`` (:issue:`40585`)

- Bug in :meth:`GroupBy.cummin` and :meth:`GroupBy.cummax` incorrectly rounding integer values near the ``int64`` implementation bounds (:issue:`40767`)

Reshaping
^^^^^^^^^
pandas/_libs/groupby.pyx (26 additions, 6 deletions)

@@ -1139,6 +1139,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
ndarray[groupby_t, ndim=2] values,
const intp_t[:] labels,
Py_ssize_t min_count=-1,
bint is_datetimelike=False,
bint compute_max=True):
"""
Compute minimum/maximum of columns of `values`, in row groups `labels`.
@@ -1156,6 +1157,8 @@ cdef group_min_max(groupby_t[:, ::1] out,
min_count : Py_ssize_t, default -1
The minimum number of non-NA group elements, NA result if threshold
is not met
is_datetimelike : bool
True if `values` contains datetime-like entries.
compute_max : bint, default True
True to compute group-wise max, False to compute min

@@ -1204,8 +1207,7 @@ cdef group_min_max(groupby_t[:, ::1] out,
for j in range(K):
val = values[i, j]

if not _treat_as_na(val, True):
# TODO: Sure we always want is_datetimelike=True?
if not _treat_as_na(val, is_datetimelike):
nobs[lab, j] += 1
if compute_max:
if val > group_min_or_max[lab, j]:
@@ -1237,9 +1239,18 @@ def group_max(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const intp_t[:] labels,
Py_ssize_t min_count=-1) -> None:
Py_ssize_t min_count=-1,
bint is_datetimelike=False) -> None:
"""See group_min_max.__doc__"""
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=True)
group_min_max(
out,
counts,
values,
labels,
min_count=min_count,
is_datetimelike=is_datetimelike,
compute_max=True,
)


@cython.wraparound(False)
@@ -1248,9 +1259,18 @@ def group_min(groupby_t[:, ::1] out,
int64_t[::1] counts,
ndarray[groupby_t, ndim=2] values,
const intp_t[:] labels,
Py_ssize_t min_count=-1) -> None:
Py_ssize_t min_count=-1,
bint is_datetimelike=False) -> None:
"""See group_min_max.__doc__"""
group_min_max(out, counts, values, labels, min_count=min_count, compute_max=False)
group_min_max(
out,
counts,
values,
labels,
min_count=min_count,
is_datetimelike=is_datetimelike,
compute_max=False,
)


@cython.boundscheck(False)
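The hunks above switch the NA check in group_min_max from a hard-coded is_datetimelike=True to the new keyword. A rough standalone Python sketch (illustrative names, not the actual Cython helper) of what _treat_as_na does for int64 values, and why the flag matters:

import numpy as np

iNaT = np.iinfo(np.int64).min  # the sentinel that datetime-like code uses for NaT

def treat_as_na_int64(val, is_datetimelike):
    # For datetime-like data, iNaT marks a missing value and must be skipped;
    # for plain int64 data it is just a very small, perfectly valid number.
    # Hard-coding is_datetimelike=True made group_min_max silently drop
    # legitimate values equal to iNaT.
    return is_datetimelike and val == iNaT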
pandas/core/groupby/ops.py (27 additions, 17 deletions)

@@ -20,7 +20,6 @@

from pandas._libs import (
NaT,
iNaT,
lib,
)
import pandas._libs.groupby as libgroupby
@@ -663,12 +662,7 @@ def _cython_operation(
elif is_bool_dtype(dtype):
values = ensure_int_or_float(values)
elif is_integer_dtype(dtype):
# we use iNaT for the missing value on ints
# so pre-convert to guard this condition
if (values == iNaT).any():
values = ensure_float64(values)
else:
values = ensure_int_or_float(values)
Review thread on this change:

Member:

    I think these iNaT checks are necessary, because algos like group_min_max
    assume that it is impossible for an integer array to contain iNaT (they
    assume datetimelike=True, so setting missing values to NPY_NAT is considered
    safe).

    e.g. on this branch:

        ser = pd.Series([1, iNaT])
        print(ser.groupby([1, 1]).max(min_count=2))

    gives

        1   -9223372036854775808
        dtype: int64

    because the iNaT is interpreted as NaN, so min_count isn't reached. (Then the
    NaN is set with iNaT, but not interpreted as NaN anymore.)

    Forcing mask usage (#40651 (comment)) would be another way to handle this more
    robustly, since to specify missing values the mask itself can just be modified
    in place, precluding the need for the existing iNaT workarounds that signify
    missing values in the algo result.

Member Author:

    > because it assumes datetimelike=True

    Isn't this the underlying problem in this example?

Member:

    I think the reason datetimelike isn't used in group_min_max (and some other
    groupby algos) is the additional problem that there is no way to encode a
    missing value in an integer (not datetimelike) array.

    So the current logic casts integer to float if iNaT is present, so that iNaT
    can be used to signify missing values for integer data (regardless of
    datetimelike status).

Member Author:

    > there is no way to encode a missing value in an integer (not datetimelike) array

    I think we don't need to encode it, because we have counts == 0 (or
    counts < min_count) to convey that information.

Contributor:

    If we use masks we could avoid this awkwardness, but that will need some
    refactoring.

Member Author:

    It's not too hard to edit group_min_max to handle this example (will push in a
    bit). We can improve matters a bit by using counts < min_count as a mask more
    consistently, but that won't do anything about int64 -> float64 conversions
    being lossy. A full solution to that would be one of: a) explicitly be OK with
    the lossiness, b) detect when the conversion would be lossy and cast to either
    object or float128, or c) return an IntegerArray.

values = ensure_int_or_float(values)
elif is_numeric:
if not is_complex_dtype(dtype):
values = ensure_float64(values)
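The int64 -> float64 lossiness raised in the review thread above can be reproduced outside of groupby entirely; a minimal standalone check (not part of this diff):

import numpy as np

INT64_MIN = np.iinfo(np.int64).min  # -9223372036854775808, the same value as iNaT

# float64 has a 53-bit significand, so integers of this magnitude are not
# representable exactly: INT64_MIN + 1 rounds to INT64_MIN when cast.
assert np.float64(INT64_MIN + 1) == np.float64(INT64_MIN)
assert int(np.float64(INT64_MIN + 1)) == INT64_MIN

This is the rounding the whatsnew entry refers to, and why options b) and c) above come up.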
@@ -686,20 +680,36 @@
result = maybe_fill(np.empty(out_shape, dtype=out_dtype))
if kind == "aggregate":
counts = np.zeros(ngroups, dtype=np.int64)
func(result, counts, values, comp_ids, min_count)
if how in ["min", "max"]:
func(
result,
counts,
values,
comp_ids,
min_count,
is_datetimelike=is_datetimelike,
)
else:
func(result, counts, values, comp_ids, min_count)
elif kind == "transform":
# TODO: min_count
func(result, values, comp_ids, ngroups, is_datetimelike, **kwargs)

if is_integer_dtype(result.dtype) and not is_datetimelike:
mask = result == iNaT
if mask.any():
result = result.astype("float64")
result[mask] = np.nan

if kind == "aggregate" and self._filter_empty_groups and not counts.all():
assert result.ndim != 2
result = result[counts > 0]
if kind == "aggregate":
# i.e. counts is defined. Locations where count<min_count
# need to have the result set to np.nan, which may require casting,
# see GH#40767
if is_integer_dtype(result.dtype) and not is_datetimelike:
cutoff = max(1, min_count)
empty_groups = counts < cutoff
if empty_groups.any():
Review thread on this check:

Contributor:

    Why is this check needed?

Member Author:

    If empty_groups.any(), we need to mask those locations, which requires
    casting to float64.

Contributor:

    This behavior is really surprising, though likely not hit in practice. We
    should move aggressively to return Int dtypes here. Yes, this is a breaking
    change, but it fixes these kinds of value-dependent behavior.

Member Author:

    > we should move aggressively to return Int dtypes here

    We'd need 2D support for Int dtypes for me to consider getting on board with
    this, xref #38992.

# Note: this conversion could be lossy, see GH#40767
result = result.astype("float64")
result[empty_groups] = np.nan

if self._filter_empty_groups and not counts.all():
assert result.ndim != 2
result = result[counts > 0]

result = result.T

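As the second review thread notes, whether the aggregated result keeps its int64 dtype now depends on the data: any group falling below min_count forces a cast so that slot can hold NaN. A small illustration of that value-dependent behavior (expected output shown as comments, assuming the behavior introduced by this change):

import pandas as pd

ser = pd.Series([1, 2, 3], dtype="int64")
grouper = [1, 1, 2]

# Every group meets min_count, so the int64 dtype is preserved.
print(ser.groupby(grouper).min(min_count=1).dtype)  # int64

# Group 2 has only one element, so its slot must become NaN and the
# whole result is cast to float64.
print(ser.groupby(grouper).min(min_count=2).dtype)  # float64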
pandas/tests/groupby/test_function.py (37 additions, 2 deletions)

@@ -4,6 +4,7 @@
import numpy as np
import pytest

from pandas._libs.tslibs import iNaT
from pandas.errors import UnsupportedFunctionCall

import pandas as pd
@@ -591,6 +592,38 @@ def test_max_nan_bug():
assert not r["File"].isna().any()


def test_max_inat():
# GH#40767 dont interpret iNaT as NaN
ser = Series([1, iNaT])
gb = ser.groupby([1, 1])

result = gb.max(min_count=2)
expected = Series({1: 1}, dtype=np.int64)
tm.assert_series_equal(result, expected, check_exact=True)

result = gb.min(min_count=2)
expected = Series({1: iNaT}, dtype=np.int64)
tm.assert_series_equal(result, expected, check_exact=True)

# not enough entries -> gets masked to NaN
result = gb.min(min_count=3)
expected = Series({1: np.nan})
tm.assert_series_equal(result, expected, check_exact=True)


def test_max_inat_not_all_na():
# GH#40767 dont interpret iNaT as NaN

# make sure we dont round iNaT+1 to iNaT
ser = Series([1, iNaT, 2, iNaT + 1])
gb = ser.groupby([1, 2, 3, 3])
result = gb.min(min_count=2)

# Note: in converting to float64, the iNaT + 1 maps to iNaT, i.e. is lossy
expected = Series({1: np.nan, 2: np.nan, 3: iNaT + 1})
tm.assert_series_equal(result, expected, check_exact=True)


def test_nlargest():
a = Series([1, 3, 5, 7, 2, 9, 0, 4, 6, 10])
b = Series(list("a" * 5 + "b" * 5))
@@ -708,11 +741,13 @@ def test_cummin(numpy_dtypes_for_minmax):

# Test w/ min value for dtype
df.loc[[2, 6], "B"] = min_val
df.loc[[1, 5], "B"] = min_val + 1
expected.loc[[2, 3, 6, 7], "B"] = min_val
expected.loc[[1, 5], "B"] = min_val + 1 # should not be rounded to min_val
result = df.groupby("A").cummin()
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected, check_exact=True)
expected = df.groupby("A").B.apply(lambda x: x.cummin()).to_frame()
tm.assert_frame_equal(result, expected)
tm.assert_frame_equal(result, expected, check_exact=True)

# Test nan in some values
base_df.loc[[0, 2, 4, 6], "B"] = np.nan