Skip to content

BUG: Groupby median on timedelta column with NaT returns odd value (#… #57957

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Mar 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/source/whatsnew/v3.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,7 @@ Performance improvements
Bug fixes
~~~~~~~~~
- Fixed bug in :class:`SparseDtype` for equal comparison with na fill value. (:issue:`54770`)
- Fixed bug in :meth:`.DataFrameGroupBy.median` where nat values gave an incorrect result. (:issue:`57926`)
- Fixed bug in :meth:`DataFrame.join` inconsistently setting result index name (:issue:`55815`)
- Fixed bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
- Fixed bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`)
Expand Down
1 change: 1 addition & 0 deletions pandas/_libs/groupby.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ def group_median_float64(
min_count: int = ..., # Py_ssize_t
mask: np.ndarray | None = ...,
result_mask: np.ndarray | None = ...,
is_datetimelike: bool = ..., # bint
) -> None: ...
def group_cumprod(
out: np.ndarray, # float64_t[:, ::1]
Expand Down
34 changes: 25 additions & 9 deletions pandas/_libs/groupby.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,11 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n
return result


cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:
cdef float64_t median_linear(
float64_t* a,
int n,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI these should be Py_ssize_t not int. int is limited to 2 ** 31 - 1 on most platforms, so this would cause undefined behavior if ever called with an n that exceeded that (probably unlikely within groupby...but still possible)

Happy to have that in a follow up PR

bint is_datetimelike=False
) noexcept nogil:
cdef:
int i, j, na_count = 0
float64_t* tmp
Expand All @@ -111,9 +115,14 @@ cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:
return NaN

# count NAs
for i in range(n):
if a[i] != a[i]:
na_count += 1
if is_datetimelike:
for i in range(n):
if a[i] == NPY_NAT:
na_count += 1
else:
for i in range(n):
if a[i] != a[i]:
na_count += 1

if na_count:
if na_count == n:
Expand All @@ -124,10 +133,16 @@ cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:
raise MemoryError()

j = 0
for i in range(n):
if a[i] == a[i]:
tmp[j] = a[i]
j += 1
if is_datetimelike:
for i in range(n):
if a[i] != NPY_NAT:
tmp[j] = a[i]
j += 1
else:
for i in range(n):
if a[i] == a[i]:
tmp[j] = a[i]
j += 1

a = tmp
n -= na_count
Expand Down Expand Up @@ -170,6 +185,7 @@ def group_median_float64(
Py_ssize_t min_count=-1,
const uint8_t[:, :] mask=None,
uint8_t[:, ::1] result_mask=None,
bint is_datetimelike=False,
) -> None:
"""
Only aggregates on axis=0
Expand Down Expand Up @@ -228,7 +244,7 @@ def group_median_float64(
ptr += _counts[0]
for j in range(ngroups):
size = _counts[j + 1]
out[j, i] = median_linear(ptr, size)
out[j, i] = median_linear(ptr, size, is_datetimelike)
ptr += size


Expand Down
3 changes: 2 additions & 1 deletion pandas/core/groupby/ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -415,6 +415,7 @@ def _call_cython_op(
"last",
"first",
"sum",
"median",
]:
func(
out=result,
Expand All @@ -427,7 +428,7 @@ def _call_cython_op(
is_datetimelike=is_datetimelike,
**kwargs,
)
elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]:
elif self.how in ["sem", "std", "var", "ohlc", "prod"]:
if self.how in ["std", "sem"]:
kwargs["is_datetimelike"] = is_datetimelike
func(
Expand Down
9 changes: 9 additions & 0 deletions pandas/tests/groupby/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,15 @@ def test_len_nan_group():
assert len(df.groupby(["a", "b"])) == 0


def test_groupby_timedelta_median():
# issue 57926
expected = Series(data=Timedelta("1d"), index=["foo"])
df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1d")]})
gb = df.groupby("label")["timedelta"]
actual = gb.median()
tm.assert_series_equal(actual, expected, check_names=False)


@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
def test_len_categorical(dropna, observed, keys):
# GH#57595
Expand Down
Loading