Skip to content

Commit 44d56aa

Browse files
committed
BUG: Groupby median on timedelta column with NaT returns odd value (#57926)
Handle NaT correctly in group_median_float64
1 parent 8704cfa commit 44d56aa

File tree

5 files changed

+28
-6
lines changed

5 files changed

+28
-6
lines changed

doc/source/whatsnew/v3.0.0.rst

+1
Original file line number | Diff line number | Diff line change
@@ -296,6 +296,7 @@ Bug fixes
296296
- Fixed bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
297297
- Fixed bug in :meth:`DataFrame.update` bool dtype being converted to object (:issue:`55509`)
298298
- Fixed bug in :meth:`DataFrameGroupBy.apply` that was returning a completely empty DataFrame when all return values of ``func`` were ``None`` instead of returning an empty DataFrame with the original columns and dtypes. (:issue:`57775`)
299+
- Fixed bug in :meth:`GroupBy.median` where ``NaT`` values gave an incorrect result. (:issue:`57926`)
299300
- Fixed bug in :meth:`Series.diff` allowing non-integer values for the ``periods`` argument. (:issue:`56607`)
300301
- Fixed bug in :meth:`Series.rank` that doesn't preserve missing values for nullable integers when ``na_option='keep'``. (:issue:`56976`)
301302
- Fixed bug in :meth:`Series.replace` and :meth:`DataFrame.replace` inconsistently replacing matching instances when ``regex=True`` and missing values are present. (:issue:`56599`)

pandas/_libs/groupby.pyi

+1
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ def group_median_float64(
1212
min_count: int = ..., # Py_ssize_t
1313
mask: np.ndarray | None = ...,
1414
result_mask: np.ndarray | None = ...,
15+
is_datetimelike: bool = ..., # bint
1516
) -> None: ...
1617
def group_cumprod(
1718
out: np.ndarray, # float64_t[:, ::1]

pandas/_libs/groupby.pyx

+17 -5
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,11 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n
101101
return result
102102

103103

104-
cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:
104+
cdef float64_t median_linear(
105+
float64_t* a,
106+
int n,
107+
bint is_datetimelike=False
108+
) noexcept nogil:
105109
cdef:
106110
int i, j, na_count = 0
107111
float64_t* tmp
@@ -111,9 +115,14 @@ cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:
111115
return NaN
112116

113117
# count NAs
114-
for i in range(n):
115-
if a[i] != a[i]:
116-
na_count += 1
118+
if is_datetimelike:
119+
for i in range(n):
120+
if a[i] == NPY_NAT:
121+
na_count += 1
122+
else:
123+
for i in range(n):
124+
if a[i] != a[i]:
125+
na_count += 1
117126

118127
if na_count:
119128
if na_count == n:
@@ -125,6 +134,8 @@ cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:
125134

126135
j = 0
127136
for i in range(n):
137+
if is_datetimelike and a[i] == NPY_NAT:
138+
continue
128139
if a[i] == a[i]:
129140
tmp[j] = a[i]
130141
j += 1
@@ -170,6 +181,7 @@ def group_median_float64(
170181
Py_ssize_t min_count=-1,
171182
const uint8_t[:, :] mask=None,
172183
uint8_t[:, ::1] result_mask=None,
184+
bint is_datetimelike=False,
173185
) -> None:
174186
"""
175187
Only aggregates on axis=0
@@ -228,7 +240,7 @@ def group_median_float64(
228240
ptr += _counts[0]
229241
for j in range(ngroups):
230242
size = _counts[j + 1]
231-
out[j, i] = median_linear(ptr, size)
243+
out[j, i] = median_linear(ptr, size, is_datetimelike)
232244
ptr += size
233245

234246

pandas/core/groupby/ops.py

+2 -1
Original file line number | Diff line number | Diff line change
@@ -415,6 +415,7 @@ def _call_cython_op(
415415
"last",
416416
"first",
417417
"sum",
418+
"median",
418419
]:
419420
func(
420421
out=result,
@@ -427,7 +428,7 @@ def _call_cython_op(
427428
is_datetimelike=is_datetimelike,
428429
**kwargs,
429430
)
430-
elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]:
431+
elif self.how in ["sem", "std", "var", "ohlc", "prod"]:
431432
if self.how in ["std", "sem"]:
432433
kwargs["is_datetimelike"] = is_datetimelike
433434
func(

pandas/tests/groupby/test_groupby.py

+7
Original file line number | Diff line number | Diff line change
@@ -145,6 +145,13 @@ def test_len_nan_group():
145145
assert len(df.groupby(["a", "b"])) == 0
146146

147147

148+
def test_groupby_timedelta_median():
149+
# GH#57926
150+
df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1d")]})
151+
median = df.groupby("label")["timedelta"].median()
152+
assert median.loc["foo"] == Timedelta("1d")
153+
154+
148155
@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
149156
def test_len_categorical(dropna, observed, keys):
150157
# GH#57595

0 commit comments

Comments
 (0)