Skip to content

Commit 9a39c14

Browse files
JJLLWW authored and pmhatre1 committed
BUG: Groupby median on timedelta column with NaT returns odd value (#57926) (pandas-dev#57957)
1 parent baeff05 commit 9a39c14

File tree

4 files changed: +37 -10 lines changed

pandas/_libs/groupby.pyi

+1
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@ def group_median_float64(
1212
min_count: int = ..., # Py_ssize_t
1313
mask: np.ndarray | None = ...,
1414
result_mask: np.ndarray | None = ...,
15+
is_datetimelike: bool = ..., # bint
1516
) -> None: ...
1617
def group_cumprod(
1718
out: np.ndarray, # float64_t[:, ::1]

pandas/_libs/groupby.pyx

+25 -9
Original file line number | Diff line number | Diff line change
@@ -101,7 +101,11 @@ cdef float64_t median_linear_mask(float64_t* a, int n, uint8_t* mask) noexcept n
101101
return result
102102

103103

104-
cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:
104+
cdef float64_t median_linear(
105+
float64_t* a,
106+
int n,
107+
bint is_datetimelike=False
108+
) noexcept nogil:
105109
cdef:
106110
int i, j, na_count = 0
107111
float64_t* tmp
@@ -111,9 +115,14 @@ cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:
111115
return NaN
112116

113117
# count NAs
114-
for i in range(n):
115-
if a[i] != a[i]:
116-
na_count += 1
118+
if is_datetimelike:
119+
for i in range(n):
120+
if a[i] == NPY_NAT:
121+
na_count += 1
122+
else:
123+
for i in range(n):
124+
if a[i] != a[i]:
125+
na_count += 1
117126

118127
if na_count:
119128
if na_count == n:
@@ -124,10 +133,16 @@ cdef float64_t median_linear(float64_t* a, int n) noexcept nogil:
124133
raise MemoryError()
125134

126135
j = 0
127-
for i in range(n):
128-
if a[i] == a[i]:
129-
tmp[j] = a[i]
130-
j += 1
136+
if is_datetimelike:
137+
for i in range(n):
138+
if a[i] != NPY_NAT:
139+
tmp[j] = a[i]
140+
j += 1
141+
else:
142+
for i in range(n):
143+
if a[i] == a[i]:
144+
tmp[j] = a[i]
145+
j += 1
131146

132147
a = tmp
133148
n -= na_count
@@ -170,6 +185,7 @@ def group_median_float64(
170185
Py_ssize_t min_count=-1,
171186
const uint8_t[:, :] mask=None,
172187
uint8_t[:, ::1] result_mask=None,
188+
bint is_datetimelike=False,
173189
) -> None:
174190
"""
175191
Only aggregates on axis=0
@@ -228,7 +244,7 @@ def group_median_float64(
228244
ptr += _counts[0]
229245
for j in range(ngroups):
230246
size = _counts[j + 1]
231-
out[j, i] = median_linear(ptr, size)
247+
out[j, i] = median_linear(ptr, size, is_datetimelike)
232248
ptr += size
233249

234250

pandas/core/groupby/ops.py

+2 -1
Original file line number | Diff line number | Diff line change
@@ -415,6 +415,7 @@ def _call_cython_op(
415415
"last",
416416
"first",
417417
"sum",
418+
"median",
418419
]:
419420
func(
420421
out=result,
@@ -427,7 +428,7 @@ def _call_cython_op(
427428
is_datetimelike=is_datetimelike,
428429
**kwargs,
429430
)
430-
elif self.how in ["sem", "std", "var", "ohlc", "prod", "median"]:
431+
elif self.how in ["sem", "std", "var", "ohlc", "prod"]:
431432
if self.how in ["std", "sem"]:
432433
kwargs["is_datetimelike"] = is_datetimelike
433434
func(

pandas/tests/groupby/test_groupby.py

+9
Original file line number | Diff line number | Diff line change
@@ -145,6 +145,15 @@ def test_len_nan_group():
145145
assert len(df.groupby(["a", "b"])) == 0
146146

147147

148+
def test_groupby_timedelta_median():
149+
# issue 57926
150+
expected = Series(data=Timedelta("1d"), index=["foo"])
151+
df = DataFrame({"label": ["foo", "foo"], "timedelta": [pd.NaT, Timedelta("1d")]})
152+
gb = df.groupby("label")["timedelta"]
153+
actual = gb.median()
154+
tm.assert_series_equal(actual, expected, check_names=False)
155+
156+
148157
@pytest.mark.parametrize("keys", [["a"], ["a", "b"]])
149158
def test_len_categorical(dropna, observed, keys):
150159
# GH#57595

0 commit comments

Comments
 (0)