Skip to content

Commit 985e89a

Browse files
authored
BUG: GroupBy.quantile with datetimelike and NaT (#51373)
* BUG: GroupBy.quantile with datetimelike and NaT * GH refs * mypy, 32bit fixups
1 parent ac9a768 commit 985e89a

File tree

3 files changed

+53
-7
lines changed

3 files changed

+53
-7
lines changed

doc/source/whatsnew/v2.0.0.rst

+2
Original file line numberDiff line numberDiff line change
@@ -1193,6 +1193,8 @@ Datetimelike
11931193
- Bug in :func:`DataFrame.from_records` when given a :class:`DataFrame` input with timezone-aware datetime64 columns incorrectly dropping the timezone-awareness (:issue:`51162`)
11941194
- Bug in :func:`to_datetime` was raising ``decimal.InvalidOperation`` when parsing date strings with ``errors='coerce'`` (:issue:`51084`)
11951195
- Bug in :func:`to_datetime` with both ``unit`` and ``origin`` specified returning incorrect results (:issue:`42624`)
1196+
- Bug in :meth:`GroupBy.quantile` with datetime or timedelta dtypes giving incorrect results for groups containing ``NaT`` (:issue:`51373`)
1197+
- Bug in :meth:`GroupBy.quantile` incorrectly raising with :class:`PeriodDtype` or :class:`DatetimeTZDtype` (:issue:`51373`)
11961198
-
11971199

11981200
Timedelta

pandas/core/groupby/groupby.py

+28-7
Original file line numberDiff line numberDiff line change
@@ -73,15 +73,14 @@ class providing the base-class of operations.
7373
from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
7474
from pandas.core.dtypes.common import (
7575
is_bool_dtype,
76-
is_datetime64_dtype,
7776
is_float_dtype,
7877
is_hashable,
7978
is_integer,
8079
is_integer_dtype,
8180
is_numeric_dtype,
8281
is_object_dtype,
8382
is_scalar,
84-
is_timedelta64_dtype,
83+
needs_i8_conversion,
8584
)
8685
from pandas.core.dtypes.missing import (
8786
isna,
@@ -3192,12 +3191,15 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
31923191
inference = np.dtype(np.int64)
31933192
elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
31943193
out = vals.to_numpy(dtype=float, na_value=np.nan)
3195-
elif is_datetime64_dtype(vals.dtype):
3194+
elif needs_i8_conversion(vals.dtype):
31963195
inference = vals.dtype
3197-
out = np.asarray(vals).astype(float)
3198-
elif is_timedelta64_dtype(vals.dtype):
3199-
inference = vals.dtype
3200-
out = np.asarray(vals).astype(float)
3196+
# In this case we need to delay the casting until after the
3197+
# np.lexsort below.
3198+
# error: Incompatible return value type (got
3199+
# "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any,
3200+
# ExtensionDtype]]", expected "Tuple[ndarray[Any, Any],
3201+
# Optional[Union[dtype[Any], ExtensionDtype]]]")
3202+
return vals, inference # type: ignore[return-value]
32013203
elif isinstance(vals, ExtensionArray) and is_float_dtype(vals):
32023204
inference = np.dtype(np.float64)
32033205
out = vals.to_numpy(dtype=float, na_value=np.nan)
@@ -3236,6 +3238,18 @@ def post_processor(
32363238
is_integer_dtype(inference)
32373239
and interpolation in {"linear", "midpoint"}
32383240
):
3241+
if needs_i8_conversion(inference):
3242+
# error: Item "ExtensionArray" of "Union[ExtensionArray,
3243+
# ndarray[Any, Any]]" has no attribute "_ndarray"
3244+
vals = vals.astype("i8").view(
3245+
orig_vals._ndarray.dtype # type: ignore[union-attr]
3246+
)
3247+
# error: Item "ExtensionArray" of "Union[ExtensionArray,
3248+
# ndarray[Any, Any]]" has no attribute "_from_backing_data"
3249+
return orig_vals._from_backing_data( # type: ignore[union-attr]
3250+
vals
3251+
)
3252+
32393253
assert isinstance(inference, np.dtype) # for mypy
32403254
return vals.astype(inference)
32413255

@@ -3272,6 +3286,8 @@ def blk_func(values: ArrayLike) -> ArrayLike:
32723286
mask = isna(values)
32733287
result_mask = None
32743288

3289+
is_datetimelike = needs_i8_conversion(values.dtype)
3290+
32753291
vals, inference = pre_processor(values)
32763292

32773293
ncols = 1
@@ -3289,6 +3305,11 @@ def blk_func(values: ArrayLike) -> ArrayLike:
32893305
order = (vals, shaped_labels)
32903306
sort_arr = np.lexsort(order).astype(np.intp, copy=False)
32913307

3308+
if is_datetimelike:
3309+
# This casting needs to happen after the lexsort in order
3310+
# to ensure that NaTs are placed at the end and not the front
3311+
vals = vals.view("i8").astype(np.float64)
3312+
32923313
if vals.ndim == 1:
32933314
# Ea is always 1d
32943315
func(

pandas/tests/groupby/test_quantile.py

+23
Original file line numberDiff line numberDiff line change
@@ -445,3 +445,26 @@ def test_timestamp_groupby_quantile():
445445
)
446446

447447
tm.assert_frame_equal(result, expected)
448+
449+
450+
def test_groupby_quantile_dt64tz_period():
451+
# GH#51373
452+
dti = pd.date_range("2016-01-01", periods=1000)
453+
ser = pd.Series(dti)
454+
df = ser.to_frame()
455+
df[1] = dti.tz_localize("US/Pacific")
456+
df[2] = dti.to_period("D")
457+
df[3] = dti - dti[0]
458+
df.iloc[-1] = pd.NaT
459+
460+
by = np.tile(np.arange(5), 200)
461+
gb = df.groupby(by)
462+
463+
result = gb.quantile(0.5)
464+
465+
# Check that we match the group-by-group result
466+
exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)}
467+
expected = DataFrame(exp).T
468+
expected.index = expected.index.astype(np.int_)
469+
470+
tm.assert_frame_equal(result, expected)

0 commit comments

Comments
 (0)