Skip to content

BUG: GroupBy.quantile with datetimelike and NaT #51373

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Feb 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1186,6 +1186,8 @@ Datetimelike
- Bug in :func:`DataFrame.from_records` when given a :class:`DataFrame` input with timezone-aware datetime64 columns incorrectly dropping the timezone-awareness (:issue:`51162`)
- Bug in :func:`to_datetime` was raising ``decimal.InvalidOperation`` when parsing date strings with ``errors='coerce'`` (:issue:`51084`)
- Bug in :func:`to_datetime` with both ``unit`` and ``origin`` specified returning incorrect results (:issue:`42624`)
- Bug in :meth:`GroupBy.quantile` with datetime or timedelta dtypes giving incorrect results for groups containing ``NaT`` (:issue:`51373`)
- Bug in :meth:`GroupBy.quantile` incorrectly raising with :class:`PeriodDtype` or :class:`DatetimeTZDtype` (:issue:`51373`)
-

Timedelta
Expand Down
35 changes: 28 additions & 7 deletions pandas/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,15 +73,14 @@ class providing the base-class of operations.
from pandas.core.dtypes.cast import ensure_dtype_can_hold_na
from pandas.core.dtypes.common import (
is_bool_dtype,
is_datetime64_dtype,
is_float_dtype,
is_hashable,
is_integer,
is_integer_dtype,
is_numeric_dtype,
is_object_dtype,
is_scalar,
is_timedelta64_dtype,
needs_i8_conversion,
)
from pandas.core.dtypes.missing import (
isna,
Expand Down Expand Up @@ -3192,12 +3191,15 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]:
inference = np.dtype(np.int64)
elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray):
out = vals.to_numpy(dtype=float, na_value=np.nan)
elif is_datetime64_dtype(vals.dtype):
elif needs_i8_conversion(vals.dtype):
inference = vals.dtype
out = np.asarray(vals).astype(float)
elif is_timedelta64_dtype(vals.dtype):
inference = vals.dtype
out = np.asarray(vals).astype(float)
# In this case we need to delay the casting until after the
# np.lexsort below.
# error: Incompatible return value type (got
# "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any,
# ExtensionDtype]]", expected "Tuple[ndarray[Any, Any],
# Optional[Union[dtype[Any], ExtensionDtype]]]")
return vals, inference # type: ignore[return-value]
elif isinstance(vals, ExtensionArray) and is_float_dtype(vals):
inference = np.dtype(np.float64)
out = vals.to_numpy(dtype=float, na_value=np.nan)
Expand Down Expand Up @@ -3236,6 +3238,18 @@ def post_processor(
is_integer_dtype(inference)
and interpolation in {"linear", "midpoint"}
):
if needs_i8_conversion(inference):
# error: Item "ExtensionArray" of "Union[ExtensionArray,
# ndarray[Any, Any]]" has no attribute "_ndarray"
vals = vals.astype("i8").view(
orig_vals._ndarray.dtype # type: ignore[union-attr]
)
# error: Item "ExtensionArray" of "Union[ExtensionArray,
# ndarray[Any, Any]]" has no attribute "_from_backing_data"
return orig_vals._from_backing_data( # type: ignore[union-attr]
vals
)

assert isinstance(inference, np.dtype) # for mypy
return vals.astype(inference)

Expand Down Expand Up @@ -3272,6 +3286,8 @@ def blk_func(values: ArrayLike) -> ArrayLike:
mask = isna(values)
result_mask = None

is_datetimelike = needs_i8_conversion(values.dtype)

vals, inference = pre_processor(values)

ncols = 1
Expand All @@ -3289,6 +3305,11 @@ def blk_func(values: ArrayLike) -> ArrayLike:
order = (vals, shaped_labels)
sort_arr = np.lexsort(order).astype(np.intp, copy=False)

if is_datetimelike:
# This casting needs to happen after the lexsort in order
# to ensure that NaTs are placed at the end and not the front
vals = vals.view("i8").astype(np.float64)

if vals.ndim == 1:
# Ea is always 1d
func(
Expand Down
23 changes: 23 additions & 0 deletions pandas/tests/groupby/test_quantile.py
Original file line number Diff line number Diff line change
Expand Up @@ -445,3 +445,26 @@ def test_timestamp_groupby_quantile():
)

tm.assert_frame_equal(result, expected)


def test_groupby_quantile_dt64tz_period():
    """Grouped 0.5-quantile of datetimelike columns matches per-group results.

    Builds a frame with a naive datetime64 column, a tz-aware datetime64
    column, a Period column and a timedelta64 column, sets the final row to
    NaT, and checks that ``groupby(...).quantile(0.5)`` agrees with calling
    ``quantile(0.5)`` on each group's rows separately (GH#51373).
    """
    stamps = pd.date_range("2016-01-01", periods=1000)
    frame = pd.Series(stamps).to_frame()
    frame[1] = stamps.tz_localize("US/Pacific")
    frame[2] = stamps.to_period("D")
    frame[3] = stamps - stamps[0]
    # Overwrite the last row with NaT so that one group (label 4, since
    # 999 % 5 == 4) contains missing values in every column.
    frame.iloc[-1] = pd.NaT

    # Labels cycle 0..4, so group k consists of rows k, k + 5, k + 10, ...
    labels = np.tile(np.arange(5), 200)
    result = frame.groupby(labels).quantile(0.5)

    # Reference result: evaluate each group on its own slice of rows, collect
    # the per-group Series as columns, then transpose so groups become rows.
    per_group = {}
    for key in range(5):
        per_group[key] = frame.iloc[key::5].quantile(0.5)
    expected = DataFrame(per_group).T
    expected.index = expected.index.astype(np.int_)

    tm.assert_frame_equal(result, expected)