diff --git a/doc/source/whatsnew/v2.0.0.rst b/doc/source/whatsnew/v2.0.0.rst index d1b965e64e43b..bd6e2608f97ae 100644 --- a/doc/source/whatsnew/v2.0.0.rst +++ b/doc/source/whatsnew/v2.0.0.rst @@ -1186,6 +1186,8 @@ Datetimelike - Bug in :func:`DataFrame.from_records` when given a :class:`DataFrame` input with timezone-aware datetime64 columns incorrectly dropping the timezone-awareness (:issue:`51162`) - Bug in :func:`to_datetime` was raising ``decimal.InvalidOperation`` when parsing date strings with ``errors='coerce'`` (:issue:`51084`) - Bug in :func:`to_datetime` with both ``unit`` and ``origin`` specified returning incorrect results (:issue:`42624`) +- Bug in :meth:`GroupBy.quantile` with datetime or timedelta dtypes giving incorrect results for groups containing ``NaT`` (:issue:`51373`) +- Bug in :meth:`GroupBy.quantile` incorrectly raising with :class:`PeriodDtype` or :class:`DatetimeTZDtype` (:issue:`51373`) - Timedelta diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 763494666d870..b25c767db42ff 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -73,7 +73,6 @@ class providing the base-class of operations. from pandas.core.dtypes.cast import ensure_dtype_can_hold_na from pandas.core.dtypes.common import ( is_bool_dtype, - is_datetime64_dtype, is_float_dtype, is_hashable, is_integer, @@ -81,7 +80,7 @@ class providing the base-class of operations. 
is_numeric_dtype, is_object_dtype, is_scalar, - is_timedelta64_dtype, + needs_i8_conversion, ) from pandas.core.dtypes.missing import ( isna, @@ -3192,12 +3191,15 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, DtypeObj | None]: inference = np.dtype(np.int64) elif is_bool_dtype(vals.dtype) and isinstance(vals, ExtensionArray): out = vals.to_numpy(dtype=float, na_value=np.nan) - elif is_datetime64_dtype(vals.dtype): + elif needs_i8_conversion(vals.dtype): inference = vals.dtype - out = np.asarray(vals).astype(float) - elif is_timedelta64_dtype(vals.dtype): - inference = vals.dtype - out = np.asarray(vals).astype(float) + # In this case we need to delay the casting until after the + # np.lexsort below. + # error: Incompatible return value type (got + # "Tuple[Union[ExtensionArray, ndarray[Any, Any]], Union[Any, + # ExtensionDtype]]", expected "Tuple[ndarray[Any, Any], + # Optional[Union[dtype[Any], ExtensionDtype]]]") + return vals, inference # type: ignore[return-value] elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): inference = np.dtype(np.float64) out = vals.to_numpy(dtype=float, na_value=np.nan) @@ -3236,6 +3238,18 @@ def post_processor( is_integer_dtype(inference) and interpolation in {"linear", "midpoint"} ): + if needs_i8_conversion(inference): + # error: Item "ExtensionArray" of "Union[ExtensionArray, + # ndarray[Any, Any]]" has no attribute "_ndarray" + vals = vals.astype("i8").view( + orig_vals._ndarray.dtype # type: ignore[union-attr] + ) + # error: Item "ExtensionArray" of "Union[ExtensionArray, + # ndarray[Any, Any]]" has no attribute "_from_backing_data" + return orig_vals._from_backing_data( # type: ignore[union-attr] + vals + ) + assert isinstance(inference, np.dtype) # for mypy return vals.astype(inference) @@ -3272,6 +3286,8 @@ def blk_func(values: ArrayLike) -> ArrayLike: mask = isna(values) result_mask = None + is_datetimelike = needs_i8_conversion(values.dtype) + vals, inference = pre_processor(values) ncols = 1 @@ 
-3289,6 +3305,11 @@ def blk_func(values: ArrayLike) -> ArrayLike: order = (vals, shaped_labels) sort_arr = np.lexsort(order).astype(np.intp, copy=False) + if is_datetimelike: + # This casting needs to happen after the lexsort in order + # to ensure that NaTs are placed at the end and not the front + vals = vals.view("i8").astype(np.float64) + if vals.ndim == 1: # Ea is always 1d func( diff --git a/pandas/tests/groupby/test_quantile.py b/pandas/tests/groupby/test_quantile.py index 8cba3a8afdfae..79354e550d3f6 100644 --- a/pandas/tests/groupby/test_quantile.py +++ b/pandas/tests/groupby/test_quantile.py @@ -445,3 +445,26 @@ def test_timestamp_groupby_quantile(): ) tm.assert_frame_equal(result, expected) + + +def test_groupby_quantile_dt64tz_period(): + # GH#51373 + dti = pd.date_range("2016-01-01", periods=1000) + ser = pd.Series(dti) + df = ser.to_frame() + df[1] = dti.tz_localize("US/Pacific") + df[2] = dti.to_period("D") + df[3] = dti - dti[0] + df.iloc[-1] = pd.NaT + + by = np.tile(np.arange(5), 200) + gb = df.groupby(by) + + result = gb.quantile(0.5) + + # Check that we match the group-by-group result + exp = {i: df.iloc[i::5].quantile(0.5) for i in range(5)} + expected = DataFrame(exp).T + expected.index = expected.index.astype(np.int_) + + tm.assert_frame_equal(result, expected)