From 0137eaaba5e518b4a8bb0ac61b2ffafd44e69db3 Mon Sep 17 00:00:00 2001
From: Brock
Date: Sun, 29 Nov 2020 13:59:41 -0800
Subject: [PATCH 1/4] REF: implement disallow_invalid_ops

---
 pandas/core/groupby/ops.py | 50 +++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 19 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 2542b2d7071cb..d96f3150dd6df 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -24,7 +24,7 @@
 from pandas._libs import NaT, iNaT, lib
 import pandas._libs.groupby as libgroupby
 import pandas._libs.reduction as libreduction
-from pandas._typing import F, FrameOrSeries, Label, Shape
+from pandas._typing import ArrayLike, F, FrameOrSeries, Label, Shape
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly
@@ -445,6 +445,35 @@ def _get_cython_func_and_vals(
             raise
         return func, values
 
+    def _disallow_invalid_ops(self, values: ArrayLike, how: str):
+        """
+        Check if we can do this operation with our cython functions.
+
+        Raises
+        ------
+        NotImplementedError
+            This is either not a valid function for this dtype, or
+            valid but not implemented in cython.
+        """
+        dtype = values.dtype
+
+        if is_categorical_dtype(dtype) or is_sparse(dtype):
+            # categoricals are only 1d, so we
+            # are not setup for dim transforming
+            raise NotImplementedError(f"{dtype} dtype not supported")
+        elif is_datetime64_any_dtype(dtype):
+            # we raise NotImplemented if this is an invalid operation
+            # entirely, e.g. adding datetimes
+            if how in ["add", "prod", "cumsum", "cumprod"]:
+                raise NotImplementedError(
+                    f"datetime64 type does not support {how} operations"
+                )
+        elif is_timedelta64_dtype(dtype):
+            if how in ["prod", "cumprod"]:
+                raise NotImplementedError(
+                    f"timedelta64 type does not support {how} operations"
+                )
+
     def _cython_operation(
         self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
     ) -> Tuple[np.ndarray, Optional[List[str]]]:
@@ -466,24 +495,7 @@ def _cython_operation(
 
         # can we do this operation with our cython functions
        # if not raise NotImplementedError
-
-        # we raise NotImplemented if this is an invalid operation
-        # entirely, e.g. adding datetimes
-
-        # categoricals are only 1d, so we
-        # are not setup for dim transforming
-        if is_categorical_dtype(values.dtype) or is_sparse(values.dtype):
-            raise NotImplementedError(f"{values.dtype} dtype not supported")
-        elif is_datetime64_any_dtype(values.dtype):
-            if how in ["add", "prod", "cumsum", "cumprod"]:
-                raise NotImplementedError(
-                    f"datetime64 type does not support {how} operations"
-                )
-        elif is_timedelta64_dtype(values.dtype):
-            if how in ["prod", "cumprod"]:
-                raise NotImplementedError(
-                    f"timedelta64 type does not support {how} operations"
-                )
+        self._disallow_invalid_ops(values, how)
 
         if is_datetime64tz_dtype(values.dtype):
             # Cast to naive; we'll cast back at the end of the function
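Note on the refactor above: _disallow_invalid_ops centralizes the per-dtype validity rules that were previously inlined in _cython_operation. The rule itself is small enough to sketch standalone (illustrative only, not part of the patch; check_valid_op is a made-up name, and the real method additionally rejects categorical and sparse dtypes outright):

    import numpy as np

    def check_valid_op(dtype: np.dtype, how: str) -> None:
        # Sums and products of datetimes are not meaningful quantities.
        if dtype.kind == "M" and how in ["add", "prod", "cumsum", "cumprod"]:
            raise NotImplementedError(f"datetime64 type does not support {how} operations")
        # Timedeltas sum fine, but products of timedeltas are not meaningful.
        if dtype.kind == "m" and how in ["prod", "cumprod"]:
            raise NotImplementedError(f"timedelta64 type does not support {how} operations")

    check_valid_op(np.dtype("m8[ns]"), "add")  # ok: summing timedeltas is valid

    try:
        check_valid_op(np.dtype("M8[ns]"), "cumsum")
    except NotImplementedError as err:
        print(err)  # datetime64 type does not support cumsum operations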
From 6e2b30ec600fee1bba1078036a532e92def89545 Mon Sep 17 00:00:00 2001
From: Brock
Date: Sun, 29 Nov 2020 15:06:02 -0800
Subject: [PATCH 2/4] REF: implement _ea_wrap_cython_operation

---
 pandas/core/groupby/ops.py | 56 ++++++++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 17 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index d96f3150dd6df..db0f0dbe17386 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -474,6 +474,39 @@ def _disallow_invalid_ops(self, values: ArrayLike, how: str):
                     f"timedelta64 type does not support {how} operations"
                 )
 
+    def _ea_wrap_cython_operation(
+        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
+    ) -> Tuple[np.ndarray, Optional[List[str]]]:
+        """
+        If we have an ExtensionArray, unwrap, call _cython_operation, and
+        re-wrap if appropriate.
+        """
+        # TODO: general case implementation overrideable by EAs.
+        orig_values = values
+
+        if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
+            # All of the functions implemented here are ordinal, so we can
+            # operate on the tz-naive equivalents
+            values = values.view("M8[ns]")
+            res_values, names = self._cython_operation(
+                kind, values, how, axis, min_count, **kwargs
+            )
+            res_values = res_values.astype("i8", copy=False)
+            # FIXME: this is wrong for rank, but not tested.
+            result = type(orig_values)._simple_new(res_values, dtype=orig_values.dtype)
+            return result, names
+
+        elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
+            # IntegerArray or BooleanArray
+            values = ensure_int_or_float(values)
+            res_values, names = self._cython_operation(
+                kind, values, how, axis, min_count, **kwargs
+            )
+            result = maybe_cast_result(result=res_values, obj=orig_values, how=how)
+            return result, names
+
+        raise NotImplementedError(values.dtype)
+
     def _cython_operation(
         self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
     ) -> Tuple[np.ndarray, Optional[List[str]]]:
@@ -483,8 +516,8 @@ def _cython_operation(
         Names is only useful when dealing with 2D results, like ohlc
         (see self._name_functions).
         """
-        assert kind in ["transform", "aggregate"]
         orig_values = values
+        assert kind in ["transform", "aggregate"]
 
         if values.ndim > 2:
             raise NotImplementedError("number of dimensions is currently limited to 2")
@@ -497,11 +530,10 @@ def _cython_operation(
        # if not raise NotImplementedError
         self._disallow_invalid_ops(values, how)
 
-        if is_datetime64tz_dtype(values.dtype):
-            # Cast to naive; we'll cast back at the end of the function
-            # TODO: possible need to reshape?
-            # TODO(EA2D): kludge can be avoided when 2D EA is allowed.
-            values = values.view("M8[ns]")
+        if is_extension_array_dtype(values.dtype):
+            return self._ea_wrap_cython_operation(
+                kind, values, how, axis, min_count, **kwargs
+            )
 
         is_datetimelike = needs_i8_conversion(values.dtype)
         is_numeric = is_numeric_dtype(values.dtype)
@@ -585,19 +617,9 @@ def _cython_operation(
         if swapped:
             result = result.swapaxes(0, axis)
 
-        if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype(
-            orig_values.dtype
-        ):
-            # We need to use the constructors directly for these dtypes
-            # since numpy won't recognize them
-            # https://github.com/pandas-dev/pandas/issues/31471
-            result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
-        elif is_datetimelike and kind == "aggregate":
+        if is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)
 
-        if is_extension_array_dtype(orig_values.dtype):
-            result = maybe_cast_result(result=result, obj=orig_values, how=how)
-
         return result, names
 
     def _aggregate(
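The new _ea_wrap_cython_operation has two concrete branches: datetime64tz and period values are viewed as ordinal M8[ns]/i8 data for the kernel and re-wrapped via type(orig_values)._simple_new, while nullable integer and boolean arrays are unwrapped with ensure_int_or_float and re-wrapped by maybe_cast_result. At the user level, the second branch is what keeps nullable dtypes intact through a groupby; a rough illustration (a sketch, not a test from the patch; expected output noted in comments, assuming default arguments):

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "b"],
            "x": pd.array([1, 2, None], dtype="Int64"),  # nullable IntegerArray
        }
    )

    # The EA is unwrapped for the cython "add" kernel, then maybe_cast_result
    # re-wraps the result in the nullable dtype.
    res = df.groupby("key")["x"].sum()
    print(res.dtype)  # Int64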
From 99cb3420fe46193a5cef66fd6cdbf6c0ddb2d679 Mon Sep 17 00:00:00 2001
From: Brock
Date: Mon, 30 Nov 2020 07:58:43 -0800
Subject: [PATCH 3/4] BUG: groupby.rank with EA dtypes

---
 pandas/core/groupby/groupby.py    |  2 +-
 pandas/core/groupby/ops.py        |  5 +-
 pandas/tests/groupby/test_rank.py | 77 +++++++++++++++++++++++++++++--
 3 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index a9672c25ea4c2..184fa3a2b4204 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -995,7 +995,7 @@ def _cython_transform(
 
         try:
             result, _ = self.grouper._cython_operation(
-                "transform", obj.values, how, axis, **kwargs
+                "transform", obj._values, how, axis, **kwargs
             )
         except NotImplementedError:
             continue
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index db0f0dbe17386..ded5f610b850e 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -491,8 +491,11 @@ def _ea_wrap_cython_operation(
             res_values, names = self._cython_operation(
                 kind, values, how, axis, min_count, **kwargs
             )
+            if how in ["rank"]:
+                # preserve float64 dtype
+                return res_values, names
+
             res_values = res_values.astype("i8", copy=False)
-            # FIXME: this is wrong for rank, but not tested.
             result = type(orig_values)._simple_new(res_values, dtype=orig_values.dtype)
             return result, names
 
diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py
index 3461bf6e10662..ef6b4ae4836f8 100644
--- a/pandas/tests/groupby/test_rank.py
+++ b/pandas/tests/groupby/test_rank.py
@@ -42,7 +42,10 @@ def test_rank_apply():
 @pytest.mark.parametrize(
     "vals",
     [
-        [2, 2, 8, 2, 6],
+        np.array([2, 2, 8, 2, 6], dtype=dtype)
+        for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
+    ]
+    + [
         [
             pd.Timestamp("2018-01-02"),
             pd.Timestamp("2018-01-02"),
             pd.Timestamp("2018-01-08"),
             pd.Timestamp("2018-01-02"),
             pd.Timestamp("2018-01-06"),
         ],
+        [
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-08", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-06", tz="US/Pacific"),
+        ],
+        [
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
+        ],
+        [
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-08").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-06").to_period("D"),
+        ],
     ],
+    ids=lambda x: type(x[0]),
 )
 @pytest.mark.parametrize(
     "ties_method,ascending,pct,exp",
 )
 def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
     key = np.repeat(grps, len(vals))
-    vals = vals * len(grps)
+
+    orig_vals = vals
+    vals = list(vals) * len(grps)
+    if isinstance(orig_vals, np.ndarray):
+        vals = np.array(vals, dtype=orig_vals.dtype)
+
     df = DataFrame({"key": key, "val": vals})
 
     result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)
@@ -142,7 +172,10 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
 @pytest.mark.parametrize(
     "vals",
     [
-        [2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
+        np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
+        for dtype in ["f8", "f4", "f2"]
+    ]
+    + [
         [
             pd.Timestamp("2018-01-02"),
             pd.Timestamp("2018-01-02"),
             np.nan,
             pd.Timestamp("2018-01-08"),
             pd.Timestamp("2018-01-02"),
             pd.Timestamp("2018-01-06"),
             np.nan,
             np.nan,
         ],
+        [
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            np.nan,
+            pd.Timestamp("2018-01-08", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-06", tz="US/Pacific"),
+            np.nan,
+            np.nan,
+        ],
+        [
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            np.nan,
+            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
+            np.nan,
+            np.nan,
+        ],
+        [
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            np.nan,
+            pd.Timestamp("2018-01-08").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-06").to_period("D"),
+            np.nan,
+            np.nan,
+        ],
     ],
+    ids=lambda x: type(x[0]),
 )
 @pytest.mark.parametrize(
     "ties_method,ascending,na_option,pct,exp",
 )
 def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
     key = np.repeat(grps, len(vals))
-    vals = vals * len(grps)
+
+    orig_vals = vals
+    vals = list(vals) * len(grps)
+    if isinstance(orig_vals, np.ndarray):
+        vals = np.array(vals, dtype=orig_vals.dtype)
+
     df = DataFrame({"key": key, "val": vals})
     result = df.groupby("key").rank(
         method=ties_method, ascending=ascending, na_option=na_option, pct=pct
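The `how in ["rank"]` early return above is the heart of the fix: rank output is already a float64 array of rank values, so it must not be round-tripped through _simple_new back into the original datetimelike dtype. A sketch of the user-visible change (expected post-fix behavior with the default method="average", matching what the new tests assert):

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "a"],
            "ts": pd.to_datetime(
                ["2018-01-02", "2018-01-08", "2018-01-06"]
            ).tz_localize("US/Pacific"),
        }
    )

    result = df.groupby("key").rank()
    # Previously the float ranks were reinterpreted as nanosecond offsets in
    # datetime64[ns, US/Pacific]; now they stay plain float64.
    print(result["ts"].tolist())  # [1.0, 3.0, 2.0]
    print(result["ts"].dtype)     # float64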
From 86c9ce8c690d9832407e9afec2a808303bdfa6ab Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 1 Dec 2020 18:17:07 -0800
Subject: [PATCH 4/4] whatsnew

---
 doc/source/whatsnew/v1.2.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 24db70481c136..2add16ae8ec8f 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -766,6 +766,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`)
 - Bug in :meth:`.DataFrameGroupBy.head`, :meth:`.DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`)
 - Bug in :meth:`.DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`)
+- Bug in :meth:`.DataFrameGroupBy.rank` with ``datetime64tz`` or period dtype incorrectly casting results to those dtypes instead of returning ``float64`` dtype (:issue:`38187`)
 
 Reshaping
 ^^^^^^^^^
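For reference, the period dtype named in the whatsnew entry behaves the same way after this series (a sketch of expected behavior, not output captured from the patch):

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "a"],
            "per": pd.PeriodIndex(["2018-01-02", "2018-01-08", "2018-01-06"], freq="D"),
        }
    )

    # Ranks come back as float64 rather than being cast back to period[D].
    print(df.groupby("key").rank()["per"].tolist())  # [1.0, 3.0, 2.0]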