diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 24db70481c136..2add16ae8ec8f 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -766,6 +766,7 @@ Groupby/resample/rolling - Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`) - Bug in :meth:`.DataFrameGroupBy.head`, :meth:`.DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`) - Bug in :meth:`.DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`) +- Bug in :meth:`DataFrameGroupBy.rank` with ``datetime64tz`` or period dtype incorrectly casting results to those dtypes instead of returning ``float64`` dtype (:issue:`38187`) Reshaping ^^^^^^^^^ diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index a9672c25ea4c2..184fa3a2b4204 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -995,7 +995,7 @@ def _cython_transform( try: result, _ = self.grouper._cython_operation( - "transform", obj.values, how, axis, **kwargs + "transform", obj._values, how, axis, **kwargs ) except NotImplementedError: continue diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index db0f0dbe17386..ded5f610b850e 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -491,8 +491,11 @@ def _ea_wrap_cython_operation( res_values, names = self._cython_operation( kind, values, how, axis, min_count, **kwargs ) + if how in ["rank"]: + # preserve float64 dtype + return res_values, names + res_values = res_values.astype("i8", copy=False) - # FIXME: this is wrong for rank, but not tested. result = type(orig_values)._simple_new(res_values, dtype=orig_values.dtype) return result, names diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py index 3461bf6e10662..ef6b4ae4836f8 100644 --- a/pandas/tests/groupby/test_rank.py +++ b/pandas/tests/groupby/test_rank.py @@ -42,7 +42,10 @@ def test_rank_apply(): @pytest.mark.parametrize( "vals", [ - [2, 2, 8, 2, 6], + np.array([2, 2, 8, 2, 6], dtype=dtype) + for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"] + ] + + [ [ pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-02"), @@ -50,7 +53,29 @@ def test_rank_apply(): pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-06"), ], + [ + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-08", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-06", tz="US/Pacific"), + ], + [ + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-08") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-06") - pd.Timestamp(0), + ], + [ + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-08").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-06").to_period("D"), + ], ], + ids=lambda x: type(x[0]), ) @pytest.mark.parametrize( "ties_method,ascending,pct,exp", @@ -79,7 +104,12 @@ def test_rank_apply(): ) def test_rank_args(grps, vals, ties_method, ascending, pct, exp): key = np.repeat(grps, len(vals)) - vals = vals * len(grps) + + orig_vals = vals + vals = list(vals) * len(grps) + if isinstance(orig_vals, np.ndarray): + vals = np.array(vals, dtype=orig_vals.dtype) + df = DataFrame({"key": key, "val": vals}) result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct) @@ -142,7 +172,10 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): @pytest.mark.parametrize( "vals", [ - [2, 2, np.nan, 8, 2, 6, np.nan, np.nan], + np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype) + for dtype in ["f8", "f4", "f2"] + ] + + [ [ pd.Timestamp("2018-01-02"), pd.Timestamp("2018-01-02"), @@ -153,7 +186,38 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): np.nan, np.nan, ], + [ + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + np.nan, + pd.Timestamp("2018-01-08", tz="US/Pacific"), + pd.Timestamp("2018-01-02", tz="US/Pacific"), + pd.Timestamp("2018-01-06", tz="US/Pacific"), + np.nan, + np.nan, + ], + [ + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + np.nan, + pd.Timestamp("2018-01-08") - pd.Timestamp(0), + pd.Timestamp("2018-01-02") - pd.Timestamp(0), + pd.Timestamp("2018-01-06") - pd.Timestamp(0), + np.nan, + np.nan, + ], + [ + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + np.nan, + pd.Timestamp("2018-01-08").to_period("D"), + pd.Timestamp("2018-01-02").to_period("D"), + pd.Timestamp("2018-01-06").to_period("D"), + np.nan, + np.nan, + ], ], + ids=lambda x: type(x[0]), ) @pytest.mark.parametrize( "ties_method,ascending,na_option,pct,exp", @@ -346,7 +410,12 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp): ) def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp): key = np.repeat(grps, len(vals)) - vals = vals * len(grps) + + orig_vals = vals + vals = list(vals) * len(grps) + if isinstance(orig_vals, np.ndarray): + vals = np.array(vals, dtype=orig_vals.dtype) + df = DataFrame({"key": key, "val": vals}) result = df.groupby("key").rank( method=ties_method, ascending=ascending, na_option=na_option, pct=pct