From 0137eaaba5e518b4a8bb0ac61b2ffafd44e69db3 Mon Sep 17 00:00:00 2001
From: Brock
Date: Sun, 29 Nov 2020 13:59:41 -0800
Subject: [PATCH 1/4] REF: implement disallow_invalid_ops

---
 pandas/core/groupby/ops.py | 50 +++++++++++++++++++++++---------------
 1 file changed, 31 insertions(+), 19 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index 2542b2d7071cb..d96f3150dd6df 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -24,7 +24,7 @@
 from pandas._libs import NaT, iNaT, lib
 import pandas._libs.groupby as libgroupby
 import pandas._libs.reduction as libreduction
-from pandas._typing import F, FrameOrSeries, Label, Shape
+from pandas._typing import ArrayLike, F, FrameOrSeries, Label, Shape
 from pandas.errors import AbstractMethodError
 from pandas.util._decorators import cache_readonly
@@ -445,6 +445,35 @@ def _get_cython_func_and_vals(
             raise
         return func, values
 
+    def _disallow_invalid_ops(self, values: ArrayLike, how: str):
+        """
+        Check if we can do this operation with our cython functions.
+
+        Raises
+        ------
+        NotImplementedError
+            This is either not a valid function for this dtype, or
+            valid but not implemented in cython.
+        """
+        dtype = values.dtype
+
+        if is_categorical_dtype(dtype) or is_sparse(dtype):
+            # categoricals are only 1d, so we
+            # are not setup for dim transforming
+            raise NotImplementedError(f"{dtype} dtype not supported")
+        elif is_datetime64_any_dtype(dtype):
+            # we raise NotImplemented if this is an invalid operation
+            # entirely, e.g. adding datetimes
+            if how in ["add", "prod", "cumsum", "cumprod"]:
+                raise NotImplementedError(
+                    f"datetime64 type does not support {how} operations"
+                )
+        elif is_timedelta64_dtype(dtype):
+            if how in ["prod", "cumprod"]:
+                raise NotImplementedError(
+                    f"timedelta64 type does not support {how} operations"
+                )
+
     def _cython_operation(
         self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
     ) -> Tuple[np.ndarray, Optional[List[str]]]:
@@ -466,24 +495,7 @@ def _cython_operation(
 
         # can we do this operation with our cython functions
        # if not raise NotImplementedError
-
-        # we raise NotImplemented if this is an invalid operation
-        # entirely, e.g. adding datetimes
-
-        # categoricals are only 1d, so we
-        # are not setup for dim transforming
-        if is_categorical_dtype(values.dtype) or is_sparse(values.dtype):
-            raise NotImplementedError(f"{values.dtype} dtype not supported")
-        elif is_datetime64_any_dtype(values.dtype):
-            if how in ["add", "prod", "cumsum", "cumprod"]:
-                raise NotImplementedError(
-                    f"datetime64 type does not support {how} operations"
-                )
-        elif is_timedelta64_dtype(values.dtype):
-            if how in ["prod", "cumprod"]:
-                raise NotImplementedError(
-                    f"timedelta64 type does not support {how} operations"
-                )
+        self._disallow_invalid_ops(values, how)
 
         if is_datetime64tz_dtype(values.dtype):
             # Cast to naive; we'll cast back at the end of the function
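Note on the refactor above: _disallow_invalid_ops centralizes the per-dtype validity rules that were previously inlined in _cython_operation. The rule itself is small enough to sketch standalone (illustrative only, not part of the patch; check_valid_op is a made-up name, and the real method additionally rejects categorical and sparse dtypes outright):

    import numpy as np

    def check_valid_op(dtype: np.dtype, how: str) -> None:
        # Sums and products of datetimes are not meaningful quantities.
        if dtype.kind == "M" and how in ["add", "prod", "cumsum", "cumprod"]:
            raise NotImplementedError(f"datetime64 type does not support {how} operations")
        # Timedeltas sum fine, but products of timedeltas are not meaningful.
        if dtype.kind == "m" and how in ["prod", "cumprod"]:
            raise NotImplementedError(f"timedelta64 type does not support {how} operations")

    check_valid_op(np.dtype("m8[ns]"), "add")  # ok: summing timedeltas is valid

    try:
        check_valid_op(np.dtype("M8[ns]"), "cumsum")
    except NotImplementedError as err:
        print(err)  # datetime64 type does not support cumsum operations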
From 6e2b30ec600fee1bba1078036a532e92def89545 Mon Sep 17 00:00:00 2001
From: Brock
Date: Sun, 29 Nov 2020 15:06:02 -0800
Subject: [PATCH 2/4] REF: implement _ea_wrap_cython_operation

---
 pandas/core/groupby/ops.py | 56 ++++++++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 17 deletions(-)

diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index d96f3150dd6df..db0f0dbe17386 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -474,6 +474,39 @@ def _disallow_invalid_ops(self, values: ArrayLike, how: str):
                     f"timedelta64 type does not support {how} operations"
                 )
 
+    def _ea_wrap_cython_operation(
+        self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
+    ) -> Tuple[np.ndarray, Optional[List[str]]]:
+        """
+        If we have an ExtensionArray, unwrap, call _cython_operation, and
+        re-wrap if appropriate.
+        """
+        # TODO: general case implementation overrideable by EAs.
+        orig_values = values
+
+        if is_datetime64tz_dtype(values.dtype) or is_period_dtype(values.dtype):
+            # All of the functions implemented here are ordinal, so we can
+            # operate on the tz-naive equivalents
+            values = values.view("M8[ns]")
+            res_values, names = self._cython_operation(
+                kind, values, how, axis, min_count, **kwargs
+            )
+            res_values = res_values.astype("i8", copy=False)
+            # FIXME: this is wrong for rank, but not tested.
+            result = type(orig_values)._simple_new(res_values, dtype=orig_values.dtype)
+            return result, names
+
+        elif is_integer_dtype(values.dtype) or is_bool_dtype(values.dtype):
+            # IntegerArray or BooleanArray
+            values = ensure_int_or_float(values)
+            res_values, names = self._cython_operation(
+                kind, values, how, axis, min_count, **kwargs
+            )
+            result = maybe_cast_result(result=res_values, obj=orig_values, how=how)
+            return result, names
+
+        raise NotImplementedError(values.dtype)
+
     def _cython_operation(
         self, kind: str, values, how: str, axis: int, min_count: int = -1, **kwargs
     ) -> Tuple[np.ndarray, Optional[List[str]]]:
@@ -483,8 +516,8 @@ def _cython_operation(
         Names is only useful when dealing with 2D results, like ohlc
         (see self._name_functions).
         """
-        assert kind in ["transform", "aggregate"]
         orig_values = values
+        assert kind in ["transform", "aggregate"]
 
         if values.ndim > 2:
             raise NotImplementedError("number of dimensions is currently limited to 2")
@@ -497,11 +530,10 @@ def _cython_operation(
        # if not raise NotImplementedError
         self._disallow_invalid_ops(values, how)
 
-        if is_datetime64tz_dtype(values.dtype):
-            # Cast to naive; we'll cast back at the end of the function
-            # TODO: possible need to reshape?
-            # TODO(EA2D): kludge can be avoided when 2D EA is allowed.
-            values = values.view("M8[ns]")
+        if is_extension_array_dtype(values.dtype):
+            return self._ea_wrap_cython_operation(
+                kind, values, how, axis, min_count, **kwargs
+            )
 
         is_datetimelike = needs_i8_conversion(values.dtype)
         is_numeric = is_numeric_dtype(values.dtype)
@@ -585,19 +617,9 @@ def _cython_operation(
         if swapped:
             result = result.swapaxes(0, axis)
 
-        if is_datetime64tz_dtype(orig_values.dtype) or is_period_dtype(
-            orig_values.dtype
-        ):
-            # We need to use the constructors directly for these dtypes
-            # since numpy won't recognize them
-            # https://github.com/pandas-dev/pandas/issues/31471
-            result = type(orig_values)(result.astype(np.int64), dtype=orig_values.dtype)
-        elif is_datetimelike and kind == "aggregate":
+        if is_datetimelike and kind == "aggregate":
            result = result.astype(orig_values.dtype)
 
-        if is_extension_array_dtype(orig_values.dtype):
-            result = maybe_cast_result(result=result, obj=orig_values, how=how)
-
         return result, names
 
     def _aggregate(
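The new _ea_wrap_cython_operation has two concrete branches: datetime64tz and period values are viewed as ordinal M8[ns]/i8 data for the kernel and re-wrapped via type(orig_values)._simple_new, while nullable integer and boolean arrays are unwrapped with ensure_int_or_float and re-wrapped by maybe_cast_result. At the user level, the second branch is what keeps nullable dtypes intact through a groupby; a rough illustration (a sketch, not a test from the patch; expected output noted in comments, assuming default arguments):

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "b"],
            "x": pd.array([1, 2, None], dtype="Int64"),  # nullable IntegerArray
        }
    )

    # The EA is unwrapped for the cython "add" kernel, then maybe_cast_result
    # re-wraps the result in the nullable dtype.
    res = df.groupby("key")["x"].sum()
    print(res.dtype)  # Int64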
From 99cb3420fe46193a5cef66fd6cdbf6c0ddb2d679 Mon Sep 17 00:00:00 2001
From: Brock
Date: Mon, 30 Nov 2020 07:58:43 -0800
Subject: [PATCH 3/4] BUG: groupby.rank with EA dtypes

---
 pandas/core/groupby/groupby.py    |  2 +-
 pandas/core/groupby/ops.py        |  5 +-
 pandas/tests/groupby/test_rank.py | 77 +++++++++++++++++++++++++++++--
 3 files changed, 78 insertions(+), 6 deletions(-)

diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py
index a9672c25ea4c2..184fa3a2b4204 100644
--- a/pandas/core/groupby/groupby.py
+++ b/pandas/core/groupby/groupby.py
@@ -995,7 +995,7 @@ def _cython_transform(
 
         try:
             result, _ = self.grouper._cython_operation(
-                "transform", obj.values, how, axis, **kwargs
+                "transform", obj._values, how, axis, **kwargs
             )
         except NotImplementedError:
             continue
diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py
index db0f0dbe17386..ded5f610b850e 100644
--- a/pandas/core/groupby/ops.py
+++ b/pandas/core/groupby/ops.py
@@ -491,8 +491,11 @@ def _ea_wrap_cython_operation(
             res_values, names = self._cython_operation(
                 kind, values, how, axis, min_count, **kwargs
             )
+            if how in ["rank"]:
+                # preserve float64 dtype
+                return res_values, names
+
             res_values = res_values.astype("i8", copy=False)
-            # FIXME: this is wrong for rank, but not tested.
             result = type(orig_values)._simple_new(res_values, dtype=orig_values.dtype)
             return result, names
 
diff --git a/pandas/tests/groupby/test_rank.py b/pandas/tests/groupby/test_rank.py
index 3461bf6e10662..ef6b4ae4836f8 100644
--- a/pandas/tests/groupby/test_rank.py
+++ b/pandas/tests/groupby/test_rank.py
@@ -42,7 +42,10 @@ def test_rank_apply():
 @pytest.mark.parametrize(
     "vals",
     [
-        [2, 2, 8, 2, 6],
+        np.array([2, 2, 8, 2, 6], dtype=dtype)
+        for dtype in ["i8", "i4", "i2", "i1", "u8", "u4", "u2", "u1", "f8", "f4", "f2"]
+    ]
+    + [
         [
             pd.Timestamp("2018-01-02"),
             pd.Timestamp("2018-01-02"),
             pd.Timestamp("2018-01-08"),
             pd.Timestamp("2018-01-02"),
             pd.Timestamp("2018-01-06"),
         ],
+        [
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-08", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-06", tz="US/Pacific"),
+        ],
+        [
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
+        ],
+        [
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-08").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-06").to_period("D"),
+        ],
     ],
+    ids=lambda x: type(x[0]),
 )
 @pytest.mark.parametrize(
     "ties_method,ascending,pct,exp",
 )
 def test_rank_args(grps, vals, ties_method, ascending, pct, exp):
     key = np.repeat(grps, len(vals))
-    vals = vals * len(grps)
+
+    orig_vals = vals
+    vals = list(vals) * len(grps)
+    if isinstance(orig_vals, np.ndarray):
+        vals = np.array(vals, dtype=orig_vals.dtype)
+
     df = DataFrame({"key": key, "val": vals})
 
     result = df.groupby("key").rank(method=ties_method, ascending=ascending, pct=pct)
@@ -142,7 +172,10 @@ def test_infs_n_nans(grps, vals, ties_method, ascending, na_option, exp):
 @pytest.mark.parametrize(
     "vals",
     [
-        [2, 2, np.nan, 8, 2, 6, np.nan, np.nan],
+        np.array([2, 2, np.nan, 8, 2, 6, np.nan, np.nan], dtype=dtype)
+        for dtype in ["f8", "f4", "f2"]
+    ]
+    + [
         [
             pd.Timestamp("2018-01-02"),
             pd.Timestamp("2018-01-02"),
             np.nan,
             pd.Timestamp("2018-01-08"),
             pd.Timestamp("2018-01-02"),
             pd.Timestamp("2018-01-06"),
             np.nan,
             np.nan,
         ],
+        [
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            np.nan,
+            pd.Timestamp("2018-01-08", tz="US/Pacific"),
+            pd.Timestamp("2018-01-02", tz="US/Pacific"),
+            pd.Timestamp("2018-01-06", tz="US/Pacific"),
+            np.nan,
+            np.nan,
+        ],
+        [
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            np.nan,
+            pd.Timestamp("2018-01-08") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-02") - pd.Timestamp(0),
+            pd.Timestamp("2018-01-06") - pd.Timestamp(0),
+            np.nan,
+            np.nan,
+        ],
+        [
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            np.nan,
+            pd.Timestamp("2018-01-08").to_period("D"),
+            pd.Timestamp("2018-01-02").to_period("D"),
+            pd.Timestamp("2018-01-06").to_period("D"),
+            np.nan,
+            np.nan,
+        ],
     ],
+    ids=lambda x: type(x[0]),
 )
 @pytest.mark.parametrize(
     "ties_method,ascending,na_option,pct,exp",
 )
 def test_rank_args_missing(grps, vals, ties_method, ascending, na_option, pct, exp):
     key = np.repeat(grps, len(vals))
-    vals = vals * len(grps)
+
+    orig_vals = vals
+    vals = list(vals) * len(grps)
+    if isinstance(orig_vals, np.ndarray):
+        vals = np.array(vals, dtype=orig_vals.dtype)
+
     df = DataFrame({"key": key, "val": vals})
     result = df.groupby("key").rank(
         method=ties_method, ascending=ascending, na_option=na_option, pct=pct
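The `how in ["rank"]` early return above is the heart of the fix: rank output is already a float64 array of rank values, so it must not be round-tripped through _simple_new back into the original datetimelike dtype. A sketch of the user-visible change (expected post-fix behavior with the default method="average", matching what the new tests assert):

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "a"],
            "ts": pd.to_datetime(
                ["2018-01-02", "2018-01-08", "2018-01-06"]
            ).tz_localize("US/Pacific"),
        }
    )

    result = df.groupby("key").rank()
    # Previously the float ranks were reinterpreted as nanosecond offsets in
    # datetime64[ns, US/Pacific]; now they stay plain float64.
    print(result["ts"].tolist())  # [1.0, 3.0, 2.0]
    print(result["ts"].dtype)     # float64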
From 86c9ce8c690d9832407e9afec2a808303bdfa6ab Mon Sep 17 00:00:00 2001
From: Brock
Date: Tue, 1 Dec 2020 18:17:07 -0800
Subject: [PATCH 4/4] whatsnew

---
 doc/source/whatsnew/v1.2.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 24db70481c136..2add16ae8ec8f 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -766,6 +766,7 @@ Groupby/resample/rolling
 - Bug in :meth:`DataFrame.groupby` dropped ``nan`` groups from result with ``dropna=False`` when grouping over a single column (:issue:`35646`, :issue:`35542`)
 - Bug in :meth:`.DataFrameGroupBy.head`, :meth:`.DataFrameGroupBy.tail`, :meth:`SeriesGroupBy.head`, and :meth:`SeriesGroupBy.tail` would raise when used with ``axis=1`` (:issue:`9772`)
 - Bug in :meth:`.DataFrameGroupBy.transform` would raise when used with ``axis=1`` and a transformation kernel (e.g. "shift") (:issue:`36308`)
+- Bug in :meth:`.DataFrameGroupBy.rank` with ``datetime64tz`` or period dtype incorrectly casting results to those dtypes instead of returning ``float64`` dtype (:issue:`38187`)
 
 Reshaping
 ^^^^^^^^^
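For reference, the period dtype named in the whatsnew entry behaves the same way after this series (a sketch of expected behavior, not output captured from the patch):

    import pandas as pd

    df = pd.DataFrame(
        {
            "key": ["a", "a", "a"],
            "per": pd.PeriodIndex(["2018-01-02", "2018-01-08", "2018-01-06"], freq="D"),
        }
    )

    # Ranks come back as float64 rather than being cast back to period[D].
    print(df.groupby("key").rank()["per"].tolist())  # [1.0, 3.0, 2.0]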