From c6f894b7a1bbc3e384f1e4fecaa9678858132ad0 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Wed, 22 Jun 2022 09:53:36 -0700
Subject: [PATCH 1/9] start arrow groupby tests

---
 pandas/tests/extension/test_arrow.py | 78 ++++++++++++++++++++++++++++
 1 file changed, 78 insertions(+)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 9eeaf39959f29..8d5354e3247b2 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -105,6 +105,53 @@ def all_data(request, data, data_missing):
         return data_missing
 
 
+@pytest.fixture
+def data_for_grouping(dtype):
+    """
+    Data for factorization, grouping, and unique tests.
+
+    Expected to be like [B, B, NA, NA, A, A, B, C]
+
+    Where A < B < C and NA is missing
+    """
+    pa_dtype = dtype.pyarrow_dtype
+    if pa.types.is_boolean(pa_dtype):
+        A = False
+        B = True
+        C = True
+    elif pa.types.is_floating(pa_dtype):
+        A = -1.1
+        B = 0.0
+        C = 1.1
+    elif pa.types.is_signed_integer(pa_dtype):
+        A = -1
+        B = 0
+        C = 1
+    elif pa.types.is_unsigned_integer(pa_dtype):
+        A = 0
+        B = 1
+        C = 10
+    elif pa.types.is_date(pa_dtype):
+        A = date(1999, 12, 31)
+        B = date(2010, 1, 1)
+        C = date(2022, 1, 1)
+    elif pa.types.is_timestamp(pa_dtype):
+        A = datetime(1999, 1, 1, 1, 1, 1, 1)
+        B = datetime(2020, 1, 1)
+        C = datetime(2020, 1, 1, 1)
+    elif pa.types.is_duration(pa_dtype):
+        A = timedelta(-1)
+        B = timedelta(0)
+        C = timedelta(1, 4)
+    elif pa.types.is_time(pa_dtype):
+        A = time(0, 0)
+        B = time(0, 12)
+        C = time(12, 12)
+    else:
+        raise NotImplementedError
+    return pd.array([B, B, None, None, A, A, B, C], dtype=dtype)
+
+
 @pytest.fixture
 def na_value():
     """The scalar missing value for this type. Default 'None'"""
@@ -218,6 +265,37 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data):
         super().test_loc_iloc_frame_single_dtype(data)
 
 
+class TestBaseGroupby(base.BaseGroupbyTests):
+    @pytest.mark.parametrize("as_index", [True, False])
+    def test_groupby_extension_agg(self, as_index, data_for_grouping, request):
+        pa_dtype = data_for_grouping.dtype.pyarrow_dtype
+        if pa.types.is_boolean(pa_dtype):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=ValueError,
+                    reason=f"{pa_dtype} only has 2 unique possible values",
+                )
+            )
+        elif pa.types.is_duration(pa_dtype):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowNotImplementedError,
+                    reason=f"pyarrow doesn't support factorizing {pa_dtype}",
+                )
+            )
+        elif as_index is True and (
+            pa.types.is_date(pa_dtype)
+            or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None)
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=AttributeError,
+                    reason="GH 34986",
+                )
+            )
+        super().test_groupby_extension_agg(as_index, data_for_grouping)
+
+
 class TestBaseDtype(base.BaseDtypeTests):
     def test_construct_from_string_own_name(self, dtype, request):
         pa_dtype = dtype.pyarrow_dtype

From 98c2875959cd0effddbfdd31ec518953aebbc9cf Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Thu, 23 Jun 2022 21:35:16 -0700
Subject: [PATCH 2/9] Add more tests

---
 pandas/tests/extension/test_arrow.py | 35 ++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index b16ab223ba23f..9aeb667b20ef8 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -254,6 +254,41 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data):
 
 
 class TestBaseGroupby(base.BaseGroupbyTests):
+    def test_groupby_extension_apply(
+        self, data_for_grouping, groupby_apply_op, request
+    ):
+        pa_dtype = data_for_grouping.dtype.pyarrow_dtype
+        # Is there way to get the "Series" ID for groupby_apply_op?
+        is_series = isinstance(groupby_apply_op([]), pd.Series)
+        if pa.types.is_duration(pa_dtype):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowNotImplementedError,
+                    reason=f"pyarrow doesn't support factorizing {pa_dtype}",
+                )
+            )
+        elif not is_series and (
+            pa.types.is_date(pa_dtype)
+            or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None)
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=AttributeError,
+                    reason="GH 34986",
+                )
+            )
+        super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
+
+    def test_in_numeric_groupby(self, data_for_grouping, request):
+        pa_dtype = data_for_grouping.dtype.pyarrow_dtype
+        if pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    reason="ArrowExtensionArray doesn't support .sum() yet.",
+                )
+            )
+        super().test_in_numeric_groupby(data_for_grouping)
+
     @pytest.mark.parametrize("as_index", [True, False])
     def test_groupby_extension_agg(self, as_index, data_for_grouping, request):
         pa_dtype = data_for_grouping.dtype.pyarrow_dtype

From a8e5c8ad263a2403d41e97f7bb25f1881c0e0a8f Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Fri, 24 Jun 2022 16:43:23 -0700
Subject: [PATCH 3/9] Use better id checking

---
 pandas/tests/extension/test_arrow.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 9428e491bc7ac..642d412966439 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -271,8 +271,8 @@ def test_groupby_extension_apply(
         self, data_for_grouping, groupby_apply_op, request
     ):
         pa_dtype = data_for_grouping.dtype.pyarrow_dtype
-        # Is there way to get the "Series" ID for groupby_apply_op?
-        is_series = isinstance(groupby_apply_op([]), pd.Series)
+        # Is there a better way to get the "series" ID for groupby_apply_op?
+        is_series = "series" in request.node.nodeid
         if pa.types.is_duration(pa_dtype):
             request.node.add_marker(
                 pytest.mark.xfail(
@@ -849,8 +849,8 @@ def test_setitem_slice_array(self, data, request):
     def test_setitem_with_expansion_dataframe_column(
         self, data, full_indexer, using_array_manager, request
     ):
-        # Is there a way to get the full_indexer id "null_slice"?
-        is_null_slice = full_indexer(pd.Series(dtype=object)) == slice(None)
+        # Is there a better way to get the full_indexer id "null_slice"?
+        is_null_slice = "null_slice" in request.node.nodeid
         tz = getattr(data.dtype.pyarrow_dtype, "tz", None)
         if pa_version_under2p0 and tz not in (None, "UTC") and not is_null_slice:
             request.node.add_marker(

From e53c6afa7943576419f540cd5cd8524cc17a3fad Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Sun, 26 Jun 2022 17:22:01 -0700
Subject: [PATCH 4/9] Finish groupby tests

---
 pandas/core/dtypes/missing.py        |  5 ++-
 pandas/tests/extension/test_arrow.py | 67 ++++++++++++++++++++++++----
 2 files changed, 62 insertions(+), 10 deletions(-)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 37b42ad66c027..7fa0db78edfe1 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -754,10 +754,11 @@ def isna_all(arr: ArrayLike) -> bool:
     chunk_len = max(total_len // 40, 1000)
 
     dtype = arr.dtype
-    if dtype.kind == "f":
+    is_np_missing_value = na_value_for_dtype(dtype) is not libmissing.NA
+    if dtype.kind == "f" and is_np_missing_value:
         checker = nan_checker
 
-    elif dtype.kind in ["m", "M"] or dtype.type is Period:
+    elif (dtype.kind in ["m", "M"] or dtype.type is Period) and is_np_missing_value:
         # error: Incompatible types in assignment (expression has type
         # "Callable[[Any], Any]", variable has type "ufunc")
         checker = lambda x: np.asarray(x.view("i8")) == iNaT  # type: ignore[assignment]
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 642d412966439..4aa03199a83b0 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -267,12 +267,56 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data):
 
 
 class TestBaseGroupby(base.BaseGroupbyTests):
+    def test_groupby_extension_no_sort(self, data_for_grouping, request):
+        pa_dtype = data_for_grouping.dtype.pyarrow_dtype
+        if pa.types.is_boolean(pa_dtype):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    reason=f"{pa_dtype} only has 2 unique possible values",
+                )
+            )
+        elif pa.types.is_duration(pa_dtype):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowNotImplementedError,
+                    reason=f"pyarrow doesn't support factorizing {pa_dtype}",
+                )
+            )
+        elif pa.types.is_date(pa_dtype) or (
+            pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None
+        ):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=AttributeError,
+                    reason="GH 34986",
+                )
+            )
+        super().test_groupby_extension_no_sort(data_for_grouping)
+
+    def test_groupby_extension_transform(self, data_for_grouping, request):
+        pa_dtype = data_for_grouping.dtype.pyarrow_dtype
+        if pa.types.is_boolean(pa_dtype):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    reason=f"{pa_dtype} only has 2 unique possible values",
+                )
+            )
+        elif pa.types.is_duration(pa_dtype):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    raises=pa.ArrowNotImplementedError,
+                    reason=f"pyarrow doesn't support factorizing {pa_dtype}",
+                )
+            )
+        super().test_groupby_extension_transform(data_for_grouping)
+
     def test_groupby_extension_apply(
         self, data_for_grouping, groupby_apply_op, request
     ):
         pa_dtype = data_for_grouping.dtype.pyarrow_dtype
         # Is there a better way to get the "series" ID for groupby_apply_op?
         is_series = "series" in request.node.nodeid
+        is_object = "object" in request.node.nodeid
         if pa.types.is_duration(pa_dtype):
             request.node.add_marker(
                 pytest.mark.xfail(
@@ -280,16 +324,23 @@ def test_groupby_extension_apply(
                     reason=f"pyarrow doesn't support factorizing {pa_dtype}",
                 )
             )
-        elif not is_series and (
-            pa.types.is_date(pa_dtype)
-            or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None)
+        elif pa.types.is_date(pa_dtype) or (
+            pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None
         ):
-            request.node.add_marker(
-                pytest.mark.xfail(
-                    raises=AttributeError,
-                    reason="GH 34986",
+            if is_object:
+                request.node.add_marker(
+                    pytest.mark.xfail(
+                        raises=TypeError,
+                        reason="GH 47514: _concat_datetime expects axis arg.",
+                    )
+                )
+            elif not is_series:
+                request.node.add_marker(
+                    pytest.mark.xfail(
+                        raises=AttributeError,
+                        reason="GH 34986",
+                    )
                 )
-            )
         super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)
 
     def test_in_numeric_groupby(self, data_for_grouping, request):

From 2816a8319bd913e641a7b0ed8b901b60747cdfe9 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Sun, 26 Jun 2022 19:22:46 -0700
Subject: [PATCH 5/9] Add xfail for tz data

---
 pandas/tests/extension/test_arrow.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index 4aa03199a83b0..d8b241c53cf3e 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -267,6 +267,16 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data):
 
 
 class TestBaseGroupby(base.BaseGroupbyTests):
+    def test_groupby_agg_extension(self, data_for_grouping, request):
+        tz = getattr(data.dtype.pyarrow_dtype, "tz", None)
+        if pa_version_under2p0 and tz not in (None, "UTC"):
+            request.node.add_marker(
+                pytest.mark.xfail(
+                    reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}.")
+                )
+            )
+        super().test_groupby_agg_extension(data_for_grouping)
+
     def test_groupby_extension_no_sort(self, data_for_grouping, request):
         pa_dtype = data_for_grouping.dtype.pyarrow_dtype
         if pa.types.is_boolean(pa_dtype):

From d4ba7d14e3e87210ede40c6d400cdb11a2b6bbae Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Sun, 26 Jun 2022 19:23:03 -0700
Subject: [PATCH 6/9] Remove redundant parentheses around xfail reason

---
 pandas/tests/extension/test_arrow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index d8b241c53cf3e..fcbe592aacd25 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -272,7 +272,7 @@ def test_groupby_agg_extension(self, data_for_grouping, request):
         if pa_version_under2p0 and tz not in (None, "UTC"):
             request.node.add_marker(
                 pytest.mark.xfail(
-                    reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}.")
+                    reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}."
                 )
             )
         super().test_groupby_agg_extension(data_for_grouping)

From 4b245669edbc17804398b7932b00646b1aeca5bd Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Sun, 26 Jun 2022 21:09:22 -0700
Subject: [PATCH 7/9] Copy paste error

---
 pandas/tests/extension/test_arrow.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py
index fcbe592aacd25..eec22013f116f 100644
--- a/pandas/tests/extension/test_arrow.py
+++ b/pandas/tests/extension/test_arrow.py
@@ -268,7 +268,7 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data):
 
 class TestBaseGroupby(base.BaseGroupbyTests):
     def test_groupby_agg_extension(self, data_for_grouping, request):
-        tz = getattr(data.dtype.pyarrow_dtype, "tz", None)
+        tz = getattr(data_for_grouping.dtype.pyarrow_dtype, "tz", None)
         if pa_version_under2p0 and tz not in (None, "UTC"):
             request.node.add_marker(
                 pytest.mark.xfail(

From b1f8e87f8f5b7cd1c4a5993f4cd8c40b9646c369 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Wed, 29 Jun 2022 10:20:31 -0700
Subject: [PATCH 8/9] Check for numpy dtype first

---
 pandas/core/dtypes/missing.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 7fa0db78edfe1..0b1e47c420ba9 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -754,11 +754,10 @@ def isna_all(arr: ArrayLike) -> bool:
     chunk_len = max(total_len // 40, 1000)
 
     dtype = arr.dtype
-    is_np_missing_value = na_value_for_dtype(dtype) is not libmissing.NA
-    if dtype.kind == "f" and is_np_missing_value:
+    if dtype.kind == "f" and isinstance(dtype, np.dtype):
         checker = nan_checker
 
-    elif (dtype.kind in ["m", "M"] or dtype.type is Period) and is_np_missing_value:
+    elif dtype.kind in ["m", "M"] or dtype.type is Period:
         # error: Incompatible types in assignment (expression has type
         # "Callable[[Any], Any]", variable has type "ufunc")
         checker = lambda x: np.asarray(x.view("i8")) == iNaT  # type: ignore[assignment]

From fa7091d44b6184ea8c18de252e461eb5d8faefcc Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <emailformattr@gmail.com>
Date: Wed, 29 Jun 2022 14:22:59 -0700
Subject: [PATCH 9/9] Now modify nat case

---
 pandas/core/dtypes/missing.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py
index 0b1e47c420ba9..e809e761ebbb2 100644
--- a/pandas/core/dtypes/missing.py
+++ b/pandas/core/dtypes/missing.py
@@ -41,6 +41,7 @@
 )
 from pandas.core.dtypes.dtypes import (
     CategoricalDtype,
+    DatetimeTZDtype,
     ExtensionDtype,
     IntervalDtype,
     PeriodDtype,
@@ -757,7 +758,11 @@ def isna_all(arr: ArrayLike) -> bool:
     if dtype.kind == "f" and isinstance(dtype, np.dtype):
         checker = nan_checker
 
-    elif dtype.kind in ["m", "M"] or dtype.type is Period:
+    elif (
+        (isinstance(dtype, np.dtype) and dtype.kind in ["m", "M"])
+        or isinstance(dtype, DatetimeTZDtype)
+        or dtype.type is Period
+    ):
         # error: Incompatible types in assignment (expression has type
         # "Callable[[Any], Any]", variable has type "ufunc")
         checker = lambda x: np.asarray(x.view("i8")) == iNaT  # type: ignore[assignment]