From c6f894b7a1bbc3e384f1e4fecaa9678858132ad0 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 22 Jun 2022 09:53:36 -0700 Subject: [PATCH 1/9] start arrow groupby tests --- pandas/tests/extension/test_arrow.py | 78 ++++++++++++++++++++++++++++ 1 file changed, 78 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9eeaf39959f29..8d5354e3247b2 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -105,6 +105,53 @@ def all_data(request, data, data_missing): return data_missing +@pytest.fixture +def data_for_grouping(dtype): + """ + Data for factorization, grouping, and unique tests. + + Expected to be like [B, B, NA, NA, A, A, B, C] + + Where A < B < C and NA is missing + """ + pa_dtype = dtype.pyarrow_dtype + if pa.types.is_boolean(pa_dtype): + A = False + B = True + C = True + elif pa.types.is_floating(pa_dtype): + A = -1.1 + B = 0.0 + C = 1.1 + elif pa.types.is_signed_integer(pa_dtype): + A = -1 + B = 0 + C = 1 + elif pa.types.is_unsigned_integer(pa_dtype): + A = 0 + B = 1 + C = 10 + elif pa.types.is_date(pa_dtype): + A = date(1999, 12, 31) + B = date(2010, 1, 1) + C = date(2022, 1, 1) + elif pa.types.is_timestamp(pa_dtype): + A = datetime(1999, 1, 1, 1, 1, 1, 1) + B = datetime(2020, 1, 1) + C = datetime(2020, 1, 1, 1) + elif pa.types.is_duration(pa_dtype): + A = timedelta(-1) + B = timedelta(0) + C = timedelta(1, 4) + elif pa.types.is_time(pa_dtype): + A = time(0, 0) + B = time(0, 12) + C = time(12, 12) + else: + raise NotImplementedError + return pd.array([B, B, None, None, A, A, B, C], dtype=dtype) + + @pytest.fixture def na_value(): """The scalar missing value for this type. Default 'None'""" @@ -218,6 +265,37 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data): super().test_loc_iloc_frame_single_dtype(data) +class TestBaseGroupby(base.BaseGroupbyTests): + @pytest.mark.parametrize("as_index", [True, False]) + def test_groupby_extension_agg(self, as_index, data_for_grouping, request): + pa_dtype = data_for_grouping.dtype.pyarrow_dtype + if pa.types.is_boolean(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=ValueError, + reason=f"{pa_dtype} only has 2 unique possible values", + ) + ) + elif pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"pyarrow doesn't support factorizing {pa_dtype}", + ) + ) + elif as_index is True and ( + pa.types.is_date(pa_dtype) + or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + ): + request.node.add_marker( + pytest.mark.xfail( + raises=AttributeError, + reason="GH 34986", + ) + ) + super().test_groupby_extension_agg(as_index, data_for_grouping) + + class TestBaseDtype(base.BaseDtypeTests): def test_construct_from_string_own_name(self, dtype, request): pa_dtype = dtype.pyarrow_dtype From 98c2875959cd0effddbfdd31ec518953aebbc9cf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 23 Jun 2022 21:35:16 -0700 Subject: [PATCH 2/9] Add more tests --- pandas/tests/extension/test_arrow.py | 35 ++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index b16ab223ba23f..9aeb667b20ef8 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -254,6 +254,41 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data): class TestBaseGroupby(base.BaseGroupbyTests): + def test_groupby_extension_apply( + self, data_for_grouping, groupby_apply_op, request + ): + pa_dtype = data_for_grouping.dtype.pyarrow_dtype + # Is there way to get the "Series" ID for groupby_apply_op? + is_series = isinstance(groupby_apply_op([]), pd.Series) + if pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"pyarrow doesn't support factorizing {pa_dtype}", + ) + ) + elif not is_series and ( + pa.types.is_date(pa_dtype) + or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + ): + request.node.add_marker( + pytest.mark.xfail( + raises=AttributeError, + reason="GH 34986", + ) + ) + super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) + + def test_in_numeric_groupby(self, data_for_grouping, request): + pa_dtype = data_for_grouping.dtype.pyarrow_dtype + if pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason="ArrowExtensionArray doesn't support .sum() yet.", + ) + ) + super().test_in_numeric_groupby(data_for_grouping) + @pytest.mark.parametrize("as_index", [True, False]) def test_groupby_extension_agg(self, as_index, data_for_grouping, request): pa_dtype = data_for_grouping.dtype.pyarrow_dtype From a8e5c8ad263a2403d41e97f7bb25f1881c0e0a8f Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 24 Jun 2022 16:43:23 -0700 Subject: [PATCH 3/9] User better id checking --- pandas/tests/extension/test_arrow.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 9428e491bc7ac..642d412966439 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -271,8 +271,8 @@ def test_groupby_extension_apply( self, data_for_grouping, groupby_apply_op, request ): pa_dtype = data_for_grouping.dtype.pyarrow_dtype - # Is there way to get the "Series" ID for groupby_apply_op? - is_series = isinstance(groupby_apply_op([]), pd.Series) + # Is there a better way to get the "series" ID for groupby_apply_op? + is_series = "series" in request.node.nodeid if pa.types.is_duration(pa_dtype): request.node.add_marker( pytest.mark.xfail( @@ -849,8 +849,8 @@ def test_setitem_slice_array(self, data, request): def test_setitem_with_expansion_dataframe_column( self, data, full_indexer, using_array_manager, request ): - # Is there a way to get the full_indexer id "null_slice"? - is_null_slice = full_indexer(pd.Series(dtype=object)) == slice(None) + # Is there a better way to get the full_indexer id "null_slice"? + is_null_slice = "null_slice" in request.node.nodeid tz = getattr(data.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC") and not is_null_slice: request.node.add_marker( From e53c6afa7943576419f540cd5cd8524cc17a3fad Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 26 Jun 2022 17:22:01 -0700 Subject: [PATCH 4/9] Finish groupby tests --- pandas/core/dtypes/missing.py | 5 ++- pandas/tests/extension/test_arrow.py | 67 ++++++++++++++++++++++++---- 2 files changed, 62 insertions(+), 10 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 37b42ad66c027..7fa0db78edfe1 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -754,10 +754,11 @@ def isna_all(arr: ArrayLike) -> bool: chunk_len = max(total_len // 40, 1000) dtype = arr.dtype - if dtype.kind == "f": + is_np_missing_value = na_value_for_dtype(dtype) is not libmissing.NA + if dtype.kind == "f" and is_np_missing_value: checker = nan_checker - elif dtype.kind in ["m", "M"] or dtype.type is Period: + elif (dtype.kind in ["m", "M"] or dtype.type is Period) and is_np_missing_value: # error: Incompatible types in assignment (expression has type # "Callable[[Any], Any]", variable has type "ufunc") checker = lambda x: np.asarray(x.view("i8")) == iNaT # type: ignore[assignment] diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 642d412966439..4aa03199a83b0 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -267,12 +267,56 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data): class TestBaseGroupby(base.BaseGroupbyTests): + def test_groupby_extension_no_sort(self, data_for_grouping, request): + pa_dtype = data_for_grouping.dtype.pyarrow_dtype + if pa.types.is_boolean(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason=f"{pa_dtype} only has 2 unique possible values", + ) + ) + elif pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"pyarrow doesn't support factorizing {pa_dtype}", + ) + ) + elif pa.types.is_date(pa_dtype) or ( + pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None + ): + request.node.add_marker( + pytest.mark.xfail( + raises=AttributeError, + reason="GH 34986", + ) + ) + super().test_groupby_extension_no_sort(data_for_grouping) + + def test_groupby_extension_transform(self, data_for_grouping, request): + pa_dtype = data_for_grouping.dtype.pyarrow_dtype + if pa.types.is_boolean(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason=f"{pa_dtype} only has 2 unique possible values", + ) + ) + elif pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"pyarrow doesn't support factorizing {pa_dtype}", + ) + ) + super().test_groupby_extension_transform(data_for_grouping) + def test_groupby_extension_apply( self, data_for_grouping, groupby_apply_op, request ): pa_dtype = data_for_grouping.dtype.pyarrow_dtype # Is there a better way to get the "series" ID for groupby_apply_op? is_series = "series" in request.node.nodeid + is_object = "object" in request.node.nodeid if pa.types.is_duration(pa_dtype): request.node.add_marker( pytest.mark.xfail( @@ -280,16 +324,23 @@ def test_groupby_extension_apply( reason=f"pyarrow doesn't support factorizing {pa_dtype}", ) ) - elif not is_series and ( - pa.types.is_date(pa_dtype) - or (pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None) + elif pa.types.is_date(pa_dtype) or ( + pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None ): - request.node.add_marker( - pytest.mark.xfail( - raises=AttributeError, - reason="GH 34986", + if is_object: + request.node.add_marker( + pytest.mark.xfail( + raises=TypeError, + reason="GH 47514: _concat_datetime expects axis arg.", + ) + ) + elif not is_series: + request.node.add_marker( + pytest.mark.xfail( + raises=AttributeError, + reason="GH 34986", + ) ) - ) super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op) def test_in_numeric_groupby(self, data_for_grouping, request): From 2816a8319bd913e641a7b0ed8b901b60747cdfe9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 26 Jun 2022 19:22:46 -0700 Subject: [PATCH 5/9] Add xfail for tz data --- pandas/tests/extension/test_arrow.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 4aa03199a83b0..d8b241c53cf3e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -267,6 +267,16 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data): class TestBaseGroupby(base.BaseGroupbyTests): + def test_groupby_agg_extension(self, data_for_grouping, request): + tz = getattr(data.dtype.pyarrow_dtype, "tz", None) + if pa_version_under2p0 and tz not in (None, "UTC"): + request.node.add_marker( + pytest.mark.xfail( + reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}.") + ) + ) + super().test_groupby_agg_extension(data_for_grouping) + def test_groupby_extension_no_sort(self, data_for_grouping, request): pa_dtype = data_for_grouping.dtype.pyarrow_dtype if pa.types.is_boolean(pa_dtype): From d4ba7d14e3e87210ede40c6d400cdb11a2b6bbae Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 26 Jun 2022 19:23:03 -0700 Subject: [PATCH 6/9] Add xfail for tz data --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index d8b241c53cf3e..fcbe592aacd25 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -272,7 +272,7 @@ def test_groupby_agg_extension(self, data_for_grouping, request): if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( pytest.mark.xfail( - reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}.") + reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}." ) ) super().test_groupby_agg_extension(data_for_grouping) From 4b245669edbc17804398b7932b00646b1aeca5bd Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sun, 26 Jun 2022 21:09:22 -0700 Subject: [PATCH 7/9] Copy paste error --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index fcbe592aacd25..eec22013f116f 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -268,7 +268,7 @@ def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data): class TestBaseGroupby(base.BaseGroupbyTests): def test_groupby_agg_extension(self, data_for_grouping, request): - tz = getattr(data.dtype.pyarrow_dtype, "tz", None) + tz = getattr(data_for_grouping.dtype.pyarrow_dtype, "tz", None) if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( pytest.mark.xfail( From b1f8e87f8f5b7cd1c4a5993f4cd8c40b9646c369 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 29 Jun 2022 10:20:31 -0700 Subject: [PATCH 8/9] Check for numpy dtype first --- pandas/core/dtypes/missing.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 7fa0db78edfe1..0b1e47c420ba9 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -754,11 +754,10 @@ def isna_all(arr: ArrayLike) -> bool: chunk_len = max(total_len // 40, 1000) dtype = arr.dtype - is_np_missing_value = na_value_for_dtype(dtype) is not libmissing.NA - if dtype.kind == "f" and is_np_missing_value: + if dtype.kind == "f" and isinstance(dtype, np.dtype): checker = nan_checker - elif (dtype.kind in ["m", "M"] or dtype.type is Period) and is_np_missing_value: + elif dtype.kind in ["m", "M"] or dtype.type is Period: # error: Incompatible types in assignment (expression has type # "Callable[[Any], Any]", variable has type "ufunc") checker = lambda x: np.asarray(x.view("i8")) == iNaT # type: ignore[assignment] From fa7091d44b6184ea8c18de252e461eb5d8faefcc Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 29 Jun 2022 14:22:59 -0700 Subject: [PATCH 9/9] Now modify nat case --- pandas/core/dtypes/missing.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/pandas/core/dtypes/missing.py b/pandas/core/dtypes/missing.py index 0b1e47c420ba9..e809e761ebbb2 100644 --- a/pandas/core/dtypes/missing.py +++ b/pandas/core/dtypes/missing.py @@ -41,6 +41,7 @@ ) from pandas.core.dtypes.dtypes import ( CategoricalDtype, + DatetimeTZDtype, ExtensionDtype, IntervalDtype, PeriodDtype, @@ -757,7 +758,11 @@ def isna_all(arr: ArrayLike) -> bool: if dtype.kind == "f" and isinstance(dtype, np.dtype): checker = nan_checker - elif dtype.kind in ["m", "M"] or dtype.type is Period: + elif ( + (isinstance(dtype, np.dtype) and dtype.kind in ["m", "M"]) + or isinstance(dtype, DatetimeTZDtype) + or dtype.type is Period + ): # error: Incompatible types in assignment (expression has type # "Callable[[Any], Any]", variable has type "ufunc") checker = lambda x: np.asarray(x.view("i8")) == iNaT # type: ignore[assignment]