From 11f1213f878518f095981c6ef77eab28c0777124 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Wed, 29 Jun 2022 17:08:30 -0700 Subject: [PATCH 1/7] ENH/TST: Add BaseMethodsTests tests for ArrowExtensionArray --- pandas/core/base.py | 10 +- pandas/tests/extension/base/methods.py | 4 +- pandas/tests/extension/test_arrow.py | 248 +++++++++++++++++++++++++ 3 files changed, 256 insertions(+), 6 deletions(-) diff --git a/pandas/core/base.py b/pandas/core/base.py index 3d18194d14bec..33dbc292660d0 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -983,10 +983,12 @@ def unique(self): if not isinstance(values, np.ndarray): result: ArrayLike = values.unique() - if self.dtype.kind in ["m", "M"] and isinstance(self, ABCSeries): - # GH#31182 Series._values returns EA, unpack for backward-compat - if getattr(self.dtype, "tz", None) is None: - result = np.asarray(result) + if ( + isinstance(self.dtype, np.dtype) and self.dtype.kind in ["m", "M"] + ) and isinstance(self, ABCSeries): + # GH#31182 Series._values returns EA + # unpack numpy datetime for backward-compat + result = np.asarray(result) else: result = unique1d(values) diff --git a/pandas/tests/extension/base/methods.py b/pandas/tests/extension/base/methods.py index 6e9130b18e94f..838c9f5b8a35f 100644 --- a/pandas/tests/extension/base/methods.py +++ b/pandas/tests/extension/base/methods.py @@ -5,6 +5,7 @@ import pytest from pandas.core.dtypes.common import is_bool_dtype +from pandas.core.dtypes.missing import na_value_for_dtype import pandas as pd import pandas._testing as tm @@ -49,8 +50,7 @@ def test_value_counts_with_normalize(self, data): else: expected = pd.Series(0.0, index=result.index) expected[result > 0] = 1 / len(values) - - if isinstance(data.dtype, pd.core.dtypes.dtypes.BaseMaskedDtype): + if na_value_for_dtype(data.dtype) is pd.NA: # TODO(GH#44692): avoid special-casing expected = expected.astype("Float64") diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 5e2006811f6eb..ccea785ae51ad 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -106,6 +106,77 @@ def all_data(request, data, data_missing): return data_missing +@pytest.fixture +def data_for_grouping(dtype): + """ + Data for factorization, grouping, and unique tests. + Expected to be like [B, B, NA, NA, A, A, B, C] + Where A < B < C and NA is missing + """ + pa_dtype = dtype.pyarrow_dtype + if pa.types.is_boolean(pa_dtype): + A = False + B = True + C = True + elif pa.types.is_floating(pa_dtype): + A = -1.1 + B = 0.0 + C = 1.1 + elif pa.types.is_signed_integer(pa_dtype): + A = -1 + B = 0 + C = 1 + elif pa.types.is_unsigned_integer(pa_dtype): + A = 0 + B = 1 + C = 10 + elif pa.types.is_date(pa_dtype): + A = date(1999, 12, 31) + B = date(2010, 1, 1) + C = date(2022, 1, 1) + elif pa.types.is_timestamp(pa_dtype): + A = datetime(1999, 1, 1, 1, 1, 1, 1) + B = datetime(2020, 1, 1) + C = datetime(2020, 1, 1, 1) + elif pa.types.is_duration(pa_dtype): + A = timedelta(-1) + B = timedelta(0) + C = timedelta(1, 4) + elif pa.types.is_time(pa_dtype): + A = time(0, 0) + B = time(0, 12) + C = time(12, 12) + else: + raise NotImplementedError + return pd.array([B, B, None, None, A, A, B, C], dtype=dtype) + + +@pytest.fixture +def data_for_sorting(data_for_grouping): + """ + Length-3 array with a known sort order. + + This should be three items [B, C, A] with + A < B < C + """ + return type(data_for_grouping)._from_sequence( + [data_for_grouping[0], data_for_grouping[7], data_for_grouping[4]] + ) + + +@pytest.fixture +def data_missing_for_sorting(data_for_grouping): + """ + Length-3 array with a known sort order. + + This should be three items [B, NA, A] with + A < B and NA missing. + """ + return type(data_for_grouping)._from_sequence( + [data_for_grouping[0], data_for_grouping[2], data_for_grouping[4]] + ) + + @pytest.fixture def na_value(): """The scalar missing value for this type. Default 'None'""" @@ -814,6 +885,183 @@ def test_EA_types(self, engine, data, request): super().test_EA_types(engine, data) +class TestBaseMethods(base.BaseMethodsTests): + @pytest.mark.parametrize("dropna", [True, False]) + def test_value_counts(self, all_data, dropna, request): + pa_dtype = all_data.dtype.pyarrow_dtype + if pa.types.is_date(pa_dtype) or ( + pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None + ): + request.node.add_marker( + pytest.mark.xfail( + raises=AttributeError, + reason="GH 34986", + ) + ) + elif pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"value_count has no kernel for {pa_dtype}", + ) + ) + super().test_value_counts(all_data, dropna) + + def test_value_counts_with_normalize(self, data, request): + pa_dtype = data.dtype.pyarrow_dtype + if pa.types.is_date(pa_dtype) or ( + pa.types.is_timestamp(pa_dtype) and pa_dtype.tz is None + ): + request.node.add_marker( + pytest.mark.xfail( + raises=AttributeError, + reason="GH 34986", + ) + ) + elif pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"value_count has no pyarrow kernel for {pa_dtype}", + ) + ) + super().test_value_counts_with_normalize(data) + + def test_argmin_argmax( + self, data_for_sorting, data_missing_for_sorting, na_value, request + ): + pa_dtype = data_for_sorting.dtype.pyarrow_dtype + if pa.types.is_boolean(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason=f"{pa_dtype} only has 2 unique possible values", + ) + ) + super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values(self, data_for_sorting, ascending, sort_by_key, request): + pa_dtype = data_for_sorting.dtype.pyarrow_dtype + if pa.types.is_duration(pa_dtype) and not ascending: + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=( + f"unique has no pyarrow kernel " + f"for {pa_dtype} when ascending={ascending}" + ), + ) + ) + super().test_sort_values(data_for_sorting, ascending, sort_by_key) + + @pytest.mark.parametrize("ascending", [True, False]) + def test_sort_values_frame(self, data_for_sorting, ascending, request): + pa_dtype = data_for_sorting.dtype.pyarrow_dtype + if pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=( + f"dictionary_encode has no pyarrow kernel " + f"for {pa_dtype} when ascending={ascending}" + ), + ) + ) + super().test_sort_values_frame(data_for_sorting, ascending) + + @pytest.mark.parametrize("box", [pd.Series, lambda x: x]) + @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) + def test_unique(self, data, box, method, request): + pa_dtype = data.dtype.pyarrow_dtype + if pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"unique has no pyarrow kernel for {pa_dtype}.", + ) + ) + super().test_unique(data, box, method) + + @pytest.mark.parametrize("na_sentinel", [-1, -2]) + def test_factorize(self, data_for_grouping, na_sentinel, request): + pa_dtype = data_for_grouping.dtype.pyarrow_dtype + if pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"dictionary_encode has no pyarrow kernel for {pa_dtype}", + ) + ) + elif pa.types.is_boolean(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason=f"{pa_dtype} only has 2 unique possible values", + ) + ) + super().test_factorize(data_for_grouping, na_sentinel) + + @pytest.mark.parametrize("na_sentinel", [-1, -2]) + def test_factorize_equivalence(self, data_for_grouping, na_sentinel, request): + pa_dtype = data_for_grouping.dtype.pyarrow_dtype + if pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"dictionary_encode has no pyarrow kernel for {pa_dtype}", + ) + ) + super().test_factorize_equivalence(data_for_grouping, na_sentinel) + + def test_factorize_empty(self, data, request): + pa_dtype = data.dtype.pyarrow_dtype + if pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"dictionary_encode has no pyarrow kernel for {pa_dtype}", + ) + ) + super().test_factorize_empty(data) + + @pytest.mark.xfail( + reason="result dtype pyarrow[bool] better than expected dtype object" + ) + def test_combine_le(self, data_repeated): + super().test_combine_le(data_repeated) + + def test_combine_add(self, data_repeated, request): + pa_dtype = next(data_repeated(1)).dtype.pyarrow_dtype + if pa.types.is_temporal(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=TypeError, + reason=f"{pa_dtype} cannot be added to {pa_dtype}", + ) + ) + super().test_combine_add(data_repeated) + + def test_searchsorted(self, data_for_sorting, as_series, request): + pa_dtype = data_for_sorting.dtype.pyarrow_dtype + if pa.types.is_boolean(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason=f"{pa_dtype} only has 2 unique possible values", + ) + ) + super().test_searchsorted(data_for_sorting, as_series) + + def test_where_series(self, data, na_value, as_frame, request): + pa_dtype = data.dtype.pyarrow_dtype + if pa.types.is_temporal(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"Unsupported cast from double to {pa_dtype}", + ) + ) + super().test_where_series(data, na_value, as_frame) + + def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): with pytest.raises(NotImplementedError, match="Passing pyarrow type"): ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") From 36b120afecb11d5134d5dc0f8760ab1649172e6a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 30 Jun 2022 08:23:32 -0700 Subject: [PATCH 2/7] Passing test now --- pandas/tests/extension/test_string.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py index 8a8bdee90e467..6cea21b6672d8 100644 --- a/pandas/tests/extension/test_string.py +++ b/pandas/tests/extension/test_string.py @@ -167,9 +167,7 @@ def test_reduce_series_numeric(self, data, all_numeric_reductions, skipna): class TestMethods(base.BaseMethodsTests): - @pytest.mark.xfail(reason="returns nullable: GH 44692") - def test_value_counts_with_normalize(self, data): - super().test_value_counts_with_normalize(data) + pass class TestCasting(base.BaseCastingTests): From 24670cda34cdc59bcbbd94becdc23db5206dbcc2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 30 Jun 2022 09:50:32 -0700 Subject: [PATCH 3/7] add xfails for arraymanager --- pandas/tests/extension/test_arrow.py | 60 +++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index ccea785ae51ad..eb4f4ae4b1dec 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1023,6 +1023,56 @@ def test_factorize_empty(self, data, request): ) super().test_factorize_empty(data) + def test_fillna_copy_frame(self, data_missing, request, using_array_manager): + pa_dtype = data.dtype.pyarrow_dtype + if using_array_manager and pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason=f"Checking ndim when using arraymanager with {pa_dtype}" + ) + ) + super().test_fillna_copy_frame(data_missing) + + def test_fillna_copy_series(self, data_missing, request, using_array_manager): + pa_dtype = data_missing.dtype.pyarrow_dtype + if using_array_manager and pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason=f"Checking ndim when using arraymanager with {pa_dtype}" + ) + ) + super().test_fillna_copy_series(data_missing) + + def test_combine_first(self, data, request, using_array_manager): + pa_dtype = data.dtype.pyarrow_dtype + if using_array_manager and pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason=f"Checking ndim when using arraymanager with {pa_dtype}" + ) + ) + super().test_combine_first(data) + + @pytest.mark.parametrize("frame", [True, False]) + @pytest.mark.parametrize( + "periods, indices", + [(-2, [2, 3, 4, -1, -1]), (0, [0, 1, 2, 3, 4]), (2, [-1, -1, 0, 1, 2])], + ) + def test_container_shift( + self, data, frame, periods, indices, request, using_array_manager + ): + pa_dtype = data.dtype.pyarrow_dtype + if using_array_manager and pa.types.is_duration(pa_dtype) and periods == 2: + request.node.add_marker( + pytest.mark.xfail( + reason=( + f"Checking ndim when using arraymanager with " + f"{pa_dtype} and periods={periods}" + ) + ) + ) + super().test_container_shift(data, frame, periods, indices) + @pytest.mark.xfail( reason="result dtype pyarrow[bool] better than expected dtype object" ) @@ -1050,9 +1100,15 @@ def test_searchsorted(self, data_for_sorting, as_series, request): ) super().test_searchsorted(data_for_sorting, as_series) - def test_where_series(self, data, na_value, as_frame, request): + def test_where_series(self, data, na_value, as_frame, request, using_array_manager): pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_temporal(pa_dtype): + if using_array_manager and pa.types.is_duration(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + reason=f"Checking ndim when using arraymanager with {pa_dtype}" + ) + ) + elif pa.types.is_temporal(pa_dtype): request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, From 9d29395ea9fa338ddc6568f7a316e9bdeab1ceee Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 30 Jun 2022 10:39:37 -0700 Subject: [PATCH 4/7] Fix typo --- pandas/tests/extension/test_arrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index eb4f4ae4b1dec..418d192444cd1 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1024,7 +1024,7 @@ def test_factorize_empty(self, data, request): super().test_factorize_empty(data) def test_fillna_copy_frame(self, data_missing, request, using_array_manager): - pa_dtype = data.dtype.pyarrow_dtype + pa_dtype = data_missing.dtype.pyarrow_dtype if using_array_manager and pa.types.is_duration(pa_dtype): request.node.add_marker( pytest.mark.xfail( From 30cddcf3d015b48bdc1184e6d92470c9147dbcd4 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 30 Jun 2022 10:41:40 -0700 Subject: [PATCH 5/7] Trigger CI From f1fc6644f815219db3db8ef5a0757d572e963390 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Thu, 30 Jun 2022 15:57:04 -0700 Subject: [PATCH 6/7] Add xfails for min version and datamanger --- pandas/tests/extension/test_arrow.py | 49 +++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 418d192444cd1..db6c508b65652 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -551,7 +551,7 @@ def test_setitem_loc_scalar_single(self, data, using_array_manager, request): if pa_version_under2p0 and tz not in (None, "UTC"): request.node.add_marker( pytest.mark.xfail( - reason=(f"Not supported by pyarrow < 2.0 with timestamp type {tz}") + reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}" ) ) elif using_array_manager and pa.types.is_duration(data.dtype.pyarrow_dtype): @@ -942,7 +942,7 @@ def test_argmin_argmax( @pytest.mark.parametrize("ascending", [True, False]) def test_sort_values(self, data_for_sorting, ascending, sort_by_key, request): pa_dtype = data_for_sorting.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype) and not ascending: + if pa.types.is_duration(pa_dtype) and not ascending and not pa_version_under2p0: request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, @@ -973,7 +973,7 @@ def test_sort_values_frame(self, data_for_sorting, ascending, request): @pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique]) def test_unique(self, data, box, method, request): pa_dtype = data.dtype.pyarrow_dtype - if pa.types.is_duration(pa_dtype): + if pa.types.is_duration(pa_dtype) and not pa_version_under2p0: request.node.add_marker( pytest.mark.xfail( raises=pa.ArrowNotImplementedError, @@ -1043,14 +1043,55 @@ def test_fillna_copy_series(self, data_missing, request, using_array_manager): ) super().test_fillna_copy_series(data_missing) + def test_shift_fill_value(self, data, request): + pa_dtype = data.dtype.pyarrow_dtype + tz = getattr(pa_dtype, "tz", None) + if pa_version_under2p0 and tz not in (None, "UTC"): + request.node.add_marker( + pytest.mark.xfail( + reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}" + ) + ) + super().test_shift_fill_value(data) + + @pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]]) + def test_repeat(self, data, repeats, as_series, use_numpy, request): + pa_dtype = data.dtype.pyarrow_dtype + tz = getattr(pa_dtype, "tz", None) + if pa_version_under2p0 and tz not in (None, "UTC"): + request.node.add_marker( + pytest.mark.xfail( + reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}" + ) + ) + super().test_repeat(data, repeats, as_series, use_numpy) + + def test_insert(self, data, request): + pa_dtype = data.dtype.pyarrow_dtype + tz = getattr(pa_dtype, "tz", None) + if pa_version_under2p0 and tz not in (None, "UTC"): + request.node.add_marker( + pytest.mark.xfail( + reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}" + ) + ) + super().test_insert(data) + def test_combine_first(self, data, request, using_array_manager): pa_dtype = data.dtype.pyarrow_dtype + tz = getattr(pa_dtype, "tz", None) if using_array_manager and pa.types.is_duration(pa_dtype): request.node.add_marker( pytest.mark.xfail( reason=f"Checking ndim when using arraymanager with {pa_dtype}" ) ) + elif pa_version_under2p0 and tz not in (None, "UTC"): + request.node.add_marker( + pytest.mark.xfail( + reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}" + ) + ) super().test_combine_first(data) @pytest.mark.parametrize("frame", [True, False]) @@ -1062,7 +1103,7 @@ def test_container_shift( self, data, frame, periods, indices, request, using_array_manager ): pa_dtype = data.dtype.pyarrow_dtype - if using_array_manager and pa.types.is_duration(pa_dtype) and periods == 2: + if using_array_manager and pa.types.is_duration(pa_dtype) and periods == -2: request.node.add_marker( pytest.mark.xfail( reason=( From 5858824186f9001fd8012843770d1d9657c52deb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 1 Jul 2022 15:43:57 -0700 Subject: [PATCH 7/7] Adjust more tests --- pandas/tests/extension/test_arrow.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index db6c508b65652..680a5465d9b41 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1058,10 +1058,13 @@ def test_shift_fill_value(self, data, request): def test_repeat(self, data, repeats, as_series, use_numpy, request): pa_dtype = data.dtype.pyarrow_dtype tz = getattr(pa_dtype, "tz", None) - if pa_version_under2p0 and tz not in (None, "UTC"): + if pa_version_under2p0 and tz not in (None, "UTC") and repeats != 0: request.node.add_marker( pytest.mark.xfail( - reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}" + reason=( + f"Not supported by pyarrow < 2.0 with " + f"timestamp type {tz} when repeats={repeats}" + ) ) ) super().test_repeat(data, repeats, as_series, use_numpy) @@ -1103,7 +1106,11 @@ def test_container_shift( self, data, frame, periods, indices, request, using_array_manager ): pa_dtype = data.dtype.pyarrow_dtype - if using_array_manager and pa.types.is_duration(pa_dtype) and periods == -2: + if ( + using_array_manager + and pa.types.is_duration(pa_dtype) + and periods in (-2, 2) + ): request.node.add_marker( pytest.mark.xfail( reason=(