From 4c65875a22e3ff3140be47c74ca897784e2702db Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 15 Jul 2022 13:03:11 -0700 Subject: [PATCH 01/16] Add mode --- pandas/core/arrays/arrow/array.py | 27 ++++++++++++++++++++++++++ pandas/tests/extension/test_arrow.py | 29 ++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 92aedbb836b38..de1e0d7233073 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -608,6 +608,33 @@ def _indexing_key_to_indices( indices = np.arange(n)[key] return indices + def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT: + """ + Returns the mode(s) of the ExtensionArray. + + Always returns `ExtensionArray` even if only one value. + + Parameters + ---------- + dropna : bool, default True + Don't consider counts of NA values. + Not implemented by pyarrow. + + Returns + ------- + same type as self + Sorted, if possible. 
+ """ + modes = pc.mode(self._data, pc.count_distinct(self._data).as_py()) + values = modes.field(0) + counts = modes.field(1) + # counts sorted descending i.e counts[0] = max + mask = pc.equal(counts, counts[0]) + most_common = values.filter(mask) + # error: Incompatible return value type (got "Union[ExtensionArray, + # ndarray[Any, Any]]", expected "ExtensionArrayT") + return type(self)(most_common) # type: ignore[return-value] + def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" # TODO: Make more robust like ArrowStringArray._maybe_convert_setitem_value diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 7e0792a6010a7..e274d3889339e 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1494,3 +1494,32 @@ def test_where_series(self, data, na_value, as_frame, request, using_array_manag def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): with pytest.raises(NotImplementedError, match="Passing pyarrow type"): ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") + + +@pytest.mark.parametrize("dropna", [True, False]) +@pytest.mark.parametrize( + "take_idx, exp_idx", + [[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]], + ids=["multi_mode", "single_mode"], +) +def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request): + pa_dtype = data_for_grouping.dtype.pyarrow_dtype + if pa.types.is_temporal(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"mode not supported by pyarrow for {pa_dtype}", + ) + ) + elif pa.types.is_boolean(pa_dtype) and "multi_mode" in request.node.nodeid: + # https://issues.apache.org/jira/browse/ARROW-17096 + request.node.add_marker( + pytest.mark.xfail( + reason="https://issues.apache.org/jira/browse/ARROW-17096", + ) + ) + data = data_for_grouping.take(take_idx) + ser = pd.Series(data) + result = 
ser.mode(dropna=dropna) + expected = pd.Series(data_for_grouping.take(exp_idx)) + tm.assert_series_equal(result, expected) From 285aee24d1182e29275b3436dc357fe6a6b6167b Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 15 Jul 2022 14:12:39 -0700 Subject: [PATCH 02/16] implement quantile --- pandas/core/arrays/arrow/array.py | 18 ++++++++++++++++++ pandas/tests/extension/test_arrow.py | 27 +++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index de1e0d7233073..29fd23e522c47 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -608,6 +608,24 @@ def _indexing_key_to_indices( indices = np.arange(n)[key] return indices + def _quantile( + self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str + ) -> ArrowExtensionArrayT: + """ + Compute the quantiles of self for each quantile in `qs`. + + Parameters + ---------- + qs : np.ndarray[float64] + interpolation: str + + Returns + ------- + same type as self + """ + result = pc.quantile(self._data, q=qs, interpolation=interpolation) + return type(self)(result) + def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT: """ Returns the mode(s) of the ExtensionArray. 
diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index e274d3889339e..0664567216d85 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1496,6 +1496,33 @@ def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") +@pytest.mark.parametrize( + "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] +) +@pytest.mark.parametrize("quantile", [0.5, [0.5, 0.5]]) +def test_quantile(data, interpolation, quantile, request): + pa_dtype = data.dtype.pyarrow_dtype + if not (pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype)): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"quantile not supported by pyarrow for {pa_dtype}", + ) + ) + data = data.take([0, 0, 0]) + ser = pd.Series(data) + result = ser.quantile(q=quantile, interpolation=interpolation) + if quantile == 0.5: + assert result == data[0] + else: + # Just check the values + result = result.astype("float64[pyarrow]") + expected = pd.Series( + data.take([0, 0]).astype("float64[pyarrow]"), index=[0.5, 0.5] + ) + tm.assert_series_equal(result, expected) + + @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize( "take_idx, exp_idx", From 218fc809e8f66700e75b3fcbe3e0adec6e02967c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 15 Jul 2022 14:25:22 -0700 Subject: [PATCH 03/16] Make note about rank --- pandas/core/arrays/arrow/array.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 29fd23e522c47..c387bfc088adc 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -608,6 +608,8 @@ def _indexing_key_to_indices( indices = np.arange(n)[key] return indices + # TODO: redefine _rank using pc.rank with pyarrow 9.0 + def _quantile( self: 
ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str ) -> ArrowExtensionArrayT: From ecaf2db1d7cae07c2ac85566168dc85b683ba7cb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 15 Jul 2022 15:15:13 -0700 Subject: [PATCH 04/16] Remove typing --- pandas/core/arrays/arrow/array.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c387bfc088adc..07f25c0427f04 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -651,9 +651,7 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra # counts sorted descending i.e counts[0] = max mask = pc.equal(counts, counts[0]) most_common = values.filter(mask) - # error: Incompatible return value type (got "Union[ExtensionArray, - # ndarray[Any, Any]]", expected "ExtensionArrayT") - return type(self)(most_common) # type: ignore[return-value] + return type(self)(most_common) def _maybe_convert_setitem_value(self, value): """Maybe convert value to be pyarrow compatible.""" From 79b85a7c2594085191ae608f917bd1c578460726 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 15 Jul 2022 17:29:43 -0700 Subject: [PATCH 05/16] Add NotImplementedError for unsupported versions --- pandas/core/arrays/arrow/array.py | 7 +++++++ pandas/tests/extension/test_arrow.py | 2 ++ 2 files changed, 9 insertions(+) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 07f25c0427f04..c2afdec935ae1 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -18,6 +18,7 @@ from pandas.compat import ( pa_version_under1p01, pa_version_under2p0, + pa_version_under4p0, pa_version_under5p0, pa_version_under6p0, ) @@ -625,6 +626,10 @@ def _quantile( ------- same type as self """ + if pa_version_under4p0: + raise NotImplementedError( + "quantile only supported for pyarrow version >= 6.0" + ) result = 
pc.quantile(self._data, q=qs, interpolation=interpolation) return type(self)(result) @@ -645,6 +650,8 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra same type as self Sorted, if possible. """ + if pa_version_under6p0: + raise NotImplementedError("mode only supported for pyarrow version >= 6.0") modes = pc.mode(self._data, pc.count_distinct(self._data).as_py()) values = modes.field(0) counts = modes.field(1) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 0664567216d85..efa82cb1d6cee 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -24,6 +24,7 @@ from pandas.compat import ( pa_version_under2p0, pa_version_under3p0, + pa_version_under6p0, ) import pandas as pd @@ -1523,6 +1524,7 @@ def test_quantile(data, interpolation, quantile, request): tm.assert_series_equal(result, expected) +@pytest.mark.xfail(pa_version_under6p0, raises=NotImplementedError) @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize( "take_idx, exp_idx", From 53e4d8a4c91d10078176c8776699f94c69e15b60 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 15 Jul 2022 17:32:52 -0700 Subject: [PATCH 06/16] improve xfails --- pandas/tests/extension/test_arrow.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index efa82cb1d6cee..46526e3a9ab02 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -24,6 +24,7 @@ from pandas.compat import ( pa_version_under2p0, pa_version_under3p0, + pa_version_under4p0, pa_version_under6p0, ) @@ -1497,6 +1498,11 @@ def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") +@pytest.mark.xfail( + pa_version_under4p0, + raises=NotImplementedError, + reason="Unsupported for pyarrow < 4", +) 
@pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] ) @@ -1524,7 +1530,11 @@ def test_quantile(data, interpolation, quantile, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(pa_version_under6p0, raises=NotImplementedError) +@pytest.mark.xfail( + pa_version_under6p0, + raises=NotImplementedError, + reason="Unsupported for pyarrow < 6", +) @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize( "take_idx, exp_idx", From 18be328accc2c90f41508f68407ad728dc23a93d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Fri, 15 Jul 2022 18:59:23 -0700 Subject: [PATCH 07/16] Change notimplemented to performancewarning --- pandas/core/arrays/arrow/_arrow_utils.py | 7 +- pandas/core/arrays/arrow/array.py | 19 ++++-- pandas/tests/extension/test_arrow.py | 83 ++++++++++++------------ 3 files changed, 59 insertions(+), 50 deletions(-) diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index c9666de9f892d..7744993f39ca9 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -19,9 +19,12 @@ def fallback_performancewarning(version: str | None = None) -> None: Raise a PerformanceWarning for falling back to ExtensionArray's non-pyarrow method """ - msg = "Falling back on a non-pyarrow code path which may decrease performance." + msg = ( + "Falling back on a non-pyarrow code path which may decrease performance or " + "not be fully compatible with pyarrow." + ) if version is not None: - msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning." + msg += f" Upgrade to pyarrow >={version} to suppress this warning." 
warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index c2afdec935ae1..1169c05e481d5 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -35,7 +35,10 @@ from pandas.core.algorithms import resolve_na_sentinel from pandas.core.arraylike import OpsMixin -from pandas.core.arrays.base import ExtensionArray +from pandas.core.arrays.base import ( + ExtensionArray, + ExtensionArrayT, +) from pandas.core.indexers import ( check_array_indexer, unpack_tuple_and_ellipses, @@ -613,7 +616,7 @@ def _indexing_key_to_indices( def _quantile( self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str - ) -> ArrowExtensionArrayT: + ) -> ArrowExtensionArrayT | ExtensionArrayT: """ Compute the quantiles of self for each quantile in `qs`. @@ -627,13 +630,14 @@ def _quantile( same type as self """ if pa_version_under4p0: - raise NotImplementedError( - "quantile only supported for pyarrow version >= 6.0" - ) + fallback_performancewarning("4") + return super()._quantile(qs, interpolation) result = pc.quantile(self._data, q=qs, interpolation=interpolation) return type(self)(result) - def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT: + def _mode( + self: ArrowExtensionArrayT, dropna: bool = True + ) -> ArrowExtensionArrayT | ExtensionArrayT: """ Returns the mode(s) of the ExtensionArray. @@ -651,7 +655,8 @@ def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArra Sorted, if possible. 
""" if pa_version_under6p0: - raise NotImplementedError("mode only supported for pyarrow version >= 6.0") + fallback_performancewarning("6") + return super()._mode(dropna) modes = pc.mode(self._data, pc.count_distinct(self._data).as_py()) values = modes.field(0) counts = modes.field(1) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 46526e3a9ab02..ae1cfb43e3d3d 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -10,7 +10,7 @@ classes (if they are relevant for the extension interface for all dtypes), or be added to the array-specific tests in `pandas/tests/arrays/`. """ - +import contextlib from datetime import ( date, datetime, @@ -27,6 +27,7 @@ pa_version_under4p0, pa_version_under6p0, ) +from pandas.errors import PerformanceWarning import pandas as pd import pandas._testing as tm @@ -1498,31 +1499,29 @@ def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") -@pytest.mark.xfail( - pa_version_under4p0, - raises=NotImplementedError, - reason="Unsupported for pyarrow < 4", -) @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] ) @pytest.mark.parametrize("quantile", [0.5, [0.5, 0.5]]) def test_quantile(data, interpolation, quantile, request): - pa_dtype = data.dtype.pyarrow_dtype - if not (pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype)): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"quantile not supported by pyarrow for {pa_dtype}", - ) - ) data = data.take([0, 0, 0]) ser = pd.Series(data) - result = ser.quantile(q=quantile, interpolation=interpolation) - if quantile == 0.5: - assert result == data[0] + if pa_version_under4p0: + with tm.assert_produces_warning(PerformanceWarning): + # Just validate the PerformanceWarning + # ExtensionArray._quantile may not support all pyarrow 
types + with contextlib.suppress(Exception): + ser.quantile(q=quantile, interpolation=interpolation) else: - # Just check the values + pa_dtype = data.dtype.pyarrow_dtype + if not (pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype)): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"quantile not supported by pyarrow for {pa_dtype}", + ) + ) + result = ser.quantile(q=quantile, interpolation=interpolation) result = result.astype("float64[pyarrow]") expected = pd.Series( data.take([0, 0]).astype("float64[pyarrow]"), index=[0.5, 0.5] @@ -1530,11 +1529,6 @@ def test_quantile(data, interpolation, quantile, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail( - pa_version_under6p0, - raises=NotImplementedError, - reason="Unsupported for pyarrow < 6", -) @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize( "take_idx, exp_idx", @@ -1542,23 +1536,30 @@ def test_quantile(data, interpolation, quantile, request): ids=["multi_mode", "single_mode"], ) def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request): - pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_temporal(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"mode not supported by pyarrow for {pa_dtype}", - ) - ) - elif pa.types.is_boolean(pa_dtype) and "multi_mode" in request.node.nodeid: - # https://issues.apache.org/jira/browse/ARROW-17096 - request.node.add_marker( - pytest.mark.xfail( - reason="https://issues.apache.org/jira/browse/ARROW-17096", - ) - ) data = data_for_grouping.take(take_idx) ser = pd.Series(data) - result = ser.mode(dropna=dropna) - expected = pd.Series(data_for_grouping.take(exp_idx)) - tm.assert_series_equal(result, expected) + if pa_version_under6p0: + with tm.assert_produces_warning(PerformanceWarning): + # Just validate the PerformanceWarning + # ExtensionArray._mode may not support all pyarrow types + with 
contextlib.suppress(Exception): + ser.mode(dropna=dropna) + else: + pa_dtype = data_for_grouping.dtype.pyarrow_dtype + if pa.types.is_temporal(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"mode not supported by pyarrow for {pa_dtype}", + ) + ) + elif pa.types.is_boolean(pa_dtype) and "multi_mode" in request.node.nodeid: + # https://issues.apache.org/jira/browse/ARROW-17096 + request.node.add_marker( + pytest.mark.xfail( + reason="https://issues.apache.org/jira/browse/ARROW-17096", + ) + ) + result = ser.mode(dropna=dropna) + expected = pd.Series(data_for_grouping.take(exp_idx)) + tm.assert_series_equal(result, expected) From f7879037312c98ec1a88da7d4737e7f92ce4245a Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 16 Jul 2022 11:32:09 -0700 Subject: [PATCH 08/16] Fix scalar case --- pandas/tests/extension/test_arrow.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index b1512801187f0..8a12aa832f324 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1866,11 +1866,15 @@ def test_quantile(data, interpolation, quantile, request): ) ) result = ser.quantile(q=quantile, interpolation=interpolation) - result = result.astype("float64[pyarrow]") - expected = pd.Series( - data.take([0, 0]).astype("float64[pyarrow]"), index=[0.5, 0.5] - ) - tm.assert_series_equal(result, expected) + if quantile == 0.5: + assert result == data[0] + else: + # Just check the values + result = result.astype("float64[pyarrow]") + expected = pd.Series( + data.take([0, 0]).astype("float64[pyarrow]"), index=[0.5, 0.5] + ) + tm.assert_series_equal(result, expected) @pytest.mark.parametrize("dropna", [True, False]) From 1fa1cfae6fde782d9d7204d4bcec581fce5360c7 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Sat, 16 Jul 2022 20:08:46 -0700 Subject: [PATCH 09/16] Ignore other 
warnings --- pandas/tests/extension/test_arrow.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 8a12aa832f324..57257d62c5126 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1887,7 +1887,9 @@ def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request): data = data_for_grouping.take(take_idx) ser = pd.Series(data) if pa_version_under6p0: - with tm.assert_produces_warning(PerformanceWarning): + with tm.assert_produces_warning( + PerformanceWarning, raise_on_extra_warnings=False + ): # Just validate the PerformanceWarning # ExtensionArray._mode may not support all pyarrow types with contextlib.suppress(Exception): From 8a6586ff82cd66a69422fee6c200791f6aa792ae Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 18 Jul 2022 09:57:24 -0700 Subject: [PATCH 10/16] Print tests to debug min build timeout --- ci/run_tests.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ci/run_tests.sh b/ci/run_tests.sh index e6de5caf955fc..5bff8bb412947 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -24,7 +24,7 @@ if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then XVFB="xvfb-run " fi -PYTEST_CMD="${XVFB}pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +PYTEST_CMD="${XVFB}pytest -v -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" From 1b7e575630d2afb4b40d5d078cf88a4d0b0e487d Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 18 Jul 2022 15:49:35 -0700 Subject: [PATCH 11/16] Raise notimplementederror --- ci/run_tests.sh | 2 +- pandas/core/arrays/arrow/array.py | 8 +-- pandas/tests/extension/test_arrow.py | 90 ++++++++++++---------------- 3 files changed, 42 insertions(+), 58 deletions(-) diff --git a/ci/run_tests.sh b/ci/run_tests.sh index 
5bff8bb412947..e6de5caf955fc 100755 --- a/ci/run_tests.sh +++ b/ci/run_tests.sh @@ -24,7 +24,7 @@ if [[ $(uname) == "Linux" && -z $DISPLAY ]]; then XVFB="xvfb-run " fi -PYTEST_CMD="${XVFB}pytest -v -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" +PYTEST_CMD="${XVFB}pytest -r fEs -n $PYTEST_WORKERS --dist=loadfile $TEST_ARGS $COVERAGE $PYTEST_TARGET" if [[ "$PATTERN" ]]; then PYTEST_CMD="$PYTEST_CMD -m \"$PATTERN\"" diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index ff54bc5201d0e..2a81299cefcfd 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -723,8 +723,9 @@ def _quantile( same type as self """ if pa_version_under4p0: - fallback_performancewarning("4") - return super()._quantile(qs, interpolation) + raise NotImplementedError( + "quantile only supported for pyarrow version >= 4.0" + ) result = pc.quantile(self._data, q=qs, interpolation=interpolation) return type(self)(result) @@ -748,8 +749,7 @@ def _mode( Sorted, if possible. """ if pa_version_under6p0: - fallback_performancewarning("6") - return super()._mode(dropna) + raise NotImplementedError("mode only supported for pyarrow version >= 6.0") modes = pc.mode(self._data, pc.count_distinct(self._data).as_py()) values = modes.field(0) counts = modes.field(1) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 57257d62c5126..055e6b54452cb 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -10,7 +10,6 @@ classes (if they are relevant for the extension interface for all dtypes), or be added to the array-specific tests in `pandas/tests/arrays/`. 
""" -import contextlib from datetime import ( date, datetime, @@ -28,7 +27,6 @@ pa_version_under6p0, pa_version_under8p0, ) -from pandas.errors import PerformanceWarning import pandas as pd import pandas._testing as tm @@ -1843,40 +1841,35 @@ def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") +@pytest.mark.xfail(pa_version_under4p0, raises=NotImplementedError) @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] ) @pytest.mark.parametrize("quantile", [0.5, [0.5, 0.5]]) def test_quantile(data, interpolation, quantile, request): + pa_dtype = data.dtype.pyarrow_dtype + if not (pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype)): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"quantile not supported by pyarrow for {pa_dtype}", + ) + ) data = data.take([0, 0, 0]) ser = pd.Series(data) - if pa_version_under4p0: - with tm.assert_produces_warning(PerformanceWarning): - # Just validate the PerformanceWarning - # ExtensionArray._quantile may not support all pyarrow types - with contextlib.suppress(Exception): - ser.quantile(q=quantile, interpolation=interpolation) + result = ser.quantile(q=quantile, interpolation=interpolation) + if quantile == 0.5: + assert result == data[0] else: - pa_dtype = data.dtype.pyarrow_dtype - if not (pa.types.is_integer(pa_dtype) or pa.types.is_floating(pa_dtype)): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"quantile not supported by pyarrow for {pa_dtype}", - ) - ) - result = ser.quantile(q=quantile, interpolation=interpolation) - if quantile == 0.5: - assert result == data[0] - else: - # Just check the values - result = result.astype("float64[pyarrow]") - expected = pd.Series( - data.take([0, 0]).astype("float64[pyarrow]"), index=[0.5, 0.5] - ) - tm.assert_series_equal(result, expected) + # Just 
check the values + result = result.astype("float64[pyarrow]") + expected = pd.Series( + data.take([0, 0]).astype("float64[pyarrow]"), index=[0.5, 0.5] + ) + tm.assert_series_equal(result, expected) +@pytest.mark.xfail(pa_version_under6p0, raises=NotImplementedError) @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize( "take_idx, exp_idx", @@ -1884,32 +1877,23 @@ def test_quantile(data, interpolation, quantile, request): ids=["multi_mode", "single_mode"], ) def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request): - data = data_for_grouping.take(take_idx) - ser = pd.Series(data) - if pa_version_under6p0: - with tm.assert_produces_warning( - PerformanceWarning, raise_on_extra_warnings=False - ): - # Just validate the PerformanceWarning - # ExtensionArray._mode may not support all pyarrow types - with contextlib.suppress(Exception): - ser.mode(dropna=dropna) - else: - pa_dtype = data_for_grouping.dtype.pyarrow_dtype - if pa.types.is_temporal(pa_dtype): - request.node.add_marker( - pytest.mark.xfail( - raises=pa.ArrowNotImplementedError, - reason=f"mode not supported by pyarrow for {pa_dtype}", - ) + pa_dtype = data_for_grouping.dtype.pyarrow_dtype + if pa.types.is_temporal(pa_dtype): + request.node.add_marker( + pytest.mark.xfail( + raises=pa.ArrowNotImplementedError, + reason=f"mode not supported by pyarrow for {pa_dtype}", ) - elif pa.types.is_boolean(pa_dtype) and "multi_mode" in request.node.nodeid: - # https://issues.apache.org/jira/browse/ARROW-17096 - request.node.add_marker( - pytest.mark.xfail( - reason="https://issues.apache.org/jira/browse/ARROW-17096", - ) + ) + elif pa.types.is_boolean(pa_dtype) and "multi_mode" in request.node.nodeid: + # https://issues.apache.org/jira/browse/ARROW-17096 + request.node.add_marker( + pytest.mark.xfail( + reason="https://issues.apache.org/jira/browse/ARROW-17096", ) - result = ser.mode(dropna=dropna) - expected = pd.Series(data_for_grouping.take(exp_idx)) - tm.assert_series_equal(result, 
expected) + ) + data = data_for_grouping.take(take_idx) + ser = pd.Series(data) + result = ser.mode(dropna=dropna) + expected = pd.Series(data_for_grouping.take(exp_idx)) + tm.assert_series_equal(result, expected) From c8f7468b15ea703ce596fb46e1eb840614a2f7df Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 18 Jul 2022 15:51:03 -0700 Subject: [PATCH 12/16] Undo warning message --- pandas/core/arrays/arrow/_arrow_utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/pandas/core/arrays/arrow/_arrow_utils.py b/pandas/core/arrays/arrow/_arrow_utils.py index 7744993f39ca9..c9666de9f892d 100644 --- a/pandas/core/arrays/arrow/_arrow_utils.py +++ b/pandas/core/arrays/arrow/_arrow_utils.py @@ -19,12 +19,9 @@ def fallback_performancewarning(version: str | None = None) -> None: Raise a PerformanceWarning for falling back to ExtensionArray's non-pyarrow method """ - msg = ( - "Falling back on a non-pyarrow code path which may decrease performance or " - "not be fully compatible with pyarrow." - ) + msg = "Falling back on a non-pyarrow code path which may decrease performance." if version is not None: - msg += f" Upgrade to pyarrow >={version} to suppress this warning." + msg += f" Upgrade to pyarrow >={version} to possibly suppress this warning." 
warnings.warn(msg, PerformanceWarning, stacklevel=find_stack_level()) From 6d6c735e87344272177d4a7926dd57b93015a5b1 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 18 Jul 2022 15:52:29 -0700 Subject: [PATCH 13/16] Undo typing --- pandas/core/arrays/arrow/array.py | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py index 2a81299cefcfd..1c259829a4f8c 100644 --- a/pandas/core/arrays/arrow/array.py +++ b/pandas/core/arrays/arrow/array.py @@ -35,10 +35,7 @@ from pandas.core.algorithms import resolve_na_sentinel from pandas.core.arraylike import OpsMixin -from pandas.core.arrays.base import ( - ExtensionArray, - ExtensionArrayT, -) +from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import ( check_array_indexer, unpack_tuple_and_ellipses, @@ -709,7 +706,7 @@ def _indexing_key_to_indices( def _quantile( self: ArrowExtensionArrayT, qs: npt.NDArray[np.float64], interpolation: str - ) -> ArrowExtensionArrayT | ExtensionArrayT: + ) -> ArrowExtensionArrayT: """ Compute the quantiles of self for each quantile in `qs`. @@ -729,9 +726,7 @@ def _quantile( result = pc.quantile(self._data, q=qs, interpolation=interpolation) return type(self)(result) - def _mode( - self: ArrowExtensionArrayT, dropna: bool = True - ) -> ArrowExtensionArrayT | ExtensionArrayT: + def _mode(self: ArrowExtensionArrayT, dropna: bool = True) -> ArrowExtensionArrayT: """ Returns the mode(s) of the ExtensionArray. 
From a27c8543fc0ecebcc24f75ef2c3ce0a0b0fd4d63 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Mon, 18 Jul 2022 17:57:21 -0700 Subject: [PATCH 14/16] reason --- pandas/tests/extension/test_arrow.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 055e6b54452cb..531cdb8001e82 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -1841,7 +1841,11 @@ def test_arrowdtype_construct_from_string_type_with_unsupported_parameters(): ArrowDtype.construct_from_string("timestamp[s, tz=UTC][pyarrow]") -@pytest.mark.xfail(pa_version_under4p0, raises=NotImplementedError) +@pytest.mark.xfail( + pa_version_under4p0, + raises=NotImplementedError, + reason="quantile only supported for pyarrow version >= 4.0", +) @pytest.mark.parametrize( "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"] ) @@ -1869,7 +1873,11 @@ def test_quantile(data, interpolation, quantile, request): tm.assert_series_equal(result, expected) -@pytest.mark.xfail(pa_version_under6p0, raises=NotImplementedError) +@pytest.mark.xfail( + pa_version_under6p0, + raises=NotImplementedError, + reason="mode only supported for pyarrow version >= 6.0", +) @pytest.mark.parametrize("dropna", [True, False]) @pytest.mark.parametrize( "take_idx, exp_idx", From f5d1b9788cbbde7788b32e4b54757a3be2d18406 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 19 Jul 2022 10:22:05 -0700 Subject: [PATCH 15/16] multimode case will be fixed in pa=9 --- pandas/compat/__init__.py | 2 ++ pandas/compat/pyarrow.py | 2 ++ pandas/tests/extension/test_arrow.py | 8 ++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 5db859897b663..147134afd70c3 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -28,6 +28,7 @@ pa_version_under6p0, pa_version_under7p0, 
pa_version_under8p0, + pa_version_under9p0, ) if TYPE_CHECKING: @@ -160,4 +161,5 @@ def get_lzma_file() -> type[lzma.LZMAFile]: "pa_version_under6p0", "pa_version_under7p0", "pa_version_under8p0", + "pa_version_under9p0", ] diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 833cda20368a2..98d6f764490ba 100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -17,6 +17,7 @@ pa_version_under6p0 = _palv < Version("6.0.0") pa_version_under7p0 = _palv < Version("7.0.0") pa_version_under8p0 = _palv < Version("8.0.0") + pa_version_under8p0 = _palv < Version("9.0.0") except ImportError: pa_version_under1p01 = True pa_version_under2p0 = True @@ -26,3 +27,4 @@ pa_version_under6p0 = True pa_version_under7p0 = True pa_version_under8p0 = True + pa_version_under9p0 = True diff --git a/pandas/tests/extension/test_arrow.py b/pandas/tests/extension/test_arrow.py index 531cdb8001e82..048b7d5b0b428 100644 --- a/pandas/tests/extension/test_arrow.py +++ b/pandas/tests/extension/test_arrow.py @@ -26,6 +26,7 @@ pa_version_under4p0, pa_version_under6p0, pa_version_under8p0, + pa_version_under9p0, ) import pandas as pd @@ -1893,8 +1894,11 @@ def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request): reason=f"mode not supported by pyarrow for {pa_dtype}", ) ) - elif pa.types.is_boolean(pa_dtype) and "multi_mode" in request.node.nodeid: - # https://issues.apache.org/jira/browse/ARROW-17096 + elif ( + pa.types.is_boolean(pa_dtype) + and "multi_mode" in request.node.nodeid + and pa_version_under9p0 + ): request.node.add_marker( pytest.mark.xfail( reason="https://issues.apache.org/jira/browse/ARROW-17096", From cd412ab950d4493c144f9a7117675affe6a46b7c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke Date: Tue, 19 Jul 2022 10:23:15 -0700 Subject: [PATCH 16/16] Fix typo --- pandas/compat/pyarrow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py index 98d6f764490ba..6965865acb5da 
100644 --- a/pandas/compat/pyarrow.py +++ b/pandas/compat/pyarrow.py @@ -17,7 +17,7 @@ pa_version_under6p0 = _palv < Version("6.0.0") pa_version_under7p0 = _palv < Version("7.0.0") pa_version_under8p0 = _palv < Version("8.0.0") - pa_version_under8p0 = _palv < Version("9.0.0") + pa_version_under9p0 = _palv < Version("9.0.0") except ImportError: pa_version_under1p01 = True pa_version_under2p0 = True