ENH: Use pyarrow.compute for unique, dropna (pandas-dev#46725)

mroeschke · yehoshuadimarsky · commit a55ad542470d · 2022-07-13T10:18:00.000-04:00
diff --git a/doc/source/whatsnew/v1.5.0.rst b/doc/source/whatsnew/v1.5.0.rst
@@ -114,6 +114,7 @@ Other enhancements
 - :meth:`pd.concat` now raises when ``levels`` is given but ``keys`` is None (:issue:`46653`)
 - :meth:`pd.concat` now raises when ``levels`` contains duplicate values (:issue:`46653`)
 - Added ``numeric_only`` argument to :meth:`DataFrame.corr`, :meth:`DataFrame.corrwith`, and :meth:`DataFrame.cov` (:issue:`46560`)
+- A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`, :issue:`46725`)
 - Added ``validate`` argument to :meth:`DataFrame.join` (:issue:`46622`)
 - A :class:`errors.PerformanceWarning` is now thrown when using ``string[pyarrow]`` dtype with methods that don't dispatch to ``pyarrow.compute`` methods (:issue:`42613`)
 - Added ``numeric_only`` argument to :meth:`Resampler.sum`, :meth:`Resampler.prod`, :meth:`Resampler.min`, :meth:`Resampler.max`, :meth:`Resampler.first`, and :meth:`Resampler.last` (:issue:`46442`)
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py
@@ -64,7 +64,10 @@
     rands_array,
     randu_array,
 )
-from pandas._testing._warnings import assert_produces_warning  # noqa:F401
+from pandas._testing._warnings import (  # noqa:F401
+    assert_produces_warning,
+    maybe_produces_warning,
+)
 from pandas._testing.asserters import (  # noqa:F401
     assert_almost_equal,
     assert_attr_equal,
diff --git a/pandas/_testing/_warnings.py b/pandas/_testing/_warnings.py
@@ -1,6 +1,9 @@
 from __future__ import annotations
 
-from contextlib import contextmanager
+from contextlib import (
+    contextmanager,
+    nullcontext,
+)
 import re
 import sys
 from typing import (
@@ -97,6 +100,16 @@ class for all warnings. To check that no warning is returned,
             )
 
 
+def maybe_produces_warning(warning: type[Warning], condition: bool, **kwargs):
+    """
+    Return a context manager that possibly checks a warning based on the condition
+    """
+    if condition:
+        return assert_produces_warning(warning, **kwargs)
+    else:
+        return nullcontext()
+
+
 def _assert_caught_expected_warning(
     *,
     caught_warnings: Sequence[warnings.WarningMessage],
diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -23,6 +23,8 @@
     pa_version_under3p0,
     pa_version_under4p0,
     pa_version_under5p0,
+    pa_version_under6p0,
+    pa_version_under7p0,
 )
 
 PY39 = sys.version_info >= (3, 9)
@@ -150,4 +152,6 @@ def get_lzma_file():
     "pa_version_under3p0",
     "pa_version_under4p0",
     "pa_version_under5p0",
+    "pa_version_under6p0",
+    "pa_version_under7p0",
 ]
diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
@@ -16,6 +16,7 @@
     pa_version_under1p01,
     pa_version_under2p0,
     pa_version_under5p0,
+    pa_version_under6p0,
 )
 from pandas.util._decorators import doc
 
@@ -37,6 +38,8 @@
     import pyarrow as pa
     import pyarrow.compute as pc
 
+    from pandas.core.arrays.arrow._arrow_utils import fallback_performancewarning
+
 if TYPE_CHECKING:
     from pandas import Series
 
@@ -104,6 +107,20 @@ def copy(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         """
         return type(self)(self._data)
 
+    def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
+        """
+        Return ArrowExtensionArray without NA values.
+
+        Returns
+        -------
+        ArrowExtensionArray
+        """
+        if pa_version_under6p0:
+            fallback_performancewarning(version="6")
+            return super().dropna()
+        else:
+            return type(self)(pc.drop_null(self._data))
+
     @doc(ExtensionArray.factorize)
     def factorize(self, na_sentinel: int = -1) -> tuple[np.ndarray, ExtensionArray]:
         encoded = self._data.dictionary_encode()
@@ -219,6 +236,20 @@ def take(
                 indices_array[indices_array < 0] += len(self._data)
             return type(self)(self._data.take(indices_array))
 
+    def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
+        """
+        Compute the ArrowExtensionArray of unique values.
+
+        Returns
+        -------
+        ArrowExtensionArray
+        """
+        if pa_version_under2p0:
+            fallback_performancewarning(version="2")
+            return super().unique()
+        else:
+            return type(self)(pc.unique(self._data))
+
     def value_counts(self, dropna: bool = True) -> Series:
         """
         Return a Series containing counts of each unique value.
diff --git a/pandas/tests/arrays/string_/test_string.py b/pandas/tests/arrays/string_/test_string.py
@@ -2,8 +2,6 @@
 This module tests the functionality of StringArray and ArrowStringArray.
 Tests for the str accessors are in pandas/tests/strings/test_string_array.py
 """
-from contextlib import nullcontext
-
 import numpy as np
 import pytest
 
@@ -18,13 +16,6 @@
 from pandas.core.arrays.string_arrow import ArrowStringArray
 
 
-def maybe_perf_warn(using_pyarrow):
-    if using_pyarrow:
-        return tm.assert_produces_warning(PerformanceWarning, match="Falling back")
-    else:
-        return nullcontext()
-
-
 @pytest.fixture
 def dtype(string_storage):
     return pd.StringDtype(storage=string_storage)
@@ -568,22 +559,30 @@ def test_to_numpy_na_value(dtype, nulls_fixture):
 def test_isin(dtype, fixed_now_ts):
     s = pd.Series(["a", "b", None], dtype=dtype)
 
-    with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0):
+    with tm.maybe_produces_warning(
+        PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0
+    ):
         result = s.isin(["a", "c"])
     expected = pd.Series([True, False, False])
     tm.assert_series_equal(result, expected)
 
-    with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0):
+    with tm.maybe_produces_warning(
+        PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0
+    ):
         result = s.isin(["a", pd.NA])
     expected = pd.Series([True, False, True])
     tm.assert_series_equal(result, expected)
 
-    with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0):
+    with tm.maybe_produces_warning(
+        PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0
+    ):
         result = s.isin([])
     expected = pd.Series([False, False, False])
     tm.assert_series_equal(result, expected)
 
-    with maybe_perf_warn(dtype == "pyarrow" and pa_version_under2p0):
+    with tm.maybe_produces_warning(
+        PerformanceWarning, dtype == "pyarrow" and pa_version_under2p0
+    ):
         result = s.isin(["a", fixed_now_ts])
     expected = pd.Series([True, False, False])
     tm.assert_series_equal(result, expected)
diff --git a/pandas/tests/base/test_unique.py b/pandas/tests/base/test_unique.py
@@ -1,6 +1,9 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under2p0
+from pandas.errors import PerformanceWarning
+
 from pandas.core.dtypes.common import is_datetime64tz_dtype
 
 import pandas as pd
@@ -12,7 +15,11 @@
 def test_unique(index_or_series_obj):
     obj = index_or_series_obj
     obj = np.repeat(obj, range(1, len(obj) + 1))
-    result = obj.unique()
+    with tm.maybe_produces_warning(
+        PerformanceWarning,
+        pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]",
+    ):
+        result = obj.unique()
 
     # dict.fromkeys preserves the order
     unique_values = list(dict.fromkeys(obj.values))
@@ -50,7 +57,11 @@ def test_unique_null(null_obj, index_or_series_obj):
     klass = type(obj)
     repeated_values = np.repeat(values, range(1, len(values) + 1))
     obj = klass(repeated_values, dtype=obj.dtype)
-    result = obj.unique()
+    with tm.maybe_produces_warning(
+        PerformanceWarning,
+        pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]",
+    ):
+        result = obj.unique()
 
     unique_values_raw = dict.fromkeys(obj.values)
     # because np.nan == np.nan is False, but None == None is True
@@ -75,7 +86,11 @@ def test_unique_null(null_obj, index_or_series_obj):
 def test_nunique(index_or_series_obj):
     obj = index_or_series_obj
     obj = np.repeat(obj, range(1, len(obj) + 1))
-    expected = len(obj.unique())
+    with tm.maybe_produces_warning(
+        PerformanceWarning,
+        pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]",
+    ):
+        expected = len(obj.unique())
     assert obj.nunique(dropna=False) == expected
 
 
@@ -99,9 +114,21 @@ def test_nunique_null(null_obj, index_or_series_obj):
         assert obj.nunique() == len(obj.categories)
         assert obj.nunique(dropna=False) == len(obj.categories) + 1
     else:
-        num_unique_values = len(obj.unique())
-        assert obj.nunique() == max(0, num_unique_values - 1)
-        assert obj.nunique(dropna=False) == max(0, num_unique_values)
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]",
+        ):
+            num_unique_values = len(obj.unique())
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]",
+        ):
+            assert obj.nunique() == max(0, num_unique_values - 1)
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under2p0 and str(index_or_series_obj.dtype) == "string[pyarrow]",
+        ):
+            assert obj.nunique(dropna=False) == max(0, num_unique_values)
 
 
 @pytest.mark.single_cpu
diff --git a/pandas/tests/extension/test_string.py b/pandas/tests/extension/test_string.py
@@ -18,7 +18,11 @@
 import numpy as np
 import pytest
 
+from pandas.compat import pa_version_under6p0
+from pandas.errors import PerformanceWarning
+
 import pandas as pd
+import pandas._testing as tm
 from pandas.core.arrays import ArrowStringArray
 from pandas.core.arrays.string_ import StringDtype
 from pandas.tests.extension import base
@@ -139,7 +143,14 @@ class TestIndex(base.BaseIndexTests):
 
 
 class TestMissing(base.BaseMissingTests):
-    pass
+    def test_dropna_array(self, data_missing):
+        with tm.maybe_produces_warning(
+            PerformanceWarning,
+            pa_version_under6p0 and data_missing.dtype.storage == "pyarrow",
+        ):
+            result = data_missing.dropna()
+        expected = data_missing[[1]]
+        self.assert_extension_array_equal(result, expected)
 
 
 class TestNoReduce(base.BaseNoReduceTests):
diff --git a/pandas/tests/indexes/test_common.py b/pandas/tests/indexes/test_common.py
@@ -8,7 +8,10 @@
 import numpy as np
 import pytest
 
-from pandas.compat import IS64
+from pandas.compat import (
+    IS64,
+    pa_version_under2p0,
+)
 
 from pandas.core.dtypes.common import is_integer_dtype
 
@@ -395,7 +398,10 @@ def test_astype_preserves_name(self, index, dtype):
 
         try:
             # Some of these conversions cannot succeed so we use a try / except
-            with tm.assert_produces_warning(warn):
+            with tm.assert_produces_warning(
+                warn,
+                raise_on_extra_warnings=not pa_version_under2p0,
+            ):
                 result = index.astype(dtype)
         except (ValueError, TypeError, NotImplementedError, SystemError):
             return
diff --git a/pandas/tests/strings/test_find_replace.py b/pandas/tests/strings/test_find_replace.py
diff --git a/pandas/tests/strings/test_strings.py b/pandas/tests/strings/test_strings.py

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
@@ -23,6 +23,8 @@
     pa_version_under3p0,
     pa_version_under4p0,
     pa_version_under5p0,
+    pa_version_under6p0,
+    pa_version_under7p0,
 )
 
 PY39 = sys.version_info >= (3, 9)
@@ -150,4 +152,6 @@ def get_lzma_file():
     "pa_version_under3p0",
     "pa_version_under4p0",
     "pa_version_under5p0",
+    "pa_version_under6p0",
+    "pa_version_under7p0",
 ]