From cc7667ab3f9c65d03e12b0f77695b4d8bc3bcb79 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Wed, 20 Jul 2022 13:32:47 -0700
Subject: [PATCH 1/2] ENH/TST: Add isin, _hasna

---
 pandas/core/arrays/arrow/array.py | 50 +++++++++++++++++++++++++++++--
 1 file changed, 48 insertions(+), 2 deletions(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 8957ea493e9ad..38042521b4459 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -18,6 +18,7 @@
 from pandas.compat import (
     pa_version_under1p01,
     pa_version_under2p0,
+    pa_version_under3p0,
     pa_version_under4p0,
     pa_version_under5p0,
     pa_version_under6p0,
@@ -388,6 +389,10 @@ def __len__(self) -> int:
         """
         return len(self._data)
 
+    @property
+    def _hasna(self) -> bool:
+        return self._data.null_count > 0
+
     def isna(self) -> npt.NDArray[np.bool_]:
         """
         Boolean NumPy array indicating if each value is missing.
@@ -425,6 +430,49 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         else:
             return type(self)(pc.drop_null(self._data))
 
+    def isin(self, values):
+        if pa_version_under2p0:
+            fallback_performancewarning(version="2")
+            return super().isin(values)
+
+        # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
+        # for null values, so we short-circuit to return all False array.
+        if not len(values):
+            return np.zeros(len(self), dtype=bool)
+
+        kwargs = {}
+        if pa_version_under3p0:
+            # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
+            # with unexpected keyword argument in pyarrow 3.0.0+
+            kwargs["skip_null"] = True
+
+        result = pc.is_in(
+            self._data, value_set=pa.array(values, from_pandas=True), **kwargs
+        )
+        # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
+        # to False
+        return np.array(result, dtype=np.bool_)
+
+    def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
+        """
+        Return an array and missing value suitable for factorization.
+
+        Returns
+        -------
+        values : ndarray
+        na_value : pd.NA
+
+        Notes
+        -----
+        The values returned by this method are also used in
+        :func:`pandas.util.hash_pandas_object`.
+        """
+        if pa_version_under2p0:
+            values = self._data.to_pandas().values
+        else:
+            values = self._data.to_numpy()
+        return values, self.dtype.na_value
+
     @doc(ExtensionArray.factorize)
     def factorize(
         self,
@@ -622,8 +670,6 @@ def _concat_same_type(
         -------
         ArrowExtensionArray
         """
-        import pyarrow as pa
-
         chunks = [array for ea in to_concat for array in ea._data.iterchunks()]
         arr = pa.chunked_array(chunks)
         return cls(arr)

From a3a7d9c8f67c356d7b6476fa436b2e8caf5dc809 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke
Date: Thu, 21 Jul 2022 15:12:20 -0700
Subject: [PATCH 2/2] type isin

---
 pandas/core/arrays/arrow/array.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/core/arrays/arrow/array.py b/pandas/core/arrays/arrow/array.py
index 38042521b4459..ee8192cfca1ec 100644
--- a/pandas/core/arrays/arrow/array.py
+++ b/pandas/core/arrays/arrow/array.py
@@ -430,7 +430,7 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         else:
             return type(self)(pc.drop_null(self._data))
 
-    def isin(self, values):
+    def isin(self: ArrowExtensionArrayT, values) -> npt.NDArray[np.bool_]:
         if pa_version_under2p0:
             fallback_performancewarning(version="2")
             return super().isin(values)