Centralise pyarrow version checks in pandas.compat.

Adds pandas/compat/pyarrow.py, which defines the pa_version_under1p0 ..
pa_version_under4p0 flags, re-exports them from pandas.compat, and replaces
the hasattr(pc, ...) and LooseVersion(pa.__version__) checks in
pandas/core/arrays/string_arrow.py with those flags.

diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py
index af2dfe796f82d..8d64bf8852946 100644
--- a/pandas/compat/__init__.py
+++ b/pandas/compat/__init__.py
@@ -20,6 +20,12 @@
     np_version_under1p19,
     np_version_under1p20,
 )
+from pandas.compat.pyarrow import (
+    pa_version_under1p0,
+    pa_version_under2p0,
+    pa_version_under3p0,
+    pa_version_under4p0,
+)
 
 PY38 = sys.version_info >= (3, 8)
 PY39 = sys.version_info >= (3, 9)
@@ -136,4 +142,8 @@ def get_lzma_file(lzma):
     "np_version_under1p18",
     "np_version_under1p19",
     "np_version_under1p20",
+    "pa_version_under1p0",
+    "pa_version_under2p0",
+    "pa_version_under3p0",
+    "pa_version_under4p0",
 ]
diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py
new file mode 100644
index 0000000000000..e9ca9b99d4380
--- /dev/null
+++ b/pandas/compat/pyarrow.py
@@ -0,0 +1,21 @@
+""" support pyarrow compatibility across versions """
+
+from distutils.version import LooseVersion
+
+try:
+    import pyarrow as pa
+
+    _pa_version = pa.__version__
+    _palv = LooseVersion(_pa_version)
+    # Each flag is True when the installed pyarrow is older than that release.
+    pa_version_under1p0 = _palv < LooseVersion("1.0.0")
+    pa_version_under2p0 = _palv < LooseVersion("2.0.0")
+    pa_version_under3p0 = _palv < LooseVersion("3.0.0")
+    pa_version_under4p0 = _palv < LooseVersion("4.0.0")
+except ImportError:
+    # pyarrow is not installed: set every flag to True so that code gated on
+    # these flags takes its fallback branch.
+    pa_version_under1p0 = True
+    pa_version_under2p0 = True
+    pa_version_under3p0 = True
+    pa_version_under4p0 = True
diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py
index de987b8d34f08..a1278a129c40f 100644
--- a/pandas/core/arrays/string_arrow.py
+++ b/pandas/core/arrays/string_arrow.py
@@ -22,6 +22,11 @@
     Scalar,
     type_t,
 )
+from pandas.compat import (
+    pa_version_under2p0,
+    pa_version_under3p0,
+    pa_version_under4p0,
+)
 from pandas.util._decorators import doc
 from pandas.util._validators import validate_fillna_kwargs
 
@@ -667,9 +672,7 @@ def take(
             return type(self)(self._data.take(indices_array))
 
     def isin(self, values):
-
-        # pyarrow.compute.is_in added in pyarrow 2.0.0
-        if not hasattr(pc, "is_in"):
+        if pa_version_under2p0:
             return super().isin(values)
 
         value_set = [
@@ -684,7 +687,7 @@ def isin(self, values):
             return np.zeros(len(self), dtype=bool)
 
         kwargs = {}
-        if LooseVersion(pa.__version__) < "3.0.0":
+        if pa_version_under3p0:
             # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
             # with unexpected keyword argument in pyarrow 3.0.0+
             kwargs["skip_null"] = True
@@ -802,11 +805,10 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
             return super()._str_contains(pat, case, flags, na, regex)
 
         if regex:
-            # match_substring_regex added in pyarrow 4.0.0
-            if hasattr(pc, "match_substring_regex") and case:
-                result = pc.match_substring_regex(self._data, pat)
-            else:
+            if pa_version_under4p0 or not case:
                 return super()._str_contains(pat, case, flags, na, regex)
+            else:
+                result = pc.match_substring_regex(self._data, pat)
         else:
             if case:
                 result = pc.match_substring(self._data, pat)
@@ -818,27 +820,25 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True):
         return result
 
     def _str_startswith(self, pat, na=None):
-        # match_substring_regex added in pyarrow 4.0.0
-        if hasattr(pc, "match_substring_regex"):
-            result = pc.match_substring_regex(self._data, "^" + re.escape(pat))
-            result = BooleanDtype().__from_arrow__(result)
-            if not isna(na):
-                result[isna(result)] = bool(na)
-            return result
-        else:
+        if pa_version_under4p0:
             return super()._str_startswith(pat, na)
 
+        result = pc.match_substring_regex(self._data, "^" + re.escape(pat))
+        result = BooleanDtype().__from_arrow__(result)
+        if not isna(na):
+            result[isna(result)] = bool(na)
+        return result
+
     def _str_endswith(self, pat, na=None):
-        # match_substring_regex added in pyarrow 4.0.0
-        if hasattr(pc, "match_substring_regex"):
-            result = pc.match_substring_regex(self._data, re.escape(pat) + "$")
-            result = BooleanDtype().__from_arrow__(result)
-            if not isna(na):
-                result[isna(result)] = bool(na)
-            return result
-        else:
+        if pa_version_under4p0:
             return super()._str_endswith(pat, na)
 
+        result = pc.match_substring_regex(self._data, re.escape(pat) + "$")
+        result = BooleanDtype().__from_arrow__(result)
+        if not isna(na):
+            result[isna(result)] = bool(na)
+        return result
+
     def _str_match(
         self, pat: str, case: bool = True, flags: int = 0, na: Scalar = None
     ):
@@ -871,13 +871,12 @@ def _str_isnumeric(self):
         return BooleanDtype().__from_arrow__(result)
 
     def _str_isspace(self):
-        # utf8_is_space added in pyarrow 2.0.0
-        if hasattr(pc, "utf8_is_space"):
-            result = pc.utf8_is_space(self._data)
-            return BooleanDtype().__from_arrow__(result)
-        else:
+        if pa_version_under2p0:
             return super()._str_isspace()
 
+        result = pc.utf8_is_space(self._data)
+        return BooleanDtype().__from_arrow__(result)
+
     def _str_istitle(self):
         result = pc.utf8_is_title(self._data)
         return BooleanDtype().__from_arrow__(result)
@@ -887,13 +886,12 @@ def _str_isupper(self):
         return BooleanDtype().__from_arrow__(result)
 
     def _str_len(self):
-        # utf8_length added in pyarrow 4.0.0
-        if hasattr(pc, "utf8_length"):
-            result = pc.utf8_length(self._data)
-            return Int64Dtype().__from_arrow__(result)
-        else:
+        if pa_version_under4p0:
             return super()._str_len()
 
+        result = pc.utf8_length(self._data)
+        return Int64Dtype().__from_arrow__(result)
+
     def _str_lower(self):
         return type(self)(pc.utf8_lower(self._data))
 
@@ -901,34 +899,31 @@ def _str_upper(self):
         return type(self)(pc.utf8_upper(self._data))
 
     def _str_strip(self, to_strip=None):
+        if pa_version_under4p0:
+            return super()._str_strip(to_strip)
+
         if to_strip is None:
-            # utf8_trim_whitespace added in pyarrow 4.0.0
-            if hasattr(pc, "utf8_trim_whitespace"):
-                return type(self)(pc.utf8_trim_whitespace(self._data))
+            result = pc.utf8_trim_whitespace(self._data)
         else:
-            # utf8_trim added in pyarrow 4.0.0
-            if hasattr(pc, "utf8_trim"):
-                return type(self)(pc.utf8_trim(self._data, characters=to_strip))
-        return super()._str_strip(to_strip)
+            result = pc.utf8_trim(self._data, characters=to_strip)
+        return type(self)(result)
 
     def _str_lstrip(self, to_strip=None):
+        if pa_version_under4p0:
+            return super()._str_lstrip(to_strip)
+
         if to_strip is None:
-            # utf8_ltrim_whitespace added in pyarrow 4.0.0
-            if hasattr(pc, "utf8_ltrim_whitespace"):
-                return type(self)(pc.utf8_ltrim_whitespace(self._data))
+            result = pc.utf8_ltrim_whitespace(self._data)
         else:
-            # utf8_ltrim added in pyarrow 4.0.0
-            if hasattr(pc, "utf8_ltrim"):
-                return type(self)(pc.utf8_ltrim(self._data, characters=to_strip))
-        return super()._str_lstrip(to_strip)
+            result = pc.utf8_ltrim(self._data, characters=to_strip)
+        return type(self)(result)
 
     def _str_rstrip(self, to_strip=None):
+        if pa_version_under4p0:
+            return super()._str_rstrip(to_strip)
+
         if to_strip is None:
-            # utf8_rtrim_whitespace added in pyarrow 4.0.0
-            if hasattr(pc, "utf8_rtrim_whitespace"):
-                return type(self)(pc.utf8_rtrim_whitespace(self._data))
+            result = pc.utf8_rtrim_whitespace(self._data)
         else:
-            # utf8_rtrim added in pyarrow 4.0.0
-            if hasattr(pc, "utf8_rtrim"):
-                return type(self)(pc.utf8_rtrim(self._data, characters=to_strip))
-        return super()._str_rstrip(to_strip)
+            result = pc.utf8_rtrim(self._data, characters=to_strip)
+        return type(self)(result)