From 625ceae0b800b67d16b3e644da5f688c1a42fa13 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 5 May 2021 12:49:08 +0100 Subject: [PATCH 1/3] [ArrowStringArray] CLN: remove hasattr checks --- pandas/compat/__init__.py | 21 +++++++ pandas/core/arrays/string_arrow.py | 98 ++++++++++++++---------------- 2 files changed, 68 insertions(+), 51 deletions(-) diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index af2dfe796f82d..33c33201e3b16 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -7,6 +7,7 @@ Other items: * platform checker """ +from distutils.version import LooseVersion import platform import sys import warnings @@ -129,6 +130,22 @@ def get_lzma_file(lzma): return lzma.LZMAFile +# pyarrow versioning +try: + import pyarrow as pa + + _pa_version = pa.__version__ + _palv = LooseVersion(_pa_version) + pa_version_under1p0 = _palv < LooseVersion("1.0.0") + pa_version_under2p0 = _palv < LooseVersion("2.0.0") + pa_version_under3p0 = _palv < LooseVersion("3.0.0") + pa_version_under4p0 = _palv < LooseVersion("4.0.0") +except ImportError: + pa_version_under1p0 = True + pa_version_under2p0 = True + pa_version_under3p0 = True + pa_version_under4p0 = True + __all__ = [ "is_numpy_dev", "np_array_datetime64_compat", @@ -136,4 +153,8 @@ def get_lzma_file(lzma): "np_version_under1p18", "np_version_under1p19", "np_version_under1p20", + "pa_version_under1p0", + "pa_version_under2p0", + "pa_version_under3p0", + "pa_version_under4p0", ] diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 6f23457c04dd4..f46db9b262b67 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -21,6 +21,10 @@ PositionalIndexer, type_t, ) +from pandas.compat import ( + pa_version_under2p0, + pa_version_under4p0, +) from pandas.util._decorators import doc from pandas.util._validators import validate_fillna_kwargs @@ -773,11 +777,10 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True): return super()._str_contains(pat, case, flags, na, regex) if regex: - # match_substring_regex added in pyarrow 4.0.0 - if hasattr(pc, "match_substring_regex") and case: - result = pc.match_substring_regex(self._data, pat) - else: + if pa_version_under4p0 or case is False: return super()._str_contains(pat, case, flags, na, regex) + else: + result = pc.match_substring_regex(self._data, pat) else: if case: result = pc.match_substring(self._data, pat) @@ -789,27 +792,25 @@ def _str_contains(self, pat, case=True, flags=0, na=np.nan, regex: bool = True): return result def _str_startswith(self, pat, na=None): - # match_substring_regex added in pyarrow 4.0.0 - if hasattr(pc, "match_substring_regex"): - result = pc.match_substring_regex(self._data, "^" + re.escape(pat)) - result = BooleanDtype().__from_arrow__(result) - if not isna(na): - result[isna(result)] = bool(na) - return result - else: + if pa_version_under4p0: return super()._str_startswith(pat, na) + result = pc.match_substring_regex(self._data, "^" + re.escape(pat)) + result = BooleanDtype().__from_arrow__(result) + if not isna(na): + result[isna(result)] = bool(na) + return result + def _str_endswith(self, pat, na=None): - # match_substring_regex added in pyarrow 4.0.0 - if hasattr(pc, "match_substring_regex"): - result = pc.match_substring_regex(self._data, re.escape(pat) + "$") - result = BooleanDtype().__from_arrow__(result) - if not isna(na): - result[isna(result)] = bool(na) - return result - else: + if pa_version_under4p0: return super()._str_endswith(pat, na) + result = pc.match_substring_regex(self._data, re.escape(pat) + "$") + result = BooleanDtype().__from_arrow__(result) + if not isna(na): + result[isna(result)] = bool(na) + return result + def _str_isalnum(self): result = pc.utf8_is_alnum(self._data) return BooleanDtype().__from_arrow__(result) @@ -835,13 +836,12 @@ def _str_isnumeric(self): return BooleanDtype().__from_arrow__(result) def _str_isspace(self): - # utf8_is_space added in pyarrow 2.0.0 - if hasattr(pc, "utf8_is_space"): - result = pc.utf8_is_space(self._data) - return BooleanDtype().__from_arrow__(result) - else: + if pa_version_under2p0: return super()._str_isspace() + result = pc.utf8_is_space(self._data) + return BooleanDtype().__from_arrow__(result) + def _str_istitle(self): result = pc.utf8_is_title(self._data) return BooleanDtype().__from_arrow__(result) @@ -851,13 +851,12 @@ def _str_isupper(self): return BooleanDtype().__from_arrow__(result) def _str_len(self): - # utf8_length added in pyarrow 4.0.0 - if hasattr(pc, "utf8_length"): - result = pc.utf8_length(self._data) - return Int64Dtype().__from_arrow__(result) - else: + if pa_version_under4p0: return super()._str_len() + result = pc.utf8_length(self._data) + return Int64Dtype().__from_arrow__(result) + def _str_lower(self): return type(self)(pc.utf8_lower(self._data)) @@ -865,34 +864,31 @@ def _str_upper(self): return type(self)(pc.utf8_upper(self._data)) def _str_strip(self, to_strip=None): + if pa_version_under4p0: + return super()._str_strip(to_strip) + if to_strip is None: - # utf8_trim_whitespace added in pyarrow 4.0.0 - if hasattr(pc, "utf8_trim_whitespace"): - return type(self)(pc.utf8_trim_whitespace(self._data)) + result = pc.utf8_trim_whitespace(self._data) else: - # utf8_trim added in pyarrow 4.0.0 - if hasattr(pc, "utf8_trim"): - return type(self)(pc.utf8_trim(self._data, characters=to_strip)) - return super()._str_strip(to_strip) + result = pc.utf8_trim(self._data, characters=to_strip) + return type(self)(result) def _str_lstrip(self, to_strip=None): + if pa_version_under4p0: + return super()._str_lstrip(to_strip) + if to_strip is None: - # utf8_ltrim_whitespace added in pyarrow 4.0.0 - if hasattr(pc, "utf8_ltrim_whitespace"): - return type(self)(pc.utf8_ltrim_whitespace(self._data)) + result = pc.utf8_ltrim_whitespace(self._data) else: - # utf8_ltrim added in pyarrow 4.0.0 - if hasattr(pc, "utf8_ltrim"): - return type(self)(pc.utf8_ltrim(self._data, characters=to_strip)) - return super()._str_lstrip(to_strip) + result = pc.utf8_ltrim(self._data, characters=to_strip) + return type(self)(result) def _str_rstrip(self, to_strip=None): + if pa_version_under4p0: + return super()._str_rstrip(to_strip) + if to_strip is None: - # utf8_rtrim_whitespace added in pyarrow 4.0.0 - if hasattr(pc, "utf8_rtrim_whitespace"): - return type(self)(pc.utf8_rtrim_whitespace(self._data)) + result = pc.utf8_rtrim_whitespace(self._data) else: - # utf8_rtrim added in pyarrow 4.0.0 - if hasattr(pc, "utf8_rtrim"): - return type(self)(pc.utf8_rtrim(self._data, characters=to_strip)) - return super()._str_rstrip(to_strip) + result = pc.utf8_rtrim(self._data, characters=to_strip) + return type(self)(result) From f57833030c50bc8c102b11c2ec3c01d10e938dca Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 5 May 2021 13:38:19 +0100 Subject: [PATCH 2/3] move to pandas/compat/pyarrow.py --- pandas/compat/__init__.py | 23 ++++++----------------- pandas/compat/pyarrow.py | 18 ++++++++++++++++++ 2 files changed, 24 insertions(+), 17 deletions(-) create mode 100644 pandas/compat/pyarrow.py diff --git a/pandas/compat/__init__.py b/pandas/compat/__init__.py index 33c33201e3b16..8d64bf8852946 100644 --- a/pandas/compat/__init__.py +++ b/pandas/compat/__init__.py @@ -7,7 +7,6 @@ Other items: * platform checker """ -from distutils.version import LooseVersion import platform import sys import warnings @@ -21,6 +20,12 @@ np_version_under1p19, np_version_under1p20, ) +from pandas.compat.pyarrow import ( + pa_version_under1p0, + pa_version_under2p0, + pa_version_under3p0, + pa_version_under4p0, +) PY38 = sys.version_info >= (3, 8) PY39 = sys.version_info >= (3, 9) @@ -130,22 +135,6 @@ def get_lzma_file(lzma): return lzma.LZMAFile -# pyarrow versioning -try: - import pyarrow as pa - - _pa_version = pa.__version__ - _palv = LooseVersion(_pa_version) - pa_version_under1p0 = _palv < LooseVersion("1.0.0") - pa_version_under2p0 = _palv < LooseVersion("2.0.0") - pa_version_under3p0 = _palv < LooseVersion("3.0.0") - pa_version_under4p0 = _palv < LooseVersion("4.0.0") -except ImportError: - pa_version_under1p0 = True - pa_version_under2p0 = True - pa_version_under3p0 = True - pa_version_under4p0 = True - __all__ = [ "is_numpy_dev", "np_array_datetime64_compat", diff --git a/pandas/compat/pyarrow.py b/pandas/compat/pyarrow.py new file mode 100644 index 0000000000000..e9ca9b99d4380 --- /dev/null +++ b/pandas/compat/pyarrow.py @@ -0,0 +1,18 @@ +""" support pyarrow compatibility across versions """ + +from distutils.version import LooseVersion + +try: + import pyarrow as pa + + _pa_version = pa.__version__ + _palv = LooseVersion(_pa_version) + pa_version_under1p0 = _palv < LooseVersion("1.0.0") + pa_version_under2p0 = _palv < LooseVersion("2.0.0") + pa_version_under3p0 = _palv < LooseVersion("3.0.0") + pa_version_under4p0 = _palv < LooseVersion("4.0.0") +except ImportError: + pa_version_under1p0 = True + pa_version_under2p0 = True + pa_version_under3p0 = True + pa_version_under4p0 = True From df7b613397006d63097e8382362cb1f39e5cbcf9 Mon Sep 17 00:00:00 2001 From: Simon Hawkins Date: Wed, 5 May 2021 14:06:10 +0100 Subject: [PATCH 3/3] update isin to use pa_version_under --- pandas/core/arrays/string_arrow.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index 273c94879ebb0..a1278a129c40f 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -24,6 +24,7 @@ ) from pandas.compat import ( pa_version_under2p0, + pa_version_under3p0, pa_version_under4p0, ) from pandas.util._decorators import doc @@ -671,9 +672,7 @@ def take( return type(self)(self._data.take(indices_array)) def isin(self, values): - - # pyarrow.compute.is_in added in pyarrow 2.0.0 - if not hasattr(pc, "is_in"): + if pa_version_under2p0: return super().isin(values) value_set = [ @@ -688,7 +687,7 @@ def isin(self, values): return np.zeros(len(self), dtype=bool) kwargs = {} - if LooseVersion(pa.__version__) < "3.0.0": + if pa_version_under3p0: # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises # with unexpected keyword argument in pyarrow 3.0.0+ kwargs["skip_null"] = True