Skip to content

DEPS: Bump PyArrow to 6.0 #49096

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 9 commits into from
Oct 17, 2022
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .github/workflows/ubuntu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,9 +31,7 @@ jobs:
matrix:
env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
pattern: ["not single_cpu", "single_cpu"]
# Don't test pyarrow v2/3: Causes timeouts in read_csv engine
# even if tests are skipped/xfailed
pyarrow_version: ["5", "6", "7"]
pyarrow_version: ["6", "7", "8"]
include:
- name: "Downstream Compat"
env_file: actions-38-downstream_compat.yaml
Expand Down
2 changes: 1 addition & 1 deletion ci/deps/actions-38-minimum_versions.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ dependencies:
- openpyxl=3.0.7
- pandas-gbq=0.15.0
- psycopg2=2.8.6
- pyarrow=1.0.1
- pyarrow=5.0.0
- pymysql=1.0.2
- pyreadstat=1.1.2
- pytables=3.6.1
Expand Down
4 changes: 2 additions & 2 deletions doc/source/getting_started/install.rst
Original file line number Diff line number Diff line change
Expand Up @@ -388,7 +388,7 @@ PyTables 3.6.1 HDF5-based reading / writing
blosc 1.21.0 Compression for HDF5
zlib Compression for HDF5
fastparquet 0.4.0 Parquet reading / writing
pyarrow 1.0.1 Parquet, ORC, and feather reading / writing
pyarrow 5.0.0 Parquet, ORC, and feather reading / writing
pyreadstat 1.1.2 SPSS files (.sav) reading
========================= ================== =============================================================

Expand All @@ -402,7 +402,7 @@ pyreadstat 1.1.2 SPSS files (.sav) reading
========================= ================== =============================================================
System Conda PyPI
========================= ================== =============================================================
Linux Successful Failed(pyarrow==3.0 Successful)
Linux Successful Failed
macOS Successful Failed
Windows Failed Failed
========================= ================== =============================================================
Expand Down
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v2.0.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ Optional libraries below the lowest tested version may still work, but are not c
+-----------------+-----------------+---------+
| Package | Minimum Version | Changed |
+=================+=================+=========+
| | | X |
| pyarrow | 5.0.0 | X |
+-----------------+-----------------+---------+

See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
Expand Down
4 changes: 2 additions & 2 deletions pandas/_testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
Dtype,
Frequency,
)
from pandas.compat import pa_version_under1p01
from pandas.compat import pa_version_under5p0

from pandas.core.dtypes.common import (
is_float_dtype,
Expand Down Expand Up @@ -195,7 +195,7 @@
]
]

if not pa_version_under1p01:
if not pa_version_under5p0:
import pyarrow as pa

UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]
Expand Down
8 changes: 0 additions & 8 deletions pandas/compat/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,6 @@
np_version_under1p21,
)
from pandas.compat.pyarrow import (
pa_version_under1p01,
pa_version_under2p0,
pa_version_under3p0,
pa_version_under4p0,
pa_version_under5p0,
pa_version_under6p0,
pa_version_under7p0,
Expand Down Expand Up @@ -154,10 +150,6 @@ def get_lzma_file() -> type[lzma.LZMAFile]:
__all__ = [
"is_numpy_dev",
"np_version_under1p21",
"pa_version_under1p01",
"pa_version_under2p0",
"pa_version_under3p0",
"pa_version_under4p0",
"pa_version_under5p0",
"pa_version_under6p0",
"pa_version_under7p0",
Expand Down
2 changes: 1 addition & 1 deletion pandas/compat/_optional.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
"pandas_gbq": "0.15.0",
"psycopg2": "2.8.6", # (dt dec pq3 ext lo64)
"pymysql": "1.0.2",
"pyarrow": "1.0.1",
"pyarrow": "5.0.0",
"pyreadstat": "1.1.2",
"pytest": "6.0",
"pyxlsb": "1.0.8",
Expand Down
8 changes: 0 additions & 8 deletions pandas/compat/pyarrow.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,20 +9,12 @@

_pa_version = pa.__version__
_palv = Version(_pa_version)
pa_version_under1p01 = _palv < Version("1.0.1")
pa_version_under2p0 = _palv < Version("2.0.0")
pa_version_under3p0 = _palv < Version("3.0.0")
pa_version_under4p0 = _palv < Version("4.0.0")
pa_version_under5p0 = _palv < Version("5.0.0")
pa_version_under6p0 = _palv < Version("6.0.0")
pa_version_under7p0 = _palv < Version("7.0.0")
pa_version_under8p0 = _palv < Version("8.0.0")
pa_version_under9p0 = _palv < Version("9.0.0")
except ImportError:
pa_version_under1p01 = True
pa_version_under2p0 = True
pa_version_under3p0 = True
pa_version_under4p0 = True
pa_version_under5p0 = True
pa_version_under6p0 = True
pa_version_under7p0 = True
Expand Down
132 changes: 30 additions & 102 deletions pandas/core/arrays/arrow/array.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,6 @@
npt,
)
from pandas.compat import (
pa_version_under1p01,
pa_version_under2p0,
pa_version_under3p0,
pa_version_under4p0,
pa_version_under5p0,
pa_version_under6p0,
pa_version_under7p0,
Expand Down Expand Up @@ -48,7 +44,7 @@
validate_indices,
)

if not pa_version_under1p01:
if not pa_version_under5p0:
import pyarrow as pa
import pyarrow.compute as pc

Expand All @@ -65,16 +61,12 @@
}

ARROW_LOGICAL_FUNCS = {
"and": NotImplemented if pa_version_under2p0 else pc.and_kleene,
"rand": NotImplemented
if pa_version_under2p0
else lambda x, y: pc.and_kleene(y, x),
"or": NotImplemented if pa_version_under2p0 else pc.or_kleene,
"ror": NotImplemented
if pa_version_under2p0
else lambda x, y: pc.or_kleene(y, x),
"xor": NotImplemented if pa_version_under2p0 else pc.xor,
"rxor": NotImplemented if pa_version_under2p0 else lambda x, y: pc.xor(y, x),
"and": pc.and_kleene,
"rand": lambda x, y: pc.and_kleene(y, x),
"or": pc.or_kleene,
"ror": lambda x, y: pc.or_kleene(y, x),
"xor": pc.xor,
"rxor": lambda x, y: pc.xor(y, x),
}

def cast_for_truediv(
Expand All @@ -100,38 +92,22 @@ def floordiv_compat(
return result

ARROW_ARITHMETIC_FUNCS = {
"add": NotImplemented if pa_version_under2p0 else pc.add_checked,
"radd": NotImplemented
if pa_version_under2p0
else lambda x, y: pc.add_checked(y, x),
"sub": NotImplemented if pa_version_under2p0 else pc.subtract_checked,
"rsub": NotImplemented
if pa_version_under2p0
else lambda x, y: pc.subtract_checked(y, x),
"mul": NotImplemented if pa_version_under2p0 else pc.multiply_checked,
"rmul": NotImplemented
if pa_version_under2p0
else lambda x, y: pc.multiply_checked(y, x),
"truediv": NotImplemented
if pa_version_under2p0
else lambda x, y: pc.divide_checked(cast_for_truediv(x, y), y),
"rtruediv": NotImplemented
if pa_version_under2p0
else lambda x, y: pc.divide_checked(y, cast_for_truediv(x, y)),
"floordiv": NotImplemented
if pa_version_under2p0
else lambda x, y: floordiv_compat(x, y),
"rfloordiv": NotImplemented
if pa_version_under2p0
else lambda x, y: floordiv_compat(y, x),
"add": pc.add_checked,
"radd": lambda x, y: pc.add_checked(y, x),
"sub": pc.subtract_checked,
"rsub": lambda x, y: pc.subtract_checked(y, x),
"mul": pc.multiply_checked,
"rmul": lambda x, y: pc.multiply_checked(y, x),
"truediv": lambda x, y: pc.divide_checked(cast_for_truediv(x, y), y),
"rtruediv": lambda x, y: pc.divide_checked(y, cast_for_truediv(x, y)),
"floordiv": lambda x, y: floordiv_compat(x, y),
"rfloordiv": lambda x, y: floordiv_compat(y, x),
"mod": NotImplemented,
"rmod": NotImplemented,
"divmod": NotImplemented,
"rdivmod": NotImplemented,
"pow": NotImplemented if pa_version_under4p0 else pc.power_checked,
"rpow": NotImplemented
if pa_version_under4p0
else lambda x, y: pc.power_checked(y, x),
"pow": pc.power_checked,
"rpow": lambda x, y: pc.power_checked(y, x),
}

if TYPE_CHECKING:
Expand Down Expand Up @@ -206,8 +182,8 @@ class ArrowExtensionArray(OpsMixin, ExtensionArray):
_dtype: ArrowDtype

def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
if pa_version_under1p01:
msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
if pa_version_under5p0:
msg = "pyarrow>=5.0.0 is required for PyArrow backed ArrowExtensionArray."
raise ImportError(msg)
if isinstance(values, pa.Array):
self._data = pa.chunked_array([values])
Expand Down Expand Up @@ -360,8 +336,6 @@ def __arrow_array__(self, type=None):
return self._data

def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
if pa_version_under2p0:
raise NotImplementedError("__invert__ not implement for pyarrow < 2.0")
return type(self)(pc.invert(self._data))

def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
Expand Down Expand Up @@ -395,10 +369,7 @@ def _cmp_method(self, other, op):
f"{op.__name__} not implemented for {type(other)}"
)

if pa_version_under2p0:
result = result.to_pandas().values
else:
result = result.to_numpy()
result = result.to_numpy()
return BooleanArray._from_sequence(result)

def _evaluate_op_method(self, other, op, arrow_funcs):
Expand Down Expand Up @@ -464,10 +435,7 @@ def isna(self) -> npt.NDArray[np.bool_]:

This should return a 1-D array the same length as 'self'.
"""
if pa_version_under2p0:
return self._data.is_null().to_pandas().values
else:
return self._data.is_null().to_numpy()
return self._data.is_null().to_numpy()

@deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
def argsort(
Expand All @@ -492,10 +460,7 @@ def argsort(
result = pc.array_sort_indices(
self._data, order=order, null_placement=null_placement
)
if pa_version_under2p0:
np_result = result.to_pandas().values
else:
np_result = result.to_numpy()
np_result = result.to_numpy()
return np_result.astype(np.intp, copy=False)

def _argmin_max(self, skipna: bool, method: str) -> int:
Expand Down Expand Up @@ -548,24 +513,11 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
return type(self)(pc.drop_null(self._data))

def isin(self, values) -> npt.NDArray[np.bool_]:
if pa_version_under2p0:
fallback_performancewarning(version="2")
return super().isin(values)

# for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
# for null values, so we short-circuit to return all False array.
# short-circuit to return an all-False array.
if not len(values):
return np.zeros(len(self), dtype=bool)

kwargs = {}
if pa_version_under3p0:
# in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
# with unexpected keyword argument in pyarrow 3.0.0+
kwargs["skip_null"] = True

result = pc.is_in(
self._data, value_set=pa.array(values, from_pandas=True), **kwargs
)
result = pc.is_in(self._data, value_set=pa.array(values, from_pandas=True))
# pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
# to False
return np.array(result, dtype=np.bool_)
Expand All @@ -584,10 +536,7 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
The values returned by this method are also used in
:func:`pandas.util.hash_pandas_object`.
"""
if pa_version_under2p0:
values = self._data.to_pandas().values
else:
values = self._data.to_numpy()
values = self._data.to_numpy()
return values, self.dtype.na_value

@doc(ExtensionArray.factorize)
Expand All @@ -597,11 +546,8 @@ def factorize(
use_na_sentinel: bool | lib.NoDefault = lib.no_default,
) -> tuple[np.ndarray, ExtensionArray]:
resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
if pa_version_under4p0:
encoded = self._data.dictionary_encode()
else:
null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
encoded = self._data.dictionary_encode(null_encoding=null_encoding)
null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
encoded = self._data.dictionary_encode(null_encoding=null_encoding)
indices = pa.chunked_array(
[c.indices for c in encoded.chunks], type=encoded.type.index_type
).to_pandas()
Expand All @@ -613,16 +559,6 @@ def factorize(

if encoded.num_chunks:
uniques = type(self)(encoded.chunk(0).dictionary)
if resolved_na_sentinel is None and pa_version_under4p0:
# TODO: share logic with BaseMaskedArray.factorize
# Insert na with the proper code
na_mask = indices.values == -1
na_index = na_mask.argmax()
if na_mask[na_index]:
na_code = 0 if na_index == 0 else indices[:na_index].max() + 1
uniques = uniques.insert(na_code, self.dtype.na_value)
indices[indices >= na_code] += 1
indices[indices == -1] = na_code
else:
uniques = type(self)(pa.array([], type=encoded.type.value_type))

Expand Down Expand Up @@ -740,11 +676,7 @@ def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
-------
ArrowExtensionArray
"""
if pa_version_under2p0:
fallback_performancewarning(version="2")
return super().unique()
else:
return type(self)(pc.unique(self._data))
return type(self)(pc.unique(self._data))

def value_counts(self, dropna: bool = True) -> Series:
"""
Expand Down Expand Up @@ -957,10 +889,6 @@ def _quantile(
-------
same type as self
"""
if pa_version_under4p0:
raise NotImplementedError(
"quantile only supported for pyarrow version >= 4.0"
)
result = pc.quantile(self._data, q=qs, interpolation=interpolation)
return type(self)(result)

Expand Down
8 changes: 4 additions & 4 deletions pandas/core/arrays/arrow/dtype.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
import numpy as np

from pandas._typing import DtypeObj
from pandas.compat import pa_version_under1p01
from pandas.compat import pa_version_under5p0
from pandas.util._decorators import cache_readonly

from pandas.core.dtypes.base import (
StorageExtensionDtype,
register_extension_dtype,
)

if not pa_version_under1p01:
if not pa_version_under5p0:
import pyarrow as pa


Expand Down Expand Up @@ -66,8 +66,8 @@ class ArrowDtype(StorageExtensionDtype):

def __init__(self, pyarrow_dtype: pa.DataType) -> None:
super().__init__("pyarrow")
if pa_version_under1p01:
raise ImportError("pyarrow>=1.0.1 is required for ArrowDtype")
if pa_version_under5p0:
raise ImportError("pyarrow>=5.0.0 is required for ArrowDtype")
if not isinstance(pyarrow_dtype, pa.DataType):
raise ValueError(
f"pyarrow_dtype ({pyarrow_dtype}) must be an instance "
Expand Down
Loading