
Commit 525f688

mroeschke authored and noatamir committed
DEPS: Bump PyArrow to 6.0 (pandas-dev#49096)
1 parent 47c7aaf commit 525f688

24 files changed: +168 −1122 lines changed

.github/workflows/ubuntu.yml

+1 −3
@@ -31,9 +31,7 @@ jobs:
     matrix:
       env_file: [actions-38.yaml, actions-39.yaml, actions-310.yaml]
       pattern: ["not single_cpu", "single_cpu"]
-      # Don't test pyarrow v2/3: Causes timeouts in read_csv engine
-      # even if tests are skipped/xfailed
-      pyarrow_version: ["5", "6", "7"]
+      pyarrow_version: ["7", "8", "9"]
       include:
         - name: "Downstream Compat"
           env_file: actions-38-downstream_compat.yaml

ci/deps/actions-38-minimum_versions.yaml

+1 −1
@@ -39,7 +39,7 @@ dependencies:
   - openpyxl=3.0.7
   - pandas-gbq=0.15.0
   - psycopg2=2.8.6
-  - pyarrow=1.0.1
+  - pyarrow=6.0.0
   - pymysql=1.0.2
   - pyreadstat=1.1.2
   - pytables=3.6.1

doc/source/getting_started/install.rst

+2 −2
@@ -388,7 +388,7 @@ PyTables 3.6.1 HDF5-based reading / writing
 blosc                     1.21.0             Compression for HDF5
 zlib                                         Compression for HDF5
 fastparquet               0.4.0              Parquet reading / writing
-pyarrow                   1.0.1              Parquet, ORC, and feather reading / writing
+pyarrow                   6.0.0              Parquet, ORC, and feather reading / writing
 pyreadstat                1.1.2              SPSS files (.sav) reading
 ========================= ================== =============================================================

@@ -402,7 +402,7 @@ pyreadstat 1.1.2 SPSS files (.sav) reading
 ========================= ================== =============================================================
 System                    Conda              PyPI
 ========================= ================== =============================================================
-Linux                     Successful         Failed(pyarrow==3.0 Successful)
+Linux                     Successful         Failed
 macOS                     Successful         Failed
 Windows                   Failed             Failed
 ========================= ================== =============================================================

doc/source/whatsnew/v2.0.0.rst

+1 −1
@@ -109,7 +109,7 @@ Optional libraries below the lowest tested version may still work, but are not c
 +-----------------+-----------------+---------+
 | Package         | Minimum Version | Changed |
 +=================+=================+=========+
-|                 |                 |    X    |
+| pyarrow         | 6.0.0           |    X    |
 +-----------------+-----------------+---------+

 See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.

pandas/_testing/__init__.py

+2 −2
@@ -28,7 +28,7 @@
     Dtype,
     Frequency,
 )
-from pandas.compat import pa_version_under1p01
+from pandas.compat import pa_version_under6p0

 from pandas.core.dtypes.common import (
     is_float_dtype,
@@ -194,7 +194,7 @@
     ]
 ]

-if not pa_version_under1p01:
+if not pa_version_under6p0:
     import pyarrow as pa

     UNSIGNED_INT_PYARROW_DTYPES = [pa.uint8(), pa.uint16(), pa.uint32(), pa.uint64()]

pandas/compat/__init__.py

-10
@@ -20,11 +20,6 @@
     np_version_under1p21,
 )
 from pandas.compat.pyarrow import (
-    pa_version_under1p01,
-    pa_version_under2p0,
-    pa_version_under3p0,
-    pa_version_under4p0,
-    pa_version_under5p0,
     pa_version_under6p0,
     pa_version_under7p0,
     pa_version_under8p0,
@@ -154,11 +149,6 @@ def get_lzma_file() -> type[lzma.LZMAFile]:
 __all__ = [
     "is_numpy_dev",
     "np_version_under1p21",
-    "pa_version_under1p01",
-    "pa_version_under2p0",
-    "pa_version_under3p0",
-    "pa_version_under4p0",
-    "pa_version_under5p0",
     "pa_version_under6p0",
     "pa_version_under7p0",
     "pa_version_under8p0",

pandas/compat/_optional.py

+1 −1
@@ -31,7 +31,7 @@
     "pandas_gbq": "0.15.0",
     "psycopg2": "2.8.6",  # (dt dec pq3 ext lo64)
     "pymysql": "1.0.2",
-    "pyarrow": "1.0.1",
+    "pyarrow": "6.0.0",
     "pyreadstat": "1.1.2",
     "pytest": "6.0",
     "pyxlsb": "1.0.8",

pandas/compat/pyarrow.py

-10
@@ -9,21 +9,11 @@

     _pa_version = pa.__version__
     _palv = Version(_pa_version)
-    pa_version_under1p01 = _palv < Version("1.0.1")
-    pa_version_under2p0 = _palv < Version("2.0.0")
-    pa_version_under3p0 = _palv < Version("3.0.0")
-    pa_version_under4p0 = _palv < Version("4.0.0")
-    pa_version_under5p0 = _palv < Version("5.0.0")
     pa_version_under6p0 = _palv < Version("6.0.0")
     pa_version_under7p0 = _palv < Version("7.0.0")
     pa_version_under8p0 = _palv < Version("8.0.0")
     pa_version_under9p0 = _palv < Version("9.0.0")
 except ImportError:
-    pa_version_under1p01 = True
-    pa_version_under2p0 = True
-    pa_version_under3p0 = True
-    pa_version_under4p0 = True
-    pa_version_under5p0 = True
     pa_version_under6p0 = True
     pa_version_under7p0 = True
     pa_version_under8p0 = True
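
With the old flags gone, the module reduces to a handful of booleans computed once at import. A minimal standalone sketch of the same pattern, assuming packaging.version.Version as a stand-in for the Version helper the real module imports:

    from packaging.version import Version

    try:
        import pyarrow as pa

        _palv = Version(pa.__version__)
        pa_version_under6p0 = _palv < Version("6.0.0")
        pa_version_under7p0 = _palv < Version("7.0.0")
    except ImportError:
        # Without pyarrow every "under" flag is True, so callers simply skip
        # the pyarrow-backed code paths.
        pa_version_under6p0 = True
        pa_version_under7p0 = True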

pandas/core/arrays/arrow/array.py

+31 −104
@@ -17,11 +17,6 @@
     npt,
 )
 from pandas.compat import (
-    pa_version_under1p01,
-    pa_version_under2p0,
-    pa_version_under3p0,
-    pa_version_under4p0,
-    pa_version_under5p0,
     pa_version_under6p0,
     pa_version_under7p0,
 )
@@ -48,7 +43,7 @@
     validate_indices,
 )

-if not pa_version_under1p01:
+if not pa_version_under6p0:
     import pyarrow as pa
     import pyarrow.compute as pc

@@ -65,16 +60,12 @@
     }

     ARROW_LOGICAL_FUNCS = {
-        "and": NotImplemented if pa_version_under2p0 else pc.and_kleene,
-        "rand": NotImplemented
-        if pa_version_under2p0
-        else lambda x, y: pc.and_kleene(y, x),
-        "or": NotImplemented if pa_version_under2p0 else pc.or_kleene,
-        "ror": NotImplemented
-        if pa_version_under2p0
-        else lambda x, y: pc.or_kleene(y, x),
-        "xor": NotImplemented if pa_version_under2p0 else pc.xor,
-        "rxor": NotImplemented if pa_version_under2p0 else lambda x, y: pc.xor(y, x),
+        "and": pc.and_kleene,
+        "rand": lambda x, y: pc.and_kleene(y, x),
+        "or": pc.or_kleene,
+        "ror": lambda x, y: pc.or_kleene(y, x),
+        "xor": pc.xor,
+        "rxor": lambda x, y: pc.xor(y, x),
     }

     def cast_for_truediv(
@@ -100,38 +91,22 @@ def floordiv_compat(
         return result

     ARROW_ARITHMETIC_FUNCS = {
-        "add": NotImplemented if pa_version_under2p0 else pc.add_checked,
-        "radd": NotImplemented
-        if pa_version_under2p0
-        else lambda x, y: pc.add_checked(y, x),
-        "sub": NotImplemented if pa_version_under2p0 else pc.subtract_checked,
-        "rsub": NotImplemented
-        if pa_version_under2p0
-        else lambda x, y: pc.subtract_checked(y, x),
-        "mul": NotImplemented if pa_version_under2p0 else pc.multiply_checked,
-        "rmul": NotImplemented
-        if pa_version_under2p0
-        else lambda x, y: pc.multiply_checked(y, x),
-        "truediv": NotImplemented
-        if pa_version_under2p0
-        else lambda x, y: pc.divide_checked(cast_for_truediv(x, y), y),
-        "rtruediv": NotImplemented
-        if pa_version_under2p0
-        else lambda x, y: pc.divide_checked(y, cast_for_truediv(x, y)),
-        "floordiv": NotImplemented
-        if pa_version_under2p0
-        else lambda x, y: floordiv_compat(x, y),
-        "rfloordiv": NotImplemented
-        if pa_version_under2p0
-        else lambda x, y: floordiv_compat(y, x),
+        "add": pc.add_checked,
+        "radd": lambda x, y: pc.add_checked(y, x),
+        "sub": pc.subtract_checked,
+        "rsub": lambda x, y: pc.subtract_checked(y, x),
+        "mul": pc.multiply_checked,
+        "rmul": lambda x, y: pc.multiply_checked(y, x),
+        "truediv": lambda x, y: pc.divide_checked(cast_for_truediv(x, y), y),
+        "rtruediv": lambda x, y: pc.divide_checked(y, cast_for_truediv(x, y)),
+        "floordiv": lambda x, y: floordiv_compat(x, y),
+        "rfloordiv": lambda x, y: floordiv_compat(y, x),
         "mod": NotImplemented,
         "rmod": NotImplemented,
         "divmod": NotImplemented,
         "rdivmod": NotImplemented,
-        "pow": NotImplemented if pa_version_under4p0 else pc.power_checked,
-        "rpow": NotImplemented
-        if pa_version_under4p0
-        else lambda x, y: pc.power_checked(y, x),
+        "pow": pc.power_checked,
+        "rpow": lambda x, y: pc.power_checked(y, x),
     }

 if TYPE_CHECKING:
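
Because pyarrow >= 6.0 is now guaranteed wherever these tables are defined, each dispatch entry can bind straight to a pyarrow.compute kernel. A hedged illustration of the two kernel families the tables rely on (example data is made up):

    import pyarrow as pa
    import pyarrow.compute as pc

    # "checked" arithmetic raises ArrowInvalid on overflow instead of wrapping.
    ints = pa.array([1, 2, None], type=pa.int64())
    print(pc.add_checked(ints, 1))  # [2, 3, null]

    # Kleene kernels keep pandas' nullable-boolean semantics:
    # True AND null -> null, False AND null -> False.
    left = pa.array([True, False, None])
    right = pa.array([None, None, True], type=pa.bool_())
    print(pc.and_kleene(left, right))  # [null, false, null]
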
@@ -206,8 +181,8 @@ class ArrowExtensionArray(OpsMixin, ExtensionArray):
     _dtype: ArrowDtype

     def __init__(self, values: pa.Array | pa.ChunkedArray) -> None:
-        if pa_version_under1p01:
-            msg = "pyarrow>=1.0.0 is required for PyArrow backed ArrowExtensionArray."
+        if pa_version_under6p0:
+            msg = "pyarrow>=6.0.0 is required for PyArrow backed ArrowExtensionArray."
             raise ImportError(msg)
         if isinstance(values, pa.Array):
             self._data = pa.chunked_array([values])
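
For orientation, a sketch of what the constructor now accepts; the import path simply mirrors this file's location and is internal rather than public API, so treat it as illustrative:

    import pyarrow as pa

    from pandas.core.arrays.arrow.array import ArrowExtensionArray

    # Wraps a pa.Array into a ChunkedArray internally; on pyarrow < 6.0 the
    # constructor now raises ImportError with the message shown above.
    arr = ArrowExtensionArray(pa.array([1, 2, None]))
    print(arr.isna())  # [False False  True]
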
@@ -360,8 +335,6 @@ def __arrow_array__(self, type=None):
         return self._data

     def __invert__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
-        if pa_version_under2p0:
-            raise NotImplementedError("__invert__ not implement for pyarrow < 2.0")
         return type(self)(pc.invert(self._data))

     def __neg__(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
@@ -395,10 +368,7 @@ def _cmp_method(self, other, op):
                 f"{op.__name__} not implemented for {type(other)}"
             )

-        if pa_version_under2p0:
-            result = result.to_pandas().values
-        else:
-            result = result.to_numpy()
+        result = result.to_numpy()
         return BooleanArray._from_sequence(result)

     def _evaluate_op_method(self, other, op, arrow_funcs):
@@ -464,10 +434,7 @@ def isna(self) -> npt.NDArray[np.bool_]:

         This should return a 1-D array the same length as 'self'.
         """
-        if pa_version_under2p0:
-            return self._data.is_null().to_pandas().values
-        else:
-            return self._data.is_null().to_numpy()
+        return self._data.is_null().to_numpy()

     @deprecate_nonkeyword_arguments(version=None, allowed_args=["self"])
     def argsort(
@@ -492,10 +459,7 @@ def argsort(
         result = pc.array_sort_indices(
             self._data, order=order, null_placement=null_placement
         )
-        if pa_version_under2p0:
-            np_result = result.to_pandas().values
-        else:
-            np_result = result.to_numpy()
+        np_result = result.to_numpy()
         return np_result.astype(np.intp, copy=False)

     def _argmin_max(self, skipna: bool, method: str) -> int:
@@ -548,24 +512,11 @@ def dropna(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         return type(self)(pc.drop_null(self._data))

     def isin(self, values) -> npt.NDArray[np.bool_]:
-        if pa_version_under2p0:
-            fallback_performancewarning(version="2")
-            return super().isin(values)
-
-        # for an empty value_set pyarrow 3.0.0 segfaults and pyarrow 2.0.0 returns True
-        # for null values, so we short-circuit to return all False array.
+        # short-circuit to return all False array.
         if not len(values):
             return np.zeros(len(self), dtype=bool)

-        kwargs = {}
-        if pa_version_under3p0:
-            # in pyarrow 2.0.0 skip_null is ignored but is a required keyword and raises
-            # with unexpected keyword argument in pyarrow 3.0.0+
-            kwargs["skip_null"] = True
-
-        result = pc.is_in(
-            self._data, value_set=pa.array(values, from_pandas=True), **kwargs
-        )
+        result = pc.is_in(self._data, value_set=pa.array(values, from_pandas=True))
         # pyarrow 2.0.0 returned nulls, so we explicitly specify dtype to convert nulls
         # to False
         return np.array(result, dtype=np.bool_)
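
A hedged sketch of the simplified membership test with made-up data; the final numpy cast is what turns any null produced by the kernel into False:

    import numpy as np
    import pyarrow as pa
    import pyarrow.compute as pc

    data = pa.chunked_array([pa.array(["a", "b", None, "c"])])
    value_set = pa.array(["a", "c"], from_pandas=True)

    result = pc.is_in(data, value_set=value_set)
    print(np.array(result, dtype=np.bool_))  # [ True False False  True]
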
@@ -584,10 +535,7 @@ def _values_for_factorize(self) -> tuple[np.ndarray, Any]:
         The values returned by this method are also used in
         :func:`pandas.util.hash_pandas_object`.
         """
-        if pa_version_under2p0:
-            values = self._data.to_pandas().values
-        else:
-            values = self._data.to_numpy()
+        values = self._data.to_numpy()
         return values, self.dtype.na_value

     @doc(ExtensionArray.factorize)
@@ -597,11 +545,8 @@ def factorize(
         use_na_sentinel: bool | lib.NoDefault = lib.no_default,
     ) -> tuple[np.ndarray, ExtensionArray]:
         resolved_na_sentinel = resolve_na_sentinel(na_sentinel, use_na_sentinel)
-        if pa_version_under4p0:
-            encoded = self._data.dictionary_encode()
-        else:
-            null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
-            encoded = self._data.dictionary_encode(null_encoding=null_encoding)
+        null_encoding = "mask" if resolved_na_sentinel is not None else "encode"
+        encoded = self._data.dictionary_encode(null_encoding=null_encoding)
         indices = pa.chunked_array(
             [c.indices for c in encoded.chunks], type=encoded.type.index_type
         ).to_pandas()
@@ -613,16 +558,6 @@

         if encoded.num_chunks:
             uniques = type(self)(encoded.chunk(0).dictionary)
-            if resolved_na_sentinel is None and pa_version_under4p0:
-                # TODO: share logic with BaseMaskedArray.factorize
-                # Insert na with the proper code
-                na_mask = indices.values == -1
-                na_index = na_mask.argmax()
-                if na_mask[na_index]:
-                    na_code = 0 if na_index == 0 else indices[:na_index].max() + 1
-                    uniques = uniques.insert(na_code, self.dtype.na_value)
-                    indices[indices >= na_code] += 1
-                    indices[indices == -1] = na_code
         else:
             uniques = type(self)(pa.array([], type=encoded.type.value_type))
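
The null_encoding keyword needs pyarrow >= 4.0, so with a 6.0 floor it is always available and the old fallback that re-inserted NA codes by hand can go. A small illustration with made-up data:

    import pyarrow as pa

    data = pa.chunked_array([pa.array(["a", None, "b", "a"])])

    # "mask" keeps nulls out of the dictionary and leaves a null index in
    # their place; "encode" would give nulls their own dictionary code.
    encoded = data.dictionary_encode(null_encoding="mask")
    print(encoded.chunk(0).indices)     # [0, null, 1, 0]
    print(encoded.chunk(0).dictionary)  # ["a", "b"]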

@@ -740,11 +675,7 @@ def unique(self: ArrowExtensionArrayT) -> ArrowExtensionArrayT:
         -------
         ArrowExtensionArray
         """
-        if pa_version_under2p0:
-            fallback_performancewarning(version="2")
-            return super().unique()
-        else:
-            return type(self)(pc.unique(self._data))
+        return type(self)(pc.unique(self._data))

     def value_counts(self, dropna: bool = True) -> Series:
         """
@@ -957,10 +888,6 @@ def _quantile(
         -------
         same type as self
         """
-        if pa_version_under4p0:
-            raise NotImplementedError(
-                "quantile only supported for pyarrow version >= 4.0"
-            )
         result = pc.quantile(self._data, q=qs, interpolation=interpolation)
         return type(self)(result)
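
pyarrow.compute.quantile has been available since pyarrow 4.0, so the removed guard was the only thing between _quantile and the kernel. An illustrative call with made-up data:

    import pyarrow as pa
    import pyarrow.compute as pc

    arr = pa.array([1.0, 2.0, None, 4.0])
    # Nulls are skipped by default; the result is a double array of quantiles.
    print(pc.quantile(arr, q=[0.25, 0.5, 0.75], interpolation="linear"))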

@@ -1076,7 +1003,7 @@ def _replace_with_indices(
         mask = np.zeros(len(chunk), dtype=np.bool_)
         mask[indices] = True

-        if pa_version_under5p0:
+        if pa_version_under6p0:
             arr = chunk.to_numpy(zero_copy_only=False)
             arr[mask] = value
             return pa.array(arr, type=chunk.type)
