Skip to content

Commit 5db3fae

Browse files
anjsudh authored and TomAugspurger committed
Bumping up min version for pyarrow and fastparquet (#23482)
* Bumping up min version for pyarrow
1 parent f0877ec commit 5db3fae

File tree

7 files changed

+32
-104
lines changed

7 files changed

+32
-104
lines changed

ci/requirements-optional-conda.txt

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
beautifulsoup4>=4.2.1
22
blosc
33
bottleneck>=1.2.0
4-
fastparquet
4+
fastparquet>=0.1.2
55
gcsfs
66
html5lib
77
ipython>=5.6.0
@@ -12,7 +12,7 @@ matplotlib>=2.0.0
1212
nbsphinx
1313
numexpr>=2.6.1
1414
openpyxl
15-
pyarrow>=0.4.1
15+
pyarrow>=0.7.0
1616
pymysql
1717
pytables>=3.4.2
1818
pytest-cov

ci/requirements-optional-pip.txt

+4-4
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
beautifulsoup4>=4.2.1
44
blosc
55
bottleneck>=1.2.0
6-
fastparquet
6+
fastparquet>=0.1.2
77
gcsfs
88
html5lib
99
ipython>=5.6.0
@@ -14,9 +14,9 @@ matplotlib>=2.0.0
1414
nbsphinx
1515
numexpr>=2.6.1
1616
openpyxl
17-
pyarrow>=0.4.1
17+
pyarrow>=0.7.0
1818
pymysql
19-
tables
19+
pytables>=3.4.2
2020
pytest-cov
2121
pytest-xdist
2222
s3fs
@@ -27,4 +27,4 @@ statsmodels
2727
xarray
2828
xlrd
2929
xlsxwriter
30-
xlwt
30+
xlwt

ci/travis-27.yaml

+1-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ dependencies:
2222
- patsy
2323
- psycopg2
2424
- py
25-
- pyarrow=0.4.1
25+
- pyarrow=0.7.0
2626
- PyCrypto
2727
- pymysql=0.6.3
2828
- pytables

doc/source/install.rst

+2-2
Original file line numberDiff line numberDiff line change
@@ -258,8 +258,8 @@ Optional Dependencies
258258
* `SciPy <http://www.scipy.org>`__: miscellaneous statistical functions, Version 0.18.1 or higher
259259
* `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
260260
* `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage, Version 3.4.2 or higher
261-
* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1): necessary for feather-based storage.
262-
* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ are available for compression support.
261+
* `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0): necessary for feather-based storage.
262+
* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0) or `fastparquet <https://fastparquet.readthedocs.io/en/latest>`__ (>= 0.1.2) for parquet-based storage. The `snappy <https://pypi.org/project/python-snappy>`__ and `brotli <https://pypi.org/project/brotlipy>`__ are available for compression support.
263263
* `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
264264

265265
* `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL

doc/source/whatsnew/v0.24.0.txt

+5-1
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ Backwards incompatible API changes
250250
Dependencies have increased minimum versions
251251
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
252252

253-
We have updated our minimum supported versions of dependencies (:issue:`21242`).
253+
We have updated our minimum supported versions of dependencies (:issue:`21242`, `18742`).
254254
If installed, we now require:
255255

256256
+-----------------+-----------------+----------+
@@ -268,6 +268,10 @@ If installed, we now require:
268268
+-----------------+-----------------+----------+
269269
| scipy | 0.18.1 | |
270270
+-----------------+-----------------+----------+
271+
| pyarrow | 0.7.0 | |
272+
+-----------------+-----------------+----------+
273+
| fastparquet | 0.1.2 | |
274+
+-----------------+-----------------+----------+
271275

272276
Additionally we no longer depend on `feather-format` for feather based storage
273277
and replaced it with references to `pyarrow` (:issue:`21639` and :issue:`23053`).

pandas/io/parquet.py

+13-65
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66
from pandas.compat import string_types
77

8-
from pandas import DataFrame, Int64Index, RangeIndex, get_option
8+
from pandas import DataFrame, get_option
99
import pandas.core.common as com
1010

1111
from pandas.io.common import get_filepath_or_buffer, is_s3_url
@@ -89,57 +89,38 @@ def __init__(self):
8989
"\nor via pip\n"
9090
"pip install -U pyarrow\n"
9191
)
92-
if LooseVersion(pyarrow.__version__) < '0.4.1':
92+
if LooseVersion(pyarrow.__version__) < '0.7.0':
9393
raise ImportError(
94-
"pyarrow >= 0.4.1 is required for parquet support\n\n"
94+
"pyarrow >= 0.7.0 is required for parquet support\n\n"
9595
"you can install via conda\n"
9696
"conda install pyarrow -c conda-forge\n"
9797
"\nor via pip\n"
9898
"pip install -U pyarrow\n"
9999
)
100100

101-
self._pyarrow_lt_060 = (
102-
LooseVersion(pyarrow.__version__) < LooseVersion('0.6.0'))
103-
self._pyarrow_lt_070 = (
104-
LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'))
105-
106101
self.api = pyarrow
107102

108103
def write(self, df, path, compression='snappy',
109104
coerce_timestamps='ms', index=None, **kwargs):
110105
self.validate_dataframe(df)
111-
112-
# Only validate the index if we're writing it.
113-
if self._pyarrow_lt_070 and index is not False:
114-
self._validate_write_lt_070(df)
115106
path, _, _, _ = get_filepath_or_buffer(path, mode='wb')
116107

117108
if index is None:
118109
from_pandas_kwargs = {}
119110
else:
120111
from_pandas_kwargs = {'preserve_index': index}
121112

122-
if self._pyarrow_lt_060:
123-
table = self.api.Table.from_pandas(df, timestamps_to_ms=True,
124-
**from_pandas_kwargs)
125-
self.api.parquet.write_table(
126-
table, path, compression=compression, **kwargs)
127-
128-
else:
129-
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
130-
self.api.parquet.write_table(
131-
table, path, compression=compression,
132-
coerce_timestamps=coerce_timestamps, **kwargs)
113+
table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
114+
self.api.parquet.write_table(
115+
table, path, compression=compression,
116+
coerce_timestamps=coerce_timestamps, **kwargs)
133117

134118
def read(self, path, columns=None, **kwargs):
135119
path, _, _, should_close = get_filepath_or_buffer(path)
136-
if self._pyarrow_lt_070:
137-
result = self.api.parquet.read_pandas(path, columns=columns,
138-
**kwargs).to_pandas()
139-
else:
140-
kwargs['use_pandas_metadata'] = True
141-
result = self.api.parquet.read_table(path, columns=columns,
142-
**kwargs).to_pandas()
120+
121+
kwargs['use_pandas_metadata'] = True
122+
result = self.api.parquet.read_table(path, columns=columns,
123+
**kwargs).to_pandas()
143124
if should_close:
144125
try:
145126
path.close()
@@ -148,39 +129,6 @@ def read(self, path, columns=None, **kwargs):
148129

149130
return result
150131

151-
def _validate_write_lt_070(self, df):
152-
# Compatibility shim for pyarrow < 0.7.0
153-
# TODO: Remove in pandas 0.23.0
154-
from pandas.core.indexes.multi import MultiIndex
155-
if isinstance(df.index, MultiIndex):
156-
msg = (
157-
"Multi-index DataFrames are only supported "
158-
"with pyarrow >= 0.7.0"
159-
)
160-
raise ValueError(msg)
161-
# Validate index
162-
if not isinstance(df.index, Int64Index):
163-
msg = (
164-
"pyarrow < 0.7.0 does not support serializing {} for the "
165-
"index; you can .reset_index() to make the index into "
166-
"column(s), or install the latest version of pyarrow or "
167-
"fastparquet."
168-
)
169-
raise ValueError(msg.format(type(df.index)))
170-
if not df.index.equals(RangeIndex(len(df))):
171-
raise ValueError(
172-
"pyarrow < 0.7.0 does not support serializing a non-default "
173-
"index; you can .reset_index() to make the index into "
174-
"column(s), or install the latest version of pyarrow or "
175-
"fastparquet."
176-
)
177-
if df.index.name is not None:
178-
raise ValueError(
179-
"pyarrow < 0.7.0 does not serialize indexes with a name; you "
180-
"can set the index.name to None or install the latest version "
181-
"of pyarrow or fastparquet."
182-
)
183-
184132

185133
class FastParquetImpl(BaseImpl):
186134

@@ -197,9 +145,9 @@ def __init__(self):
197145
"\nor via pip\n"
198146
"pip install -U fastparquet"
199147
)
200-
if LooseVersion(fastparquet.__version__) < '0.1.0':
148+
if LooseVersion(fastparquet.__version__) < '0.1.2':
201149
raise ImportError(
202-
"fastparquet >= 0.1.0 is required for parquet "
150+
"fastparquet >= 0.1.2 is required for parquet "
203151
"support\n\n"
204152
"you can install via conda\n"
205153
"conda install fastparquet -c conda-forge\n"

pandas/tests/io/test_parquet.py

+5-29
Original file line numberDiff line numberDiff line change
@@ -41,22 +41,6 @@ def engine(request):
4141

4242
@pytest.fixture
4343
def pa():
44-
if not _HAVE_PYARROW:
45-
pytest.skip("pyarrow is not installed")
46-
return 'pyarrow'
47-
48-
49-
@pytest.fixture
50-
def pa_lt_070():
51-
if not _HAVE_PYARROW:
52-
pytest.skip("pyarrow is not installed")
53-
if LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'):
54-
pytest.skip("pyarrow is >= 0.7.0")
55-
return 'pyarrow'
56-
57-
58-
@pytest.fixture
59-
def pa_ge_070():
6044
if not _HAVE_PYARROW:
6145
pytest.skip("pyarrow is not installed")
6246
if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
@@ -337,9 +321,9 @@ def test_write_index(self, engine):
337321
df.index.name = 'foo'
338322
check_round_trip(df, engine)
339323

340-
def test_write_multiindex(self, pa_ge_070):
324+
def test_write_multiindex(self, pa):
341325
# Not suppoprted in fastparquet as of 0.1.3 or older pyarrow version
342-
engine = pa_ge_070
326+
engine = pa
343327

344328
df = pd.DataFrame({'A': [1, 2, 3]})
345329
index = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
@@ -352,8 +336,8 @@ def test_write_column_multiindex(self, engine):
352336
df = pd.DataFrame(np.random.randn(4, 3), columns=mi_columns)
353337
self.check_error_on_write(df, engine, ValueError)
354338

355-
def test_multiindex_with_columns(self, pa_ge_070):
356-
engine = pa_ge_070
339+
def test_multiindex_with_columns(self, pa):
340+
engine = pa
357341
dates = pd.date_range('01-Jan-2018', '01-Dec-2018', freq='MS')
358342
df = pd.DataFrame(np.random.randn(2 * len(dates), 3),
359343
columns=list('ABC'))
@@ -456,8 +440,7 @@ def test_unsupported(self, pa):
456440
# older pyarrows raise ArrowInvalid
457441
self.check_error_on_write(df, pa, Exception)
458442

459-
def test_categorical(self, pa_ge_070):
460-
pa = pa_ge_070
443+
def test_categorical(self, pa):
461444

462445
# supported in >= 0.7.0
463446
df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
@@ -466,13 +449,6 @@ def test_categorical(self, pa_ge_070):
466449
expected = df.assign(a=df.a.astype(object))
467450
check_round_trip(df, pa, expected=expected)
468451

469-
def test_categorical_unsupported(self, pa_lt_070):
470-
pa = pa_lt_070
471-
472-
# supported in >= 0.7.0
473-
df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
474-
self.check_error_on_write(df, pa, NotImplementedError)
475-
476452
def test_s3_roundtrip(self, df_compat, s3_resource, pa):
477453
# GH #19134
478454
check_round_trip(df_compat, pa,

0 commit comments

Comments (0)