Commit 44b0fe8  (1 parent: b4d0bfe)

Updated pyarrow dep to 0.7.0
Addressed review comments

6 files changed, +86 -150 lines


ci/requirements-2.7.sh  (+1 -1)

@@ -4,4 +4,4 @@ source activate pandas

 echo "install 27"

-conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0 fastparquet

ci/requirements-3.5.sh  (+1 -1)

@@ -8,4 +8,4 @@ echo "install 35"
 conda remove -n pandas python-dateutil --force
 pip install python-dateutil

-conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0
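
These pins have a runtime counterpart: pandas/io/parquet.py gates each engine with a LooseVersion check. A standalone sketch of that guard (illustrative, not the exact pandas source):

from distutils.version import LooseVersion

import pyarrow

# mirror of the version gate this commit adds to PyArrowImpl.__init__
if LooseVersion(pyarrow.__version__) < '0.7.0':
    raise ImportError("pyarrow >= 0.7.0 is required for parquet support")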

doc/source/install.rst  (+1 -1)

@@ -233,7 +233,7 @@ Optional Dependencies
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
 * `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
-* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
+* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/>`__ (>= 0.1.0) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ libraries are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:

 * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
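
For context on the bumped minimums, a minimal usage sketch (the frame and path are illustrative); pandas selects the parquet engine via the engine argument, with the default 'auto' trying pyarrow first and then fastparquet:

import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})

# engine may be 'pyarrow' (>= 0.7.0 after this commit) or
# 'fastparquet' (>= 0.1.0); snappy/brotli matter only for `compression`
df.to_parquet('example.parquet', engine='pyarrow', compression='snappy')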

doc/source/whatsnew/v0.22.0.txt  (+1 -1)

@@ -163,7 +163,7 @@ Other Enhancements
 - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
 - Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)
 - :func:`DataFrame.corrwith` now silently drops non-numeric columns when passed a Series. Before, an exception was raised (:issue:`18570`).
-
+- Enabled the use of non-default indexes in :func:`DataFrame.to_parquet` where the underlying engine supports it (:issue:`18581`)

 .. _whatsnew_0220.api_breaking:
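
A hedged illustration of the enhancement noted above, assuming a pyarrow >= 0.7.0 engine is installed ('example.parquet' is an illustrative path):

import pandas as pd

df = pd.DataFrame({'A': [1, 2, 3]},
                  index=pd.Index(['x', 'y', 'z'], name='key'))

# previously any non-default index raised ValueError on write; with
# pyarrow >= 0.7.0 the index round-trips through the parquet file
df.to_parquet('example.parquet', engine='pyarrow')
result = pd.read_parquet('example.parquet', engine='pyarrow')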

pandas/io/parquet.py  (+48 -47)

@@ -4,6 +4,7 @@
 from distutils.version import LooseVersion
 from pandas import DataFrame, RangeIndex, Int64Index, get_option
 from pandas.compat import range
+from pandas.core.common import AbstractMethodError
 from pandas.io.common import get_filepath_or_buffer


@@ -39,54 +40,30 @@ class BaseImpl(object):
     api = None  # module

     @staticmethod
-    def _validate_index(df):
-        if not isinstance(df.index, Int64Index):
-            msg = (
-                "parquet does not support serializing {} for the index;"
-                "you can .reset_index() to make the index into column(s)"
-            )
-            raise ValueError(msg.format(type(df.index)))
-        if not df.index.equals(RangeIndex(len(df))):
-            raise ValueError(
-                "parquet does not support serializing a non-default index "
-                "for the index; you can .reset_index() to make the index "
-                "into column(s)"
-            )
-        if df.index.name is not None:
-            raise ValueError(
-                "parquet does not serialize index meta-data "
-                "on a default index"
-            )
-
-    @staticmethod
-    def _validate_columns(df):
+    def validate_dataframe(df):
+        if not isinstance(df, DataFrame):
+            raise ValueError("to_parquet only supports IO with DataFrames")
         # must have value column names (strings only)
         if df.columns.inferred_type not in {'string', 'unicode'}:
             raise ValueError("parquet must have string column names")

-    def validate_dataframe(self, df):
-        if not isinstance(df, DataFrame):
-            raise ValueError("to_parquet only support IO with DataFrames")
-        self._validate_columns(df)
-        self._validate_index(df)
-
     def write(self, df, path, compression, **kwargs):
-        raise NotImplementedError()
+        raise AbstractMethodError(self)

     def read(self, path, columns=None, **kwargs):
-        raise NotImplementedError()
+        raise AbstractMethodError(self)


 class PyArrowImpl(BaseImpl):

     def __init__(self):
         # since pandas is a dependency of pyarrow
         # we need to import on first use
-
         try:
             import pyarrow
             import pyarrow.parquet
         except ImportError:
+<<<<<<< HEAD
             raise ImportError("pyarrow is required for parquet support\n\n"
                               "you can install via conda\n"
                               "conda install pyarrow -c conda-forge\n"
@@ -104,32 +81,38 @@ def __init__(self):
         self._pyarrow_lt_050 = LooseVersion(pyarrow.__version__) < '0.5.0'
         self._pyarrow_lt_060 = LooseVersion(pyarrow.__version__) < '0.6.0'
         self._pyarrow_lt_070 = LooseVersion(pyarrow.__version__) < '0.7.0'
+=======
+            raise ImportError(
+                "pyarrow is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
+        if LooseVersion(pyarrow.__version__) < '0.7.0':
+            raise ImportError(
+                "pyarrow >= 0.7.0 is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
+>>>>>>> Updated pyarrow dep to 0.7.0
         self.api = pyarrow

-    def _validate_index(self, df):
-        # pyarrow >= 0.7.0 supports multi-indexes so no need to validate
-        if self._pyarrow_lt_070:
-            super(PyArrowImpl, self)._validate_index(df)
-
     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', **kwargs):
         self.validate_dataframe(df)
         path, _, _ = get_filepath_or_buffer(path)
-        if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
-            self.api.parquet.write_table(
-                table, path, compression=compression, **kwargs)
-
-        else:
-            table = self.api.Table.from_pandas(df)
-            self.api.parquet.write_table(
-                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+        table = self.api.Table.from_pandas(df)
+        self.api.parquet.write_table(
+            table, path, compression=compression,
+            coerce_timestamps=coerce_timestamps, **kwargs)

     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path, columns=columns,
-                                           **kwargs).to_pandas()
+        return self.api.parquet.read_table(
+            path, columns=columns, **kwargs).to_pandas()


 class FastParquetImpl(BaseImpl):
@@ -140,6 +123,7 @@ def __init__(self):
         try:
             import fastparquet
         except ImportError:
+<<<<<<< HEAD
             raise ImportError("fastparquet is required for parquet support\n\n"
                               "you can install via conda\n"
                               "conda install fastparquet -c conda-forge\n"
@@ -154,6 +138,23 @@ def __init__(self):
                               "\nor via pip\n"
                               "pip install -U fastparquet")

+=======
+            raise ImportError(
+                "fastparquet is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet"
+            )
+        if LooseVersion(fastparquet.__version__) < '0.1.0':
+            raise ImportError(
+                "fastparquet >= 0.1.0 is required for parquet "
+                "support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet")
+>>>>>>> Updated pyarrow dep to 0.7.0
         self.api = fastparquet

     def write(self, df, path, compression='snappy', **kwargs):
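
For orientation, a standalone sketch of the single write path that remains in PyArrowImpl once the pre-0.6.0 branches are removed, assuming pyarrow >= 0.7.0 ('example.parquet' is illustrative):

import pandas as pd
import pyarrow
import pyarrow.parquet

df = pd.DataFrame({'A': [1, 2, 3]})

# PyArrowImpl.write now reduces to these two calls, with no
# version-specific branching
table = pyarrow.Table.from_pandas(df)
pyarrow.parquet.write_table(table, 'example.parquet',
                            compression='snappy', coerce_timestamps='ms')

# PyArrowImpl.read is the mirror image
result = pyarrow.parquet.read_table('example.parquet').to_pandas()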

pandas/tests/io/test_parquet.py  (+34 -99)

@@ -43,6 +43,7 @@ def engine(request):
 def pa():
     if not _HAVE_PYARROW:
         pytest.skip("pyarrow is not installed")
+<<<<<<< HEAD
     return 'pyarrow'


@@ -60,6 +61,9 @@ def pa_ge_070():
     if not _HAVE_PYARROW:
         pytest.skip("pyarrow is not installed")
     if LooseVersion(pyarrow.__version__) < LooseVersion('0.7.0'):
+=======
+    if LooseVersion(pyarrow.__version__) < '0.7.0':
+>>>>>>> Updated pyarrow dep to 0.7.0
         pytest.skip("pyarrow is < 0.7.0")
     return 'pyarrow'

@@ -68,6 +72,8 @@ def pa_ge_070():
 def fp():
     if not _HAVE_FASTPARQUET:
         pytest.skip("fastparquet is not installed")
+    if LooseVersion(fastparquet.__version__) < '0.1.0':
+        pytest.skip("fastparquet is < 0.1.0")
     return 'fastparquet'


@@ -181,9 +187,7 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
 class Base(object):

     def check_error_on_write(self, df, engine, exc):
-        # check that we are raising the exception
-        # on writing
-
+        # check that we are raising the exception on writing
         with pytest.raises(exc):
             with tm.ensure_clean() as path:
                 to_parquet(df, path, engine, compression=None)
@@ -270,6 +274,32 @@ def test_read_columns(self, engine):
                               write_kwargs={'compression': None},
                               read_kwargs={'columns': ['string']})

+    def test_write_with_index(self, engine):
+        df = pd.DataFrame({'A': [1, 2, 3]})
+        self.check_round_trip(df, engine, write_kwargs={'compression': None})
+
+        # non-default index
+        for index in [[2, 3, 4],
+                      pd.date_range('20130101', periods=3),
+                      list('abc'),
+                      [1, 3, 4],
+                      pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
+                                                 ('b', 1)]),
+                      ]:
+
+            df.index = index
+            self.check_round_trip(df, engine)
+
+        # index with meta-data
+        df.index = [0, 1, 2]
+        df.index.name = 'foo'
+        self.check_round_trip(df, engine)
+
+        # column multi-index
+        df.index = [0, 1, 2]
+        df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)])
+        self.check_error_on_write(df, engine, ValueError)
+

 class TestParquetPyArrow(Base):

@@ -295,14 +325,12 @@ def test_basic(self, pa):
         self.check_round_trip(df, pa)

     def test_duplicate_columns(self, pa):
-
         # not currently able to handle duplicate columns
         df = pd.DataFrame(np.arange(12).reshape(4, 3),
                           columns=list('aaa')).copy()
         self.check_error_on_write(df, pa, ValueError)

     def test_unsupported(self, pa):
-
         # period
         df = pd.DataFrame({'a': pd.period_range('2013', freq='M', periods=3)})
         self.check_error_on_write(df, pa, ValueError)
@@ -316,78 +344,13 @@ def test_unsupported(self, pa):
         df = pd.DataFrame({'a': ['a', 1, 2.0]})
         self.check_error_on_write(df, pa, ValueError)

-    def test_categorical(self, pa_ge_070):
-        pa = pa_ge_070
-
-        # supported in >= 0.7.0
+    def test_categorical(self, pa):
         df = pd.DataFrame({'a': pd.Categorical(list('abc'))})

         # de-serialized as object
         expected = df.assign(a=df.a.astype(object))
         self.check_round_trip(df, pa, expected)

-    def test_categorical_unsupported(self, pa_lt_070):
-        pa = pa_lt_070
-
-        # supported in >= 0.7.0
-        df = pd.DataFrame({'a': pd.Categorical(list('abc'))})
-        self.check_error_on_write(df, pa, NotImplementedError)
-
-    def test_write_with_index(self, pa_lt_070):
-        engine = pa_lt_070
-        df = pd.DataFrame({'A': [1, 2, 3]})
-        self.check_round_trip(df, engine, write_kwargs={'compression': None})
-
-        # non-default index
-        for index in [[2, 3, 4],
-                      pd.date_range('20130101', periods=3),
-                      list('abc'),
-                      [1, 3, 4],
-                      pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
-                                                 ('b', 1)]),
-                      ]:
-
-            df.index = index
-            self.check_error_on_write(df, engine, ValueError)
-
-        # index with meta-data
-        df.index = [0, 1, 2]
-        df.index.name = 'foo'
-        self.check_error_on_write(df, engine, ValueError)
-
-        # column multi-index
-        df.index = [0, 1, 2]
-        df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
-        self.check_error_on_write(df, engine, ValueError)
-
-    def test_write_with_non_default_index(self, pa_ge_070):
-        engine = pa_ge_070
-
-        df = pd.DataFrame({'A': [1, 2, 3]})
-        self.check_round_trip(df, engine, write_kwargs={'compression': None})
-
-        # non-default index
-        for index in [[2, 3, 4],
-                      pd.date_range('20130101', periods=3),
-                      list('abc'),
-                      [1, 3, 4],
-                      pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
-                                                 ('b', 1)]),
-                      ]:
-
-            df.index = index
-            self.check_round_trip(df, engine)
-
-        # index with meta-data
-        df.index = [0, 1, 2]
-        df.index.name = 'foo'
-        self.check_round_trip(df, engine)
-
-        # column multi-index
-        df.index = [0, 1, 2]
-        df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
-        self.check_error_on_write(df, engine, ValueError)
-

 class TestParquetFastParquet(Base):

@@ -413,34 +376,6 @@ def test_basic(self, fp):

         self.check_round_trip(df, fp, write_kwargs={'compression': None})

-    def test_write_with_index(self, fp):
-        engine = fp
-
-        df = pd.DataFrame({'A': [1, 2, 3]})
-        self.check_round_trip(df, engine, write_kwargs={'compression': None})
-
-        # non-default index
-        for index in [[2, 3, 4],
-                      pd.date_range('20130101', periods=3),
-                      list('abc'),
-                      [1, 3, 4],
-                      pd.MultiIndex.from_tuples([('a', 1), ('a', 2),
-                                                 ('b', 1)]),
-                      ]:
-
-            df.index = index
-            self.check_error_on_write(df, engine, ValueError)
-
-        # index with meta-data
-        df.index = [0, 1, 2]
-        df.index.name = 'foo'
-        self.check_error_on_write(df, engine, ValueError)
-
-        # column multi-index
-        df.index = [0, 1, 2]
-        df.columns = pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1)]),
-        self.check_error_on_write(df, engine, ValueError)
-
     @pytest.mark.skip(reason="not supported")
     def test_duplicate_columns(self, fp):