
Commit 1345847 (parent 9f16982)

Updated pyarrow dep to 0.7.0

Addressed review comments

6 files changed: +75, -196 lines

ci/requirements-2.7.sh (+1, -1)

@@ -4,4 +4,4 @@ source activate pandas
 
 echo "install 27"
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.4.1 fastparquet
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0 fastparquet

ci/requirements-3.5.sh (+1, -1)

@@ -8,4 +8,4 @@ echo "install 35"
 conda remove -n pandas python-dateutil --force
 pip install python-dateutil
 
-conda install -n pandas -c conda-forge feather-format pyarrow=0.5.0
+conda install -n pandas -c conda-forge feather-format pyarrow=0.7.0
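Both CI scripts now pin pyarrow at 0.7.0. A quick sanity check that an environment satisfies the new floor, mirroring the LooseVersion comparison used in pandas/io/parquet.py below (the assertion message is illustrative):

```python
from distutils.version import LooseVersion

import pyarrow

# same comparison the parquet code performs on import
assert LooseVersion(pyarrow.__version__) >= LooseVersion('0.7.0'), \
    "pyarrow >= 0.7.0 is required for parquet support"
```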

doc/source/install.rst (+1, -1)

@@ -233,7 +233,7 @@ Optional Dependencies
 * `xarray <http://xarray.pydata.org>`__: pandas like handling for > 2 dims, needed for converting Panels to xarray objects. Version 0.7.0 or higher is recommended.
 * `PyTables <http://www.pytables.org>`__: necessary for HDF5-based storage. Version 3.0.0 or higher required, Version 3.2.1 or higher highly recommended.
 * `Feather Format <https://github.com/wesm/feather>`__: necessary for feather-based storage, version 0.3.1 or higher.
-* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.4.1) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/necessary>`__ (>= 0.0.6) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ are available for compression support.
+* `Apache Parquet <https://parquet.apache.org/>`__, either `pyarrow <http://arrow.apache.org/docs/python/>`__ (>= 0.7.0) or `fastparquet <https://fastparquet.readthedocs.io/en/latest/necessary>`__ (>= 0.1.0) for parquet-based storage. The `snappy <https://pypi.python.org/pypi/python-snappy>`__ and `brotli <https://pypi.python.org/pypi/brotlipy>`__ are available for compression support.
 * `SQLAlchemy <http://www.sqlalchemy.org>`__: for SQL database support. Version 0.8.1 or higher recommended. Besides SQLAlchemy, you also need a database specific driver. You can find an overview of supported drivers for each SQL dialect in the `SQLAlchemy docs <http://docs.sqlalchemy.org/en/latest/dialects/index.html>`__. Some common drivers are:
 
 * `psycopg2 <http://initd.org/psycopg/>`__: for PostgreSQL
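Per the dependency note above, parquet I/O can be backed by either engine. A minimal sketch of picking one explicitly (the file name is arbitrary; engine='auto', the default, falls back between the two):

```python
import pandas as pd

df = pd.DataFrame({'a': [1, 2, 3]})

# write and read back with an explicit engine
df.to_parquet('out.parquet', engine='pyarrow', compression='snappy')
result = pd.read_parquet('out.parquet', engine='pyarrow')
```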

doc/source/whatsnew/v0.22.0.txt (+1, -1)

@@ -77,7 +77,7 @@ Other Enhancements
 - :func:`Series.fillna` now accepts a Series or a dict as a ``value`` for a categorical dtype (:issue:`17033`)
 - :func:`pandas.read_clipboard` updated to use qtpy, falling back to PyQt5 and then PyQt4, adding compatibility with Python3 and multiple python-qt bindings (:issue:`17722`)
 - Improved wording of ``ValueError`` raised in :func:`read_csv` when the ``usecols`` argument cannot match all columns. (:issue:`17301`)
-- Enabled the use of non-default indexes in ``to_parquet`` with pyarrow>=0.7.0 (:issue:`18581`)
+- Enabled the use of non-default indexes in :func:`DataFrame.to_parquet` where the underlying engine supports it (:issue:`18581`)
 
 .. _whatsnew_0220.api_breaking:
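A short illustration of the enhancement described in the entry above (a sketch assuming pyarrow >= 0.7.0; with older pyarrow this raised ValueError from the index validation removed below):

```python
import pandas as pd

# a named, non-default index: previously rejected by the pyarrow engine
df = pd.DataFrame({'a': [1, 2]},
                  index=pd.Index(['x', 'y'], name='key'))
df.to_parquet('indexed.parquet', engine='pyarrow')
```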

pandas/io/parquet.py (+41, -77)
@@ -4,6 +4,7 @@
 from distutils.version import LooseVersion
 from pandas import DataFrame, RangeIndex, Int64Index, get_option
 from pandas.compat import range
+from pandas.core.common import AbstractMethodError
 from pandas.io.common import get_filepath_or_buffer
 
 
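The newly imported AbstractMethodError is pandas' idiom for methods subclasses must override; it is conventionally constructed with the offending instance, as in this sketch (the class here is illustrative, not from the commit):

```python
from pandas.core.common import AbstractMethodError


class ExampleBase(object):

    def write(self, df, path, compression, **kwargs):
        # passing `self` lets the error message name the concrete subclass
        raise AbstractMethodError(self)
```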
@@ -39,97 +40,59 @@ class BaseImpl(object):
     api = None  # module
 
     @staticmethod
-    def _validate_index(df):
-        if not isinstance(df.index, Int64Index):
-            msg = (
-                "parquet does not support serializing {} for the index;"
-                "you can .reset_index() to make the index into column(s)"
-            )
-            raise ValueError(msg.format(type(df.index)))
-        if not df.index.equals(RangeIndex(len(df))):
-            raise ValueError(
-                "parquet does not support serializing a non-default index "
-                "for the index; you can .reset_index() to make the index "
-                "into column(s)"
-            )
-        if df.index.name is not None:
-            raise ValueError(
-                "parquet does not serialize index meta-data "
-                "on a default index"
-            )
-
-    @staticmethod
-    def _validate_columns(df):
+    def validate_dataframe(df):
+        if not isinstance(df, DataFrame):
+            raise ValueError("to_parquet only support IO with DataFrames")
         # must have value column names (strings only)
         if df.columns.inferred_type not in {'string', 'unicode'}:
             raise ValueError("parquet must have string column names")
 
-    def validate_dataframe(self, df):
-        if not isinstance(df, DataFrame):
-            raise ValueError("to_parquet only support IO with DataFrames")
-        self._validate_columns(df)
-        self._validate_index(df)
-
     def write(self, df, path, compression, **kwargs):
-        raise NotImplementedError()
+        raise AbstractMethodError()
 
     def read(self, path, columns=None, **kwargs):
-        raise NotImplementedError()
+        raise AbstractMethodError()
 
 
 class PyArrowImpl(BaseImpl):
 
     def __init__(self):
         # since pandas is a dependency of pyarrow
         # we need to import on first use
-
         try:
             import pyarrow
             import pyarrow.parquet
         except ImportError:
-            raise ImportError("pyarrow is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        if LooseVersion(pyarrow.__version__) < '0.4.1':
-            raise ImportError("pyarrow >= 0.4.1 is required for parquet"
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install pyarrow -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U pyarrow\n")
-
-        self._pyarrow_lt_050 = LooseVersion(pyarrow.__version__) < '0.5.0'
-        self._pyarrow_lt_060 = LooseVersion(pyarrow.__version__) < '0.6.0'
-        self._pyarrow_lt_070 = LooseVersion(pyarrow.__version__) < '0.7.0'
+            raise ImportError(
+                "pyarrow is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
+        if LooseVersion(pyarrow.__version__) < '0.7.0':
+            raise ImportError(
+                "pyarrow >= 0.4.1 is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install pyarrow -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U pyarrow\n"
+            )
         self.api = pyarrow
 
-    def _validate_index(self, df):
-        # pyarrow >= 0.7.0 supports multi-indexes so no need to validate
-        if self._pyarrow_lt_070:
-            super(PyArrowImpl, self)._validate_index(df)
-
     def write(self, df, path, compression='snappy',
               coerce_timestamps='ms', **kwargs):
         self.validate_dataframe(df)
         path, _, _ = get_filepath_or_buffer(path)
-        if self._pyarrow_lt_060:
-            table = self.api.Table.from_pandas(df, timestamps_to_ms=True)
-            self.api.parquet.write_table(
-                table, path, compression=compression, **kwargs)
-
-        else:
-            table = self.api.Table.from_pandas(df)
-            self.api.parquet.write_table(
-                table, path, compression=compression,
-                coerce_timestamps=coerce_timestamps, **kwargs)
+        table = self.api.Table.from_pandas(df)
+        self.api.parquet.write_table(
+            table, path, compression=compression,
+            coerce_timestamps=coerce_timestamps, **kwargs)
 
     def read(self, path, columns=None, **kwargs):
         path, _, _ = get_filepath_or_buffer(path)
-        return self.api.parquet.read_table(path, columns=columns,
-                                           **kwargs).to_pandas()
+        return self.api.parquet.read_table(
+            path, columns=columns, **kwargs).to_pandas()
 
 
 class FastParquetImpl(BaseImpl):
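With the pre-0.6.0 and pre-0.7.0 branches gone, the pyarrow write path above is unconditional: Table.from_pandas followed by parquet.write_table. Roughly equivalent standalone usage, as a sketch assuming pyarrow >= 0.7.0 (the path 'example.parquet' is arbitrary):

```python
import pandas as pd
import pyarrow
import pyarrow.parquet

df = pd.DataFrame({'a': [1, 2, 3]})

# what PyArrowImpl.write now does for every supported pyarrow version
table = pyarrow.Table.from_pandas(df)
pyarrow.parquet.write_table(table, 'example.parquet',
                            compression='snappy', coerce_timestamps='ms')

# the matching read path
roundtrip = pyarrow.parquet.read_table('example.parquet').to_pandas()
```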
@@ -140,20 +103,21 @@ def __init__(self):
         try:
             import fastparquet
         except ImportError:
-            raise ImportError("fastparquet is required for parquet support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
+            raise ImportError(
+                "fastparquet is required for parquet support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet"
+            )
         if LooseVersion(fastparquet.__version__) < '0.1.0':
-            raise ImportError("fastparquet >= 0.1.0 is required for parquet "
-                              "support\n\n"
-                              "you can install via conda\n"
-                              "conda install fastparquet -c conda-forge\n"
-                              "\nor via pip\n"
-                              "pip install -U fastparquet")
-
+            raise ImportError(
+                "fastparquet >= 0.1.0 is required for parquet "
+                "support\n\n"
+                "you can install via conda\n"
+                "conda install fastparquet -c conda-forge\n"
+                "\nor via pip\n"
+                "pip install -U fastparquet")
         self.api = fastparquet
 
     def write(self, df, path, compression='snappy', **kwargs):
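After this change, validate_dataframe is the only shared pre-flight check: the input must be a DataFrame and its column names must be strings. A sketch of what still gets rejected (hypothetical data; the error text comes from the diff above):

```python
import pandas as pd

df = pd.DataFrame({0: [1, 2], 1: [3, 4]})  # integer column names

# raises ValueError: parquet must have string column names
df.to_parquet('bad.parquet', engine='pyarrow')
```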
