5
5
from pandas .compat import range
6
6
7
7
8
- def _try_import_pyarrow ():
9
- # since pandas is a dependency of pyarrow
10
- # we need to import on first use
8
def get_engine(engine):
    """Return the parquet engine implementation for *engine*.

    Parameters
    ----------
    engine : str
        one of {'pyarrow', 'fastparquet'}

    Raises
    ------
    ValueError
        if *engine* is not a supported engine name
    """

    if engine == 'pyarrow':
        return PyArrowImpl()
    if engine == 'fastparquet':
        return FastParquetImpl()

    raise ValueError("engine must be one of 'pyarrow', 'fastparquet'")
18
+
19
+
20
class PyArrowImpl(object):
    """parquet implementation backed by the pyarrow library"""

    def __init__(self):
        # since pandas is a dependency of pyarrow
        # we need to import on first use
        try:
            import pyarrow  # noqa
        except ImportError:
            raise ImportError("pyarrow is required for parquet support\n\n"
                              "you can install via conda\n"
                              "conda install pyarrow -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install pyarrow\n")

    def write(self, df, path, compression=None, **kwargs):
        """Write *df* to *path* as a parquet file.

        Parameters
        ----------
        df : DataFrame
        path : string
            File path
        compression : str, optional
            forwarded to ``pyarrow.parquet.write_table``
        kwargs are passed to ``pyarrow.parquet.write_table``
        """
        import pyarrow
        from pyarrow import parquet as pq

        table = pyarrow.Table.from_pandas(df)
        pq.write_table(table, path,
                       compression=compression, **kwargs)

    def read(self, path, **kwargs):
        """Read a parquet file at *path* and return a DataFrame.

        kwargs are passed to ``pyarrow.parquet.read_table``
        (accepting no kwargs remains valid, so callers are unaffected)
        """
        import pyarrow
        return pyarrow.parquet.read_table(path, **kwargs).to_pandas()
23
46
24
- def _try_import_fastparquet ():
25
- # since pandas is a dependency of fastparquet
26
- # we need to import on first use
27
47
28
- try :
29
- import fastparquet
30
- except ImportError :
31
- raise ImportError ("fastparquet is required for parquet support\n \n "
32
- "you can install via conda\n "
33
- "conda install fastparquet -c conda-forge\n "
34
- "\n or via pip\n "
35
- "pip install fastparquet" )
48
class FastParquetImpl(object):
    """parquet implementation backed by the fastparquet library"""

    def __init__(self):
        # since pandas is a dependency of fastparquet
        # we need to import on first use
        try:
            import fastparquet  # noqa
        except ImportError:
            raise ImportError("fastparquet is required for parquet support\n\n"
                              "you can install via conda\n"
                              "conda install fastparquet -c conda-forge\n"
                              "\nor via pip\n"
                              "pip install fastparquet")

    def write(self, df, path, compression=None, **kwargs):
        """Write *df* to *path* as a parquet file.

        Parameters
        ----------
        df : DataFrame
        path : string
            File path
        compression : str, optional
            forwarded to ``fastparquet.write``
        kwargs are passed to ``fastparquet.write``
        """
        import fastparquet

        # thriftpy/protocol/compact.py:339:
        # DeprecationWarning: tostring() is deprecated.
        # Use tobytes() instead.
        with catch_warnings(record=True):
            fastparquet.write(path, df,
                              compression=compression, **kwargs)

    def read(self, path, **kwargs):
        """Read a parquet file at *path* and return a DataFrame.

        kwargs are passed to ``fastparquet.ParquetFile.to_pandas``
        (accepting no kwargs remains valid, so callers are unaffected)
        """
        import fastparquet
        pf = fastparquet.ParquetFile(path)
        return pf.to_pandas(**kwargs)
77
+
78
+
79
+ def to_parquet (df , path , engine , compression = None , ** kwargs ):
46
80
"""
47
81
Write a DataFrame to the pyarrow
48
82
@@ -55,9 +89,10 @@ def to_parquet(df, path, engine, compression=None):
55
89
supported are {'pyarrow', 'fastparquet'}
56
90
compression : str, optional
57
91
compression method, includes {'gzip', 'snappy', 'brotli'}
92
+ kwargs are passed to the engine
58
93
"""
59
94
60
- _validate_engine (engine )
95
+ impl = get_engine (engine )
61
96
62
97
if not isinstance (df , DataFrame ):
63
98
raise ValueError ("to_parquet only support IO with DataFrames" )
@@ -92,24 +127,10 @@ def to_parquet(df, path, engine, compression=None):
92
127
if df .columns .inferred_type not in valid_types :
93
128
raise ValueError ("parquet must have string column names" )
94
129
95
- if engine == 'pyarrow' :
96
- pyarrow = _try_import_pyarrow ()
97
- from pyarrow import parquet as pq
98
-
99
- table = pyarrow .Table .from_pandas (df )
100
- pq .write_table (table , path , compression = compression )
101
-
102
- elif engine == 'fastparquet' :
103
- fastparquet = _try_import_fastparquet ()
104
-
105
- # thriftpy/protocol/compact.py:339:
106
- # DeprecationWarning: tostring() is deprecated.
107
- # Use tobytes() instead.
108
- with catch_warnings (record = True ):
109
- fastparquet .write (path , df , compression = compression )
130
+ return impl .write (df , path , compression = compression )
110
131
111
132
112
- def read_parquet (path , engine ):
133
+ def read_parquet (path , engine , ** kwargs ):
113
134
"""
114
135
Load a parquet object from the file path
115
136
@@ -121,20 +142,13 @@ def read_parquet(path, engine):
121
142
File path
122
143
engine : parquet engine
123
144
supported are {'pyarrow', 'fastparquet'}
145
+ kwargs are passed to the engine
124
146
125
147
Returns
126
148
-------
127
149
type of object stored in file
128
150
129
151
"""
130
152
131
- _validate_engine (engine )
132
-
133
- if engine == 'pyarrow' :
134
- pyarrow = _try_import_pyarrow ()
135
- return pyarrow .parquet .read_table (path ).to_pandas ()
136
-
137
- elif engine == 'fastparquet' :
138
- fastparquet = _try_import_fastparquet ()
139
- pf = fastparquet .ParquetFile (path )
140
- return pf .to_pandas ()
153
+ impl = get_engine (engine )
154
+ return impl .read (path )
0 commit comments