7
7
8
8
import numpy as np
9
9
import pandas as pd
10
- from pandas .compat import PY3
10
+ from pandas .compat import PY3 , is_platform_windows
11
11
from pandas .io .parquet import (to_parquet , read_parquet , get_engine ,
12
12
PyArrowImpl , FastParquetImpl )
13
13
from pandas .util import testing as tm
@@ -80,16 +80,36 @@ def df_compat():
80
80
def df_cross_compat():
    """Return a three-row frame limited to the columns 'a', 'b', 'd', 'e', 'f'.

    The uint8 column ('c'), the tz-aware datetime column ('g') and the
    nanosecond-frequency datetime column ('h') are deliberately left out.
    NOTE(review): presumably those dtypes do not survive a write with one
    parquet engine and a read with the other -- confirm before re-adding.
    """
    columns = {
        'a': ['a', 'b', 'c'],
        'b': [1, 2, 3],
        'd': np.arange(4.0, 7.0, dtype='float64'),
        'e': [True, False, True],
        'f': pd.date_range('20130101', periods=3),
    }
    return pd.DataFrame(columns)
91
92
92
93
94
@pytest.fixture
def df_full():
    """Fixture: a three-row frame with one column per commonly used dtype.

    Covers strings (plain, with NaN, with None), bytes, unicode, int,
    uint8, float (plain and with NaN), bool, and datetimes (plain and
    with NaT).
    """
    columns = {
        'string': list('abc'),
        'string_with_nan': ['a', np.nan, 'c'],
        'string_with_none': ['a', None, 'c'],
        'bytes': [b'foo', b'bar', b'baz'],
        'unicode': [u'foo', u'bar', u'baz'],
        'int': list(range(1, 4)),
        'uint': np.arange(3, 6).astype('u1'),
        'float': np.arange(4.0, 7.0, dtype='float64'),
        'float_with_nan': [2., np.nan, 3.],
        'bool': [True, False, True],
        'datetime': pd.date_range('20130101', periods=3),
        'datetime_with_nat': [pd.Timestamp('20130101'),
                              pd.NaT,
                              pd.Timestamp('20130103')],
    }
    return pd.DataFrame(columns)
111
+
112
+
93
113
def test_invalid_engine (df_compat ):
94
114
95
115
with pytest .raises (ValueError ):
@@ -154,7 +174,8 @@ def test_options_get_engine(fp, pa):
154
174
assert isinstance (get_engine ('fastparquet' ), FastParquetImpl )
155
175
156
176
157
- @pytest .mark .xfail (reason = "fp does not ignore pa index __index_level_0__" )
177
+ @pytest .mark .xfail (is_platform_windows (),
178
+ reason = "reading pa metadata failing on Windows" )
158
179
def test_cross_engine_pa_fp (df_cross_compat , pa , fp ):
159
180
# cross-compat with differing reading/writing engines
160
181
@@ -165,8 +186,10 @@ def test_cross_engine_pa_fp(df_cross_compat, pa, fp):
165
186
result = read_parquet (path , engine = fp )
166
187
tm .assert_frame_equal (result , df )
167
188
189
+ result = read_parquet (path , engine = fp , columns = ['a' , 'd' ])
190
+ tm .assert_frame_equal (result , df [['a' , 'd' ]])
191
+
168
192
169
- @pytest .mark .xfail (reason = "pyarrow reading fp in some cases" )
170
193
def test_cross_engine_fp_pa (df_cross_compat , pa , fp ):
171
194
# cross-compat with differing reading/writing engines
172
195
@@ -177,6 +200,9 @@ def test_cross_engine_fp_pa(df_cross_compat, pa, fp):
177
200
result = read_parquet (path , engine = pa )
178
201
tm .assert_frame_equal (result , df )
179
202
203
+ result = read_parquet (path , engine = pa , columns = ['a' , 'd' ])
204
+ tm .assert_frame_equal (result , df [['a' , 'd' ]])
205
+
180
206
181
207
class Base (object ):
182
208
@@ -300,27 +326,31 @@ def test_read_columns(self, engine):
300
326
301
327
class TestParquetPyArrow (Base ):
302
328
303
def test_basic(self, pa, df_full):
    """Pass ``df_full`` plus pyarrow-only columns to ``check_round_trip``."""
    import pyarrow

    df = df_full

    # additional supported types for pyarrow: the tz-aware datetime column
    # is only added when the installed pyarrow is at least 0.7.0
    tz_supported = (LooseVersion(pyarrow.__version__) >=
                    LooseVersion('0.7.0'))
    if tz_supported:
        df['datetime_tz'] = pd.date_range('20130101', periods=3,
                                          tz='Europe/Brussels')
    # NOTE(review): added unconditionally, as in the previous version of
    # this test -- confirm it is not meant to sit under the version check
    df['bool_with_none'] = [True, None, True]

    self.check_round_trip(df, pa)
323
341
342
@pytest.mark.xfail(reason="pyarrow fails on this (ARROW-1883)")
def test_basic_subset_columns(self, pa, df_full):
    """Request only 'string' and 'int' back from a frame written in full."""
    # GH18628
    subset = ['string', 'int']

    df = df_full
    # additional supported types for pyarrow
    df['datetime_tz'] = pd.date_range('20130101', periods=3,
                                      tz='Europe/Brussels')

    # the expected frame is the same column subset taken from the input
    self.check_round_trip(df, pa, expected=df[subset],
                          read_kwargs={'columns': list(subset)})
353
+
324
354
def test_duplicate_columns (self , pa ):
325
355
326
356
# not currently able to handle duplicate columns
@@ -363,25 +393,12 @@ def test_categorical_unsupported(self, pa_lt_070):
363
393
364
394
class TestParquetFastParquet (Base ):
365
395
366
def test_basic(self, fp, df_full):
    """Pass ``df_full`` plus a timedelta column to ``check_round_trip``."""
    df = df_full

    # additional supported types for fastparquet
    df['timedelta'] = pd.timedelta_range('1 day', periods=3)

    # written with compression explicitly set to None
    uncompressed = {'compression': None}
    self.check_round_trip(df, fp, write_kwargs=uncompressed)
387
404
0 commit comments