@@ -90,12 +90,39 @@ def _get_path_or_handle(
 ]:
     """File handling for PyArrow."""
     path_or_handle = stringify_path(path)
+    if fs is not None:
+        pa_fs = import_optional_dependency("pyarrow.fs", errors="ignore")
+        fsspec = import_optional_dependency("fsspec", errors="ignore")
+        if pa_fs is None and fsspec is None:
+            raise ValueError(
+                f"filesystem must be a pyarrow or fsspec FileSystem, "
+                f"not a {type(fs).__name__}"
+            )
+        elif (pa_fs is None or not isinstance(fs, pa_fs.FileSystem)) and (
+            fsspec is None or not isinstance(fs, fsspec.spec.AbstractFileSystem)
+        ):
+            raise ValueError(
+                f"filesystem must be a pyarrow or fsspec FileSystem, "
+                f"not a {type(fs).__name__}"
+            )
+        elif pa_fs is not None and isinstance(fs, pa_fs.FileSystem) and storage_options:
+            raise NotImplementedError(
+                "storage_options not supported with a pyarrow FileSystem."
+            )
     if is_fsspec_url(path_or_handle) and fs is None:
-        fsspec = import_optional_dependency("fsspec")
+        if storage_options is None:
+            pa = import_optional_dependency("pyarrow")
+            pa_fs = import_optional_dependency("pyarrow.fs")
 
-        fs, path_or_handle = fsspec.core.url_to_fs(
-            path_or_handle, **(storage_options or {})
-        )
+            try:
+                fs, path_or_handle = pa_fs.FileSystem.from_uri(path)
+            except (TypeError, pa.ArrowInvalid):
+                pass
+        if fs is None:
+            fsspec = import_optional_dependency("fsspec")
+            fs, path_or_handle = fsspec.core.url_to_fs(
+                path_or_handle, **(storage_options or {})
+            )
     elif storage_options and (not is_url(path_or_handle) or mode != "rb"):
         # can't write to a remote url
         # without making use of fsspec at the moment
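The net effect of this hunk is a two-step resolution for URL-like paths: when no `storage_options` are given, `pyarrow.fs.FileSystem.from_uri` is tried first, and fsspec remains the fallback. A minimal standalone sketch of that order (the `resolve_filesystem` name is hypothetical; the real logic lives inside the private `_get_path_or_handle` above):

```python
import pyarrow as pa
import pyarrow.fs as pa_fs
import fsspec


def resolve_filesystem(path: str, storage_options: dict | None = None):
    """Try pyarrow.fs first, then fall back to fsspec (mirrors the hunk above)."""
    fs = None
    if storage_options is None:
        try:
            # pyarrow raises ArrowInvalid/TypeError for URIs it cannot handle
            fs, path = pa_fs.FileSystem.from_uri(path)
        except (TypeError, pa.ArrowInvalid):
            pass
    if fs is None:
        # storage_options are only understood by fsspec filesystems
        fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
    return fs, path
```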
@@ -173,6 +200,7 @@ def write(
         index: bool | None = None,
         storage_options: StorageOptions = None,
         partition_cols: list[str] | None = None,
+        filesystem=None,
         **kwargs,
     ) -> None:
         self.validate_dataframe(df)
@@ -183,9 +211,9 @@ def write(
 
         table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
 
-        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
+        path_or_handle, handles, filesystem = _get_path_or_handle(
             path,
-            kwargs.pop("filesystem", None),
+            filesystem,
             storage_options=storage_options,
             mode="wb",
             is_dir=partition_cols is not None,
@@ -207,12 +235,17 @@ def write(
                     path_or_handle,
                     compression=compression,
                     partition_cols=partition_cols,
+                    filesystem=filesystem,
                     **kwargs,
                 )
             else:
                 # write to single output file
                 self.api.parquet.write_table(
-                    table, path_or_handle, compression=compression, **kwargs
+                    table,
+                    path_or_handle,
+                    compression=compression,
+                    filesystem=filesystem,
+                    **kwargs,
                 )
         finally:
             if handles is not None:
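With the filesystem now threaded through as a real variable instead of a `kwargs` entry, it is forwarded explicitly to `pyarrow.parquet.write_to_dataset` / `write_table`, both of which accept a `filesystem` argument. A hedged usage sketch (the local path is a placeholder):

```python
import pandas as pd
import pyarrow.fs as pa_fs

df = pd.DataFrame({"a": [1, 2, 3]})

# With an explicit pyarrow filesystem the path is interpreted by that
# filesystem; combining it with storage_options raises NotImplementedError
# (see the validation hunk above).
df.to_parquet("/tmp/example.parquet", filesystem=pa_fs.LocalFileSystem())
```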
@@ -225,6 +258,7 @@ def read(
         use_nullable_dtypes: bool = False,
         dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
         storage_options: StorageOptions = None,
+        filesystem=None,
         **kwargs,
     ) -> DataFrame:
         kwargs["use_pandas_metadata"] = True
@@ -242,15 +276,15 @@ def read(
         if manager == "array":
             to_pandas_kwargs["split_blocks"] = True  # type: ignore[assignment]
 
-        path_or_handle, handles, kwargs["filesystem"] = _get_path_or_handle(
+        path_or_handle, handles, filesystem = _get_path_or_handle(
             path,
-            kwargs.pop("filesystem", None),
+            filesystem,
             storage_options=storage_options,
             mode="rb",
         )
         try:
             pa_table = self.api.parquet.read_table(
-                path_or_handle, columns=columns, **kwargs
+                path_or_handle, columns=columns, filesystem=filesystem, **kwargs
             )
             result = pa_table.to_pandas(**to_pandas_kwargs)
 
@@ -279,6 +313,7 @@ def write(
         index=None,
         partition_cols=None,
         storage_options: StorageOptions = None,
+        filesystem=None,
         **kwargs,
     ) -> None:
         self.validate_dataframe(df)
@@ -294,6 +329,11 @@ def write(
         if partition_cols is not None:
             kwargs["file_scheme"] = "hive"
 
+        if filesystem is not None:
+            raise NotImplementedError(
+                "filesystem is not implemented for the fastparquet engine."
+            )
+
         # cannot use get_handle as write() does not accept file buffers
         path = stringify_path(path)
         if is_fsspec_url(path):
@@ -319,7 +359,12 @@ def write(
             )
 
     def read(
-        self, path, columns=None, storage_options: StorageOptions = None, **kwargs
+        self,
+        path,
+        columns=None,
+        storage_options: StorageOptions = None,
+        filesystem=None,
+        **kwargs,
     ) -> DataFrame:
         parquet_kwargs: dict[str, Any] = {}
         use_nullable_dtypes = kwargs.pop("use_nullable_dtypes", False)
@@ -337,6 +382,10 @@ def read(
                 "The 'dtype_backend' argument is not supported for the "
                 "fastparquet engine"
             )
+        if filesystem is not None:
+            raise NotImplementedError(
+                "filesystem is not implemented for the fastparquet engine."
+            )
         path = stringify_path(path)
         handles = None
         if is_fsspec_url(path):
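Both fastparquet paths reject the keyword eagerly rather than silently dropping it, and the read-side check fires before the path is ever touched. For example (assuming fastparquet is installed; the path is a placeholder):

```python
import fsspec
import pandas as pd

try:
    pd.read_parquet(
        "example.parquet",
        engine="fastparquet",
        filesystem=fsspec.filesystem("file"),
    )
except NotImplementedError as err:
    # Raised before the file is opened, so this prints even if the
    # path does not exist:
    # "filesystem is not implemented for the fastparquet engine."
    print(err)
```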
@@ -376,6 +425,7 @@ def to_parquet(
     index: bool | None = None,
     storage_options: StorageOptions = None,
     partition_cols: list[str] | None = None,
+    filesystem: Any = None,
     **kwargs,
 ) -> bytes | None:
     """
@@ -398,6 +448,12 @@ def to_parquet(
         ``io.parquet.engine`` is used. The default ``io.parquet.engine``
         behavior is to try 'pyarrow', falling back to 'fastparquet' if
         'pyarrow' is unavailable.
+
+        When using the ``'pyarrow'`` engine and no storage options are provided
+        and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
+        (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
+        Use the filesystem keyword with an instantiated fsspec filesystem
+        if you wish to use its implementation.
     compression : {{'snappy', 'gzip', 'brotli', 'lz4', 'zstd', None}},
         default 'snappy'. Name of the compression to use. Use ``None``
         for no compression. The supported compression methods actually
@@ -420,6 +476,12 @@ def to_parquet(
 
         .. versionadded:: 1.2.0
 
+    filesystem : fsspec or pyarrow filesystem, default None
+        Filesystem object to use when writing the parquet file. Only implemented
+        for ``engine="pyarrow"``.
+
+        .. versionadded:: 2.1.0
+
     kwargs
         Additional keyword arguments passed to the engine
 
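To illustrate the documented precedence, passing an instantiated fsspec filesystem opts out of the pyarrow.fs-first resolution for schemes both libraries implement. A sketch under assumed credentials (bucket name is a placeholder; requires s3fs):

```python
import fsspec
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

# Without `filesystem=`, "s3://..." would resolve to pyarrow.fs first;
# supplying the fsspec instance forces the fsspec implementation, and the
# path is then interpreted relative to that filesystem (no scheme needed).
s3 = fsspec.filesystem("s3")
df.to_parquet("my-bucket/example.parquet", filesystem=s3)
```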
@@ -440,6 +502,7 @@ def to_parquet(
         index=index,
         partition_cols=partition_cols,
         storage_options=storage_options,
+        filesystem=filesystem,
         **kwargs,
     )
 
@@ -458,6 +521,7 @@ def read_parquet(
     storage_options: StorageOptions = None,
     use_nullable_dtypes: bool | lib.NoDefault = lib.no_default,
     dtype_backend: DtypeBackend | lib.NoDefault = lib.no_default,
+    filesystem: Any = None,
     **kwargs,
 ) -> DataFrame:
     """
@@ -480,6 +544,12 @@ def read_parquet(
         ``io.parquet.engine`` is used. The default ``io.parquet.engine``
         behavior is to try 'pyarrow', falling back to 'fastparquet' if
         'pyarrow' is unavailable.
+
+        When using the ``'pyarrow'`` engine and no storage options are provided
+        and a filesystem is implemented by both ``pyarrow.fs`` and ``fsspec``
+        (e.g. "s3://"), then the ``pyarrow.fs`` filesystem is attempted first.
+        Use the filesystem keyword with an instantiated fsspec filesystem
+        if you wish to use its implementation.
     columns : list, default=None
         If not None, only these columns will be read from the file.
 
@@ -508,6 +578,12 @@ def read_parquet(
 
         .. versionadded:: 2.0
 
+    filesystem : fsspec or pyarrow filesystem, default None
+        Filesystem object to use when reading the parquet file. Only implemented
+        for ``engine="pyarrow"``.
+
+        .. versionadded:: 2.1.0
+
     **kwargs
         Any additional kwargs are passed to the engine.
 
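On the read side the keyword mirrors the writer; a hedged sketch (bucket and key are placeholders, and `S3FileSystem` picks up credentials from the environment):

```python
import pandas as pd
import pyarrow.fs as pa_fs

# Equivalent reads: the first resolves the URI internally (pyarrow.fs is
# attempted before fsspec), the second supplies the filesystem explicitly.
df1 = pd.read_parquet("s3://my-bucket/example.parquet")
df2 = pd.read_parquet(
    "my-bucket/example.parquet", filesystem=pa_fs.S3FileSystem()
)
```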
@@ -537,5 +613,6 @@ def read_parquet(
         storage_options=storage_options,
         use_nullable_dtypes=use_nullable_dtypes,
         dtype_backend=dtype_backend,
+        filesystem=filesystem,
         **kwargs,
     )