27
27
uses_params ,
28
28
uses_relative ,
29
29
)
30
+ import warnings
30
31
import zipfile
31
32
32
33
from pandas ._typing import (
33
34
CompressionDict ,
34
35
CompressionOptions ,
36
+ EncodingVar ,
37
+ FileOrBuffer ,
35
38
FilePathOrBuffer ,
39
+ IOargs ,
40
+ ModeVar ,
36
41
StorageOptions ,
37
42
)
38
43
from pandas .compat import _get_lzma_file , _import_lzma
@@ -69,9 +74,7 @@ def is_url(url) -> bool:
69
74
return parse_url (url ).scheme in _VALID_URLS
70
75
71
76
72
- def _expand_user (
73
- filepath_or_buffer : FilePathOrBuffer [AnyStr ],
74
- ) -> FilePathOrBuffer [AnyStr ]:
77
+ def _expand_user (filepath_or_buffer : FileOrBuffer [AnyStr ]) -> FileOrBuffer [AnyStr ]:
75
78
"""
76
79
Return the argument with an initial component of ~ or ~user
77
80
replaced by that user's home directory.
@@ -101,7 +104,7 @@ def validate_header_arg(header) -> None:
101
104
102
105
def stringify_path (
103
106
filepath_or_buffer : FilePathOrBuffer [AnyStr ],
104
- ) -> FilePathOrBuffer [AnyStr ]:
107
+ ) -> FileOrBuffer [AnyStr ]:
105
108
"""
106
109
Attempt to convert a path-like object to a string.
107
110
@@ -134,9 +137,9 @@ def stringify_path(
134
137
# "__fspath__" [union-attr]
135
138
# error: Item "IO[bytes]" of "Union[str, Path, IO[bytes]]" has no
136
139
# attribute "__fspath__" [union-attr]
137
- return filepath_or_buffer .__fspath__ () # type: ignore[union-attr]
140
+ filepath_or_buffer = filepath_or_buffer .__fspath__ () # type: ignore[union-attr]
138
141
elif isinstance (filepath_or_buffer , pathlib .Path ):
139
- return str (filepath_or_buffer )
142
+ filepath_or_buffer = str (filepath_or_buffer )
140
143
return _expand_user (filepath_or_buffer )
141
144
142
145
@@ -162,13 +165,13 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool:
162
165
)
163
166
164
167
165
- def get_filepath_or_buffer (
168
+ def get_filepath_or_buffer ( # type: ignore[assignment]
166
169
filepath_or_buffer : FilePathOrBuffer ,
167
- encoding : Optional [ str ] = None ,
170
+ encoding : EncodingVar = None ,
168
171
compression : CompressionOptions = None ,
169
- mode : Optional [ str ] = None ,
172
+ mode : ModeVar = None ,
170
173
storage_options : StorageOptions = None ,
171
- ):
174
+ ) -> IOargs [ ModeVar , EncodingVar ] :
172
175
"""
173
176
If the filepath_or_buffer is a url, translate and return the buffer.
174
177
Otherwise passthrough.
@@ -191,14 +194,35 @@ def get_filepath_or_buffer(
191
194
192
195
.. versionadded:: 1.2.0
193
196
194
- Returns
195
- -------
196
- Tuple[FilePathOrBuffer, str, CompressionOptions, bool]
197
- Tuple containing the filepath or buffer, the encoding, the compression
198
- and should_close.
197
+ .. versionchanged:: 1.2.0
198
+
199
+ Returns the dataclass IOargs.
199
200
"""
200
201
filepath_or_buffer = stringify_path (filepath_or_buffer )
201
202
203
+ # bz2 and xz do not write the byte order mark for utf-16 and utf-32
204
+ # print a warning when writing such files
205
+ compression_method = infer_compression (
206
+ filepath_or_buffer , get_compression_method (compression )[0 ]
207
+ )
208
+ if (
209
+ mode
210
+ and "w" in mode
211
+ and compression_method in ["bz2" , "xz" ]
212
+ and encoding in ["utf-16" , "utf-32" ]
213
+ ):
214
+ warnings .warn (
215
+ f"{ compression } will not write the byte order mark for { encoding } " ,
216
+ UnicodeWarning ,
217
+ )
218
+
219
+ # Use binary mode when converting path-like objects to file-like objects (fsspec)
220
+ # except when text mode is explicitly requested. The original mode is returned if
221
+ # fsspec is not used.
222
+ fsspec_mode = mode or "rb"
223
+ if "t" not in fsspec_mode and "b" not in fsspec_mode :
224
+ fsspec_mode += "b"
225
+
202
226
if isinstance (filepath_or_buffer , str ) and is_url (filepath_or_buffer ):
203
227
# TODO: fsspec can also handle HTTP via requests, but leaving this unchanged
204
228
if storage_options :
@@ -212,7 +236,13 @@ def get_filepath_or_buffer(
212
236
compression = "gzip"
213
237
reader = BytesIO (req .read ())
214
238
req .close ()
215
- return reader , encoding , compression , True
239
+ return IOargs (
240
+ filepath_or_buffer = reader ,
241
+ encoding = encoding ,
242
+ compression = compression ,
243
+ should_close = True ,
244
+ mode = fsspec_mode ,
245
+ )
216
246
217
247
if is_fsspec_url (filepath_or_buffer ):
218
248
assert isinstance (
@@ -244,7 +274,7 @@ def get_filepath_or_buffer(
244
274
245
275
try :
246
276
file_obj = fsspec .open (
247
- filepath_or_buffer , mode = mode or "rb" , ** (storage_options or {})
277
+ filepath_or_buffer , mode = fsspec_mode , ** (storage_options or {})
248
278
).open ()
249
279
# GH 34626 Reads from Public Buckets without Credentials needs anon=True
250
280
except tuple (err_types_to_retry_with_anon ):
@@ -255,23 +285,41 @@ def get_filepath_or_buffer(
255
285
storage_options = dict (storage_options )
256
286
storage_options ["anon" ] = True
257
287
file_obj = fsspec .open (
258
- filepath_or_buffer , mode = mode or "rb" , ** (storage_options or {})
288
+ filepath_or_buffer , mode = fsspec_mode , ** (storage_options or {})
259
289
).open ()
260
290
261
- return file_obj , encoding , compression , True
291
+ return IOargs (
292
+ filepath_or_buffer = file_obj ,
293
+ encoding = encoding ,
294
+ compression = compression ,
295
+ should_close = True ,
296
+ mode = fsspec_mode ,
297
+ )
262
298
elif storage_options :
263
299
raise ValueError (
264
300
"storage_options passed with file object or non-fsspec file path"
265
301
)
266
302
267
303
if isinstance (filepath_or_buffer , (str , bytes , mmap .mmap )):
268
- return _expand_user (filepath_or_buffer ), None , compression , False
304
+ return IOargs (
305
+ filepath_or_buffer = _expand_user (filepath_or_buffer ),
306
+ encoding = encoding ,
307
+ compression = compression ,
308
+ should_close = False ,
309
+ mode = mode ,
310
+ )
269
311
270
312
if not is_file_like (filepath_or_buffer ):
271
313
msg = f"Invalid file path or buffer object type: { type (filepath_or_buffer )} "
272
314
raise ValueError (msg )
273
315
274
- return filepath_or_buffer , None , compression , False
316
+ return IOargs (
317
+ filepath_or_buffer = filepath_or_buffer ,
318
+ encoding = encoding ,
319
+ compression = compression ,
320
+ should_close = False ,
321
+ mode = mode ,
322
+ )
275
323
276
324
277
325
def file_path_to_url (path : str ) -> str :
@@ -452,6 +500,15 @@ def get_handle(
452
500
need_text_wrapping = (BufferedIOBase , RawIOBase , S3File )
453
501
except ImportError :
454
502
need_text_wrapping = (BufferedIOBase , RawIOBase )
503
+ # fsspec is an optional dependency. If it is available, add its file-object
504
+ # class to the list of classes that need text wrapping. If fsspec is too old and is
505
+ # needed, get_filepath_or_buffer would already have thrown an exception.
506
+ try :
507
+ from fsspec .spec import AbstractFileSystem
508
+
509
+ need_text_wrapping = (* need_text_wrapping , AbstractFileSystem )
510
+ except ImportError :
511
+ pass
455
512
456
513
handles : List [Union [IO , _MMapWrapper ]] = list ()
457
514
f = path_or_buf
@@ -583,12 +640,15 @@ def __init__(
583
640
self .archive_name = archive_name
584
641
kwargs_zip : Dict [str , Any ] = {"compression" : zipfile .ZIP_DEFLATED }
585
642
kwargs_zip .update (kwargs )
586
- super ().__init__ (file , mode , ** kwargs_zip )
643
+ super ().__init__ (file , mode , ** kwargs_zip ) # type: ignore[arg-type]
587
644
588
645
def write (self , data ):
589
646
archive_name = self .filename
590
647
if self .archive_name is not None :
591
648
archive_name = self .archive_name
649
+ if archive_name is None :
650
+ # ZipFile needs a non-empty string
651
+ archive_name = "zip"
592
652
super ().writestr (archive_name , data )
593
653
594
654
@property
0 commit comments