From 20bd678df790670d9cf9a0c0a8fb63845cc6ca30 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?=
Date: Tue, 11 Aug 2020 21:10:42 -0400
Subject: [PATCH 1/3] to_csv: infer compression before potentially converting
 to file object; get_handle: fsspec file objects need to be wrapped;
 get_filepath_or_buffer: path-like objects that are internally converted to
 file-like objects are opened in binary mode and a named tuple is returned;
 _BytesZipFile: work with filename-less objects

---
 doc/source/whatsnew/v1.2.0.rst      |  2 +
 pandas/_typing.py                   | 18 ++++++
 pandas/core/frame.py                |  3 +-
 pandas/core/generic.py              |  2 +-
 pandas/io/common.py                 | 83 ++++++++++++++++++++++----
 pandas/io/excel/_base.py            |  4 +-
 pandas/io/feather_format.py         |  4 +-
 pandas/io/formats/csvs.py           | 13 +++-
 pandas/io/json/_json.py             |  6 +-
 pandas/io/orc.py                    |  2 +-
 pandas/io/parquet.py                |  6 +-
 pandas/io/parsers.py                |  4 +-
 pandas/io/pickle.py                 |  8 +--
 pandas/io/sas/sas7bdat.py           |  2 +-
 pandas/io/sas/sas_xport.py          |  1 +
 pandas/io/sas/sasreader.py          |  2 +-
 pandas/io/stata.py                  |  4 +-
 pandas/tests/io/test_common.py      | 25 +++++++-
 pandas/tests/io/test_compression.py | 10 ++++
 pandas/tests/io/test_gcs.py         | 92 +++++++++++++++++++++--------
 20 files changed, 227 insertions(+), 64 deletions(-)

diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 55570341cf4e8..dae2f98bc0b76 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -240,6 +240,8 @@ I/O
 - In :meth:`read_csv` `float_precision='round_trip'` now handles `decimal` and `thousands` parameters (:issue:`35365`)
 - :meth:`to_pickle` and :meth:`read_pickle` were closing user-provided file objects (:issue:`35679`)
 - :meth:`to_csv` passes compression arguments for `'gzip'` always to `gzip.GzipFile` (:issue:`28103`)
+- :meth:`to_csv` did not support zip compression for binary file objects without a filename (:issue:`35058`)
+- :meth:`to_csv` and :meth:`read_csv` did not honor `compression` and `encoding` for path-like objects that are internally converted to file-like objects (:issue:`35677`, :issue:`26124`, and :issue:`32392`)

 Plotting
 ^^^^^^^^
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 1b972030ef5a5..1dc4cf33e5656 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -11,6 +11,7 @@
     Hashable,
     List,
     Mapping,
+    NamedTuple,
     Optional,
     Type,
     TypeVar,
@@ -114,3 +115,20 @@
 # compression keywords and compression
 CompressionDict = Mapping[str, Optional[Union[str, int, bool]]]
 CompressionOptions = Optional[Union[str, CompressionDict]]
+
+
+class IOargs(NamedTuple):
+    """
+    Return value of io/common.py:get_filepath_or_buffer.
+
+    Note (copy & paste from io/parsers):
+    filepath_or_buffer can be Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile]
+    though mypy handling of conditional imports is difficult.
+    See https://github.com/python/mypy/issues/1297
+    """
+
+    filepath_or_buffer: FilePathOrBuffer
+    encoding: Optional[str]
+    compression: CompressionOptions = None
+    should_close: bool = False
+    mode: Optional[str] = None
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 312d449e36022..c2e36408d1908 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2281,10 +2281,9 @@ def to_markdown(
         result = tabulate.tabulate(self, **kwargs)
         if buf is None:
             return result
-        buf, _, _, should_close = get_filepath_or_buffer(
+        buf, _, _, should_close, _ = get_filepath_or_buffer(  # type: ignore
             buf, mode=mode, storage_options=storage_options
         )
-        assert buf is not None  # Help mypy.
         assert not isinstance(buf, str)
         buf.writelines(result)
         if should_close:
             buf.close()
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 3bad2d6dd18b9..5b3c13cfe332e 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -3249,7 +3249,7 @@ def to_csv(
         formatter.save()

         if path_or_buf is None:
-            return formatter.path_or_buf.getvalue()
+            return formatter.path_or_buf.getvalue()  # type: ignore

         return None
diff --git a/pandas/io/common.py b/pandas/io/common.py
index d1305c9cabe0e..85033142238bf 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -27,12 +27,14 @@
     uses_params,
     uses_relative,
 )
+import warnings
 import zipfile

 from pandas._typing import (
     CompressionDict,
     CompressionOptions,
     FilePathOrBuffer,
+    IOargs,
     StorageOptions,
 )
 from pandas.compat import _get_lzma_file, _import_lzma
@@ -168,7 +170,7 @@ def get_filepath_or_buffer(
     compression: CompressionOptions = None,
     mode: Optional[str] = None,
     storage_options: StorageOptions = None,
-):
+) -> IOargs:
     """
     If the filepath_or_buffer is a url, translate and return the buffer.
     Otherwise passthrough.
@@ -191,14 +193,37 @@ def get_filepath_or_buffer(

         .. versionadded:: 1.2.0

-    Returns
-    -------
-    Tuple[FilePathOrBuffer, str, CompressionOptions, bool]
-        Tuple containing the filepath or buffer, the encoding, the compression
-        and should_close.
+    .. versionchanged:: 1.2.0
+
+      A named tuple is returned. In addition to the previously returned values,
+      it also returns `mode`. If a path-like object is converted to a file-like
+      object, the returned mode is binary, otherwise it is the provided `mode`.
     """
     filepath_or_buffer = stringify_path(filepath_or_buffer)

+    # bz2 and xz do not write the byte order mark for utf-16 and utf-32
+    # print a warning when writing such files
+    compression_method = infer_compression(
+        filepath_or_buffer, get_compression_method(compression)[0]
+    )
+    if (
+        mode
+        and "w" in mode
+        and compression_method in ["bz2", "xz"]
+        and encoding in ["utf-16", "utf-32"]
+    ):
+        warnings.warn(
+            f"{compression} will not write the byte order mark for {encoding}",
+            UnicodeWarning,
+        )
+
+    # Use binary mode when converting path-like objects to file-like objects (fsspec)
+    # except when text mode is explicitly requested. The original mode is returned if
+    # fsspec is not used.
+ fsspec_mode = mode or "rb" + if "t" not in fsspec_mode and "b" not in fsspec_mode: + fsspec_mode += "b" + if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged if storage_options: @@ -212,7 +237,13 @@ def get_filepath_or_buffer( compression = "gzip" reader = BytesIO(req.read()) req.close() - return reader, encoding, compression, True + return IOargs( + filepath_or_buffer=reader, + encoding=encoding, + compression=compression, + should_close=True, + mode=fsspec_mode, + ) if is_fsspec_url(filepath_or_buffer): assert isinstance( @@ -244,7 +275,7 @@ def get_filepath_or_buffer( try: file_obj = fsspec.open( - filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) + filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) ).open() # GH 34626 Reads from Public Buckets without Credentials needs anon=True except tuple(err_types_to_retry_with_anon): @@ -255,23 +286,41 @@ def get_filepath_or_buffer( storage_options = dict(storage_options) storage_options["anon"] = True file_obj = fsspec.open( - filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) + filepath_or_buffer, mode=fsspec_mode, **(storage_options or {}) ).open() - return file_obj, encoding, compression, True + return IOargs( + filepath_or_buffer=file_obj, + encoding=encoding, + compression=compression, + should_close=True, + mode=fsspec_mode, + ) elif storage_options: raise ValueError( "storage_options passed with file object or non-fsspec file path" ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): - return _expand_user(filepath_or_buffer), None, compression, False + return IOargs( + filepath_or_buffer=_expand_user(filepath_or_buffer), + encoding=None, + compression=compression, + should_close=False, + mode=mode, + ) if not is_file_like(filepath_or_buffer): msg = f"Invalid file path or buffer object type: {type(filepath_or_buffer)}" raise ValueError(msg) - return filepath_or_buffer, None, compression, False + return IOargs( + filepath_or_buffer=filepath_or_buffer, + encoding=None, + compression=compression, + should_close=False, + mode=mode, + ) def file_path_to_url(path: str) -> str: @@ -452,6 +501,13 @@ def get_handle( need_text_wrapping = (BufferedIOBase, RawIOBase, S3File) except ImportError: need_text_wrapping = (BufferedIOBase, RawIOBase) + # fsspec is an optional dependency. If it is available, add its file-object + # class to the list of classes that need text wrapping. 
+ fsspec = import_optional_dependency("fsspec", raise_on_missing=False) + if fsspec is not None: + need_text_wrapping = tuple( + list(need_text_wrapping) + [fsspec.spec.AbstractFileSystem] + ) handles: List[Union[IO, _MMapWrapper]] = list() f = path_or_buf @@ -589,6 +645,9 @@ def write(self, data): archive_name = self.filename if self.archive_name is not None: archive_name = self.archive_name + if archive_name is None: + # ZipFile needs a non-empty string + archive_name = "zip" super().writestr(archive_name, data) @property diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index ead36c95556b1..79f2ee9a060a8 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -352,9 +352,9 @@ def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer = get_filepath_or_buffer( filepath_or_buffer, storage_options=storage_options - ) + )[0] if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index fb606b5ec8aef..937eda21aee12 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -34,7 +34,7 @@ def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kw import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close = get_filepath_or_buffer( + path, _, _, should_close, _ = get_filepath_or_buffer( path, mode="wb", storage_options=storage_options ) @@ -122,7 +122,7 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close = get_filepath_or_buffer( + path, _, _, should_close, _ = get_filepath_or_buffer( path, storage_options=storage_options ) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index c462a96da7133..86b4eb9d2fc93 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -62,14 +62,22 @@ def __init__( # Extract compression mode as given, if dict compression, self.compression_args = get_compression_method(compression) + self.compression = infer_compression(path_or_buf, compression) - self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer( + ( + self.path_or_buf, + _, + _, + self.should_close, + mode, # type: ignore + ) = get_filepath_or_buffer( path_or_buf, encoding=encoding, - compression=compression, + compression=self.compression, mode=mode, storage_options=storage_options, ) + assert self.path_or_buf is not None self.sep = sep self.na_rep = na_rep self.float_format = float_format @@ -83,7 +91,6 @@ def __init__( encoding = "utf-8" self.encoding = encoding self.errors = errors - self.compression = infer_compression(self.path_or_buf, compression) if quoting is None: quoting = csvlib.QUOTE_MINIMAL diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index fe5e172655ae1..5ecdd312103a9 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -58,7 +58,7 @@ def to_json( ) if path_or_buf is not None: - path_or_buf, _, _, should_close = get_filepath_or_buffer( + path_or_buf, _, _, should_close, _ = get_filepath_or_buffer( path_or_buf, compression=compression, mode="wt", @@ -615,7 +615,7 @@ def read_json( compression_method, compression = get_compression_method(compression) compression_method = 
infer_compression(path_or_buf, compression_method) compression = dict(compression, method=compression_method) - filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( + filepath_or_buffer, _, compression, should_close, _ = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, @@ -645,7 +645,7 @@ def read_json( result = json_reader.read() if should_close: - filepath_or_buffer.close() + filepath_or_buffer.close() # type: ignore return result diff --git a/pandas/io/orc.py b/pandas/io/orc.py index b556732e4d116..0d87cbadd5225 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -50,7 +50,7 @@ def read_orc( import pyarrow.orc - path, _, _, _ = get_filepath_or_buffer(path) + path = get_filepath_or_buffer(path)[0] orc_file = pyarrow.orc.ORCFile(path) result = orc_file.read(columns=columns, **kwargs).to_pandas() return result diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 7f0eef039a1e8..4036256f3d509 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -146,7 +146,7 @@ def read( path = _expand_user(path) if not fs: - path, _, _, should_close = get_filepath_or_buffer(path) + path, _, _, should_close, _ = get_filepath_or_buffer(path) kwargs["use_pandas_metadata"] = True result = self.api.parquet.read_table( @@ -205,7 +205,7 @@ def write( raise ValueError( "storage_options passed with file object or non-fsspec file path" ) - path, _, _, _ = get_filepath_or_buffer(path) + path = get_filepath_or_buffer(path)[0] with catch_warnings(record=True): self.api.write( @@ -228,7 +228,7 @@ def read( ).open() parquet_file = self.api.ParquetFile(path, open_with=open_with) else: - path, _, _, _ = get_filepath_or_buffer(path) + path = get_filepath_or_buffer(path)[0] parquet_file = self.api.ParquetFile(path) return parquet_file.to_pandas(columns=columns, **kwargs) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 983aa56324083..c898c0ef78b15 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -432,7 +432,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] # though mypy handling of conditional imports is difficult. 
# See https://github.com/python/mypy/issues/1297 - fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + fp_or_buf, _, compression, should_close, _ = get_filepath_or_buffer( filepath_or_buffer, encoding, compression, storage_options=storage_options ) kwds["compression"] = compression @@ -462,7 +462,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): if should_close: try: - fp_or_buf.close() + fp_or_buf.close() # type: ignore except ValueError: pass diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index fc1d2e385cf72..638b034077600 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -86,7 +86,7 @@ def to_pickle( >>> import os >>> os.remove("./dummy.pkl") """ - fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + fp_or_buf, _, compression, should_close, _ = get_filepath_or_buffer( filepath_or_buffer, compression=compression, mode="wb", @@ -107,7 +107,7 @@ def to_pickle( _f.close() if should_close: try: - fp_or_buf.close() + fp_or_buf.close() # type: ignore except ValueError: pass @@ -189,7 +189,7 @@ def read_pickle( >>> import os >>> os.remove("./dummy.pkl") """ - fp_or_buf, _, compression, should_close = get_filepath_or_buffer( + fp_or_buf, _, compression, should_close, _ = get_filepath_or_buffer( filepath_or_buffer, compression=compression, storage_options=storage_options ) if not isinstance(fp_or_buf, str) and compression == "infer": @@ -224,6 +224,6 @@ def read_pickle( _f.close() if should_close: try: - fp_or_buf.close() + fp_or_buf.close() # type: ignore except ValueError: pass diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 3d9be7c15726b..4292016edf689 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -137,7 +137,7 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) + self._path_or_buf = get_filepath_or_buffer(path_or_buf)[0] if isinstance(self._path_or_buf, str): self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 6cf248b748107..5a8b20ed2742a 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -258,6 +258,7 @@ def __init__( encoding, compression, should_close, + _, ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) if isinstance(filepath_or_buffer, (str, bytes)): diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index fffdebda8c87a..8292d2d0fe8b0 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -109,7 +109,7 @@ def read_sas( else: raise ValueError("unable to infer format of SAS file") - filepath_or_buffer, _, _, should_close = get_filepath_or_buffer( + filepath_or_buffer, _, _, should_close, _ = get_filepath_or_buffer( filepath_or_buffer, encoding ) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index ec3819f1673a8..24cb92c8a26bb 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1069,7 +1069,7 @@ def __init__( self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _, should_close = get_filepath_or_buffer( + path_or_buf, encoding, _, should_close, _ = get_filepath_or_buffer( path_or_buf, storage_options=storage_options ) @@ -1979,7 +1979,7 @@ def _open_file_binary_write( compression_typ, compression_args = get_compression_method(compression) compression_typ = 
infer_compression(fname, compression_typ) compression = dict(compression_args, method=compression_typ) - path_or_buf, _, compression, _ = get_filepath_or_buffer( + path_or_buf, _, compression, _, _ = get_filepath_or_buffer( fname, mode="wb", compression=compression, storage_options=storage_options, ) f, _ = get_handle(path_or_buf, "wb", compression=compression, is_text=False) diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index 5ce2233bc0cd0..e4d5db444e186 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -107,7 +107,9 @@ def test_infer_compression_from_path(self, extension, expected, path_type): def test_get_filepath_or_buffer_with_path(self): filename = "~/sometest" - filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer(filename) + filepath_or_buffer, _, _, should_close, _ = icom.get_filepath_or_buffer( + filename + ) assert filepath_or_buffer != filename assert os.path.isabs(filepath_or_buffer) assert os.path.expanduser(filename) == filepath_or_buffer @@ -115,7 +117,7 @@ def test_get_filepath_or_buffer_with_path(self): def test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() - filepath_or_buffer, _, _, should_close = icom.get_filepath_or_buffer( + filepath_or_buffer, _, _, should_close, _ = icom.get_filepath_or_buffer( input_buffer ) assert filepath_or_buffer == input_buffer @@ -389,6 +391,25 @@ def test_binary_mode(self): df.to_csv(path, mode="w+b") tm.assert_frame_equal(df, pd.read_csv(path, index_col=0)) + @pytest.mark.parametrize("encoding", ["utf-16", "utf-32"]) + @pytest.mark.parametrize("compression_", ["bz2", "xz"]) + def test_warning_missing_utf_bom(self, encoding, compression_): + """ + bz2 and xz do not write the byte order mark (BOM) for utf-16/32. 
+ + https://stackoverflow.com/questions/55171439 + + GH 35681 + """ + df = tm.makeDataFrame() + with tm.ensure_clean() as path: + with tm.assert_produces_warning(UnicodeWarning): + df.to_csv(path, compression=compression_, encoding=encoding) + + # reading should fail (otherwise we wouldn't need the warning) + with pytest.raises(Exception): + pd.read_csv(path, compression=compression_, encoding=encoding) + def test_is_fsspec_url(): assert icom.is_fsspec_url("gcs://pandas/somethingelse.com") diff --git a/pandas/tests/io/test_compression.py b/pandas/tests/io/test_compression.py index bc14b485f75e5..31e9ad4cf4416 100644 --- a/pandas/tests/io/test_compression.py +++ b/pandas/tests/io/test_compression.py @@ -124,6 +124,8 @@ def test_compression_binary(compression_only): GH22555 """ df = tm.makeDataFrame() + + # with a file with tm.ensure_clean() as path: with open(path, mode="wb") as file: df.to_csv(file, mode="wb", compression=compression_only) @@ -132,6 +134,14 @@ def test_compression_binary(compression_only): df, pd.read_csv(path, index_col=0, compression=compression_only) ) + # with BytesIO + file = io.BytesIO() + df.to_csv(file, mode="wb", compression=compression_only) + file.seek(0) # file shouldn't be closed + tm.assert_frame_equal( + df, pd.read_csv(file, index_col=0, compression=compression_only) + ) + def test_gzip_reproducibility_file_name(): """ diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index eacf4fa08545d..18b5743a3375a 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -9,12 +9,32 @@ from pandas.util import _test_decorators as td -@td.skip_if_no("gcsfs") -def test_read_csv_gcs(monkeypatch): +@pytest.fixture +def gcs_buffer(monkeypatch): + """Emulate GCS using a binary buffer.""" from fsspec import AbstractFileSystem, registry registry.target.clear() # noqa # remove state + gcs_buffer = BytesIO() + gcs_buffer.close = lambda: True + + class MockGCSFileSystem(AbstractFileSystem): + def open(*args, **kwargs): + gcs_buffer.seek(0) + return gcs_buffer + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + + return gcs_buffer + + +@td.skip_if_no("gcsfs") +def test_read_csv_gcs(gcs_buffer): + from fsspec import registry + + registry.target.clear() # noqa # remove state + df1 = DataFrame( { "int": [1, 3], @@ -24,21 +44,19 @@ def test_read_csv_gcs(monkeypatch): } ) - class MockGCSFileSystem(AbstractFileSystem): - def open(*args, **kwargs): - return BytesIO(df1.to_csv(index=False).encode()) + gcs_buffer.write(df1.to_csv(index=False).encode()) - monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) tm.assert_frame_equal(df1, df2) @td.skip_if_no("gcsfs") -def test_to_csv_gcs(monkeypatch): - from fsspec import AbstractFileSystem, registry +def test_to_csv_gcs(gcs_buffer): + from fsspec import registry registry.target.clear() # noqa # remove state + df1 = DataFrame( { "int": [1, 3], @@ -47,29 +65,57 @@ def test_to_csv_gcs(monkeypatch): "dt": date_range("2018-06-18", periods=2), } ) - s = BytesIO() - s.close = lambda: True - - class MockGCSFileSystem(AbstractFileSystem): - def open(*args, **kwargs): - s.seek(0) - return s - monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) df1.to_csv("gs://test/test.csv", index=True) - def mock_get_filepath_or_buffer(*args, **kwargs): - return BytesIO(df1.to_csv(index=True).encode()), None, None, False - - monkeypatch.setattr( - "pandas.io.common.get_filepath_or_buffer", mock_get_filepath_or_buffer - ) - df2 = 
read_csv("gs://test/test.csv", parse_dates=["dt"], index_col=0) tm.assert_frame_equal(df1, df2) +@td.skip_if_no("gcsfs") +@pytest.mark.parametrize("encoding", ["utf-8", "cp1251"]) +def test_to_csv_compression_encoding_gcs(gcs_buffer, compression_only, encoding): + """ + Compression and encoding should with GCS. + + GH 35677 (to_csv, compression), GH 26124 (to_csv, encoding), and + GH 32392 (read_csv, encoding) + """ + from fsspec import registry + + registry.target.clear() # noqa # remove state + df = tm.makeDataFrame() + + # reference of compressed and encoded file + compression = {"method": compression_only} + if compression_only == "gzip": + compression["mtime"] = 1 # be reproducible + buffer = BytesIO() + df.to_csv(buffer, compression=compression, encoding=encoding, mode="wb") + + # write compressed file with explicit compression + path_gcs = "gs://test/test.csv" + df.to_csv(path_gcs, compression=compression, encoding=encoding) + assert gcs_buffer.getvalue() == buffer.getvalue() + read_df = read_csv( + path_gcs, index_col=0, compression=compression_only, encoding=encoding + ) + tm.assert_frame_equal(df, read_df) + + # write compressed file with implicit compression + if compression_only == "gzip": + compression_only = "gz" + compression["method"] = "infer" + path_gcs += f".{compression_only}" + df.to_csv( + path_gcs, compression=compression, encoding=encoding, + ) + assert gcs_buffer.getvalue() == buffer.getvalue() + read_df = read_csv(path_gcs, index_col=0, compression="infer", encoding=encoding) + tm.assert_frame_equal(df, read_df) + + @td.skip_if_no("fastparquet") @td.skip_if_no("gcsfs") def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): From 935fc4bd676bcb38d003352c9ccb418dad8f1ea9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 25 Aug 2020 15:56:53 -0400 Subject: [PATCH 2/3] bind input type of encding and mode with the returned type; removed ignore statements (mypy will compile about filepath_or_buffer) --- pandas/_typing.py | 18 +++++++++++++----- pandas/core/frame.py | 2 +- pandas/core/generic.py | 2 +- pandas/io/common.py | 9 ++++++--- pandas/io/formats/csvs.py | 8 +------- pandas/io/json/_json.py | 2 +- pandas/io/parsers.py | 2 +- pandas/io/pickle.py | 2 +- 8 files changed, 25 insertions(+), 20 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 1dc4cf33e5656..859e959ceb756 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,3 +1,4 @@ +from dataclasses import dataclass from datetime import datetime, timedelta, tzinfo from pathlib import Path from typing import ( @@ -8,6 +9,7 @@ Callable, Collection, Dict, + Generic, Hashable, List, Mapping, @@ -117,7 +119,13 @@ CompressionOptions = Optional[Union[str, CompressionDict]] -class IOargs(NamedTuple): +# lets us bind types +ModeVar = TypeVar("ModeVar", str, None) +EncodingVar = TypeVar("EncodingVar", str, None) + + +@dataclass +class IOargs(Generic[ModeVar, EncodingVar]): """ Return value of io/common.py:get_filepath_or_buffer. 
@@ -128,7 +136,7 @@ class IOargs(NamedTuple): """ filepath_or_buffer: FilePathOrBuffer - encoding: Optional[str] - compression: CompressionOptions = None - should_close: bool = False - mode: Optional[str] = None + encoding: EncodingVar + compression: CompressionOptions + should_close: bool + mode: Union[ModeVar, str] diff --git a/pandas/core/frame.py b/pandas/core/frame.py index c2e36408d1908..9ac6d8620874c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2281,7 +2281,7 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, should_close, _ = get_filepath_or_buffer( # type: ignore + buf, _, _, should_close, _ = get_filepath_or_buffer( buf, mode=mode, storage_options=storage_options ) assert not isinstance(buf, str) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 5b3c13cfe332e..3bad2d6dd18b9 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3249,7 +3249,7 @@ def to_csv( formatter.save() if path_or_buf is None: - return formatter.path_or_buf.getvalue() # type: ignore + return formatter.path_or_buf.getvalue() return None diff --git a/pandas/io/common.py b/pandas/io/common.py index 85033142238bf..8905bb5bd1ec3 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -33,8 +33,11 @@ from pandas._typing import ( CompressionDict, CompressionOptions, + EncodingVar, FilePathOrBuffer, + FilePathOrBufferVar, IOargs, + ModeVar, StorageOptions, ) from pandas.compat import _get_lzma_file, _import_lzma @@ -166,11 +169,11 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, - encoding: Optional[str] = None, + encoding: EncodingVar = None, # type: ignore[assignment] compression: CompressionOptions = None, - mode: Optional[str] = None, + mode: ModeVar = None, # type: ignore[assignment] storage_options: StorageOptions = None, -) -> IOargs: +) -> IOargs[ModeVar, EncodingVar]: """ If the filepath_or_buffer is a url, translate and return the buffer. Otherwise passthrough. 
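
The `Generic[ModeVar, EncodingVar]` dataclass above is the core of this commit: the constrained TypeVars bind the types of the caller's `encoding` and `mode` arguments to the fields of the returned `IOargs`, which is what allows the scattered `# type: ignore` comments to be dropped. A minimal, self-contained sketch of the pattern follows; the names `M`, `Result`, and `open_args` are illustrative only and are not part of the patch:

    from dataclasses import dataclass
    from typing import Generic, Optional, TypeVar

    # Constrained like ModeVar/EncodingVar in pandas/_typing.py: passing
    # mode="wb" binds M to str, so the returned field is typed str rather
    # than Optional[str]; omitting mode binds M to None.
    M = TypeVar("M", str, None, Optional[str])


    @dataclass
    class Result(Generic[M]):
        mode: M


    def open_args(mode: M = None) -> Result[M]:  # type: ignore[assignment]
        # The ignore mirrors the patch: mypy rejects a None default for a
        # constrained TypeVar parameter, although it is fine at runtime.
        return Result(mode=mode)


    binary = open_args("wb")  # intended: mypy sees Result[str], so .mode is str
    unset = open_args()       # intended: mypy sees Result[None], so .mode is None

Presumably this is also why the third commit widens the constraints to include `Optional[str]`: a caller that forwards an `Optional[str]` argument needs a third binding.
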
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 86b4eb9d2fc93..0081794def4d0 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -64,13 +64,7 @@ def __init__( compression, self.compression_args = get_compression_method(compression) self.compression = infer_compression(path_or_buf, compression) - ( - self.path_or_buf, - _, - _, - self.should_close, - mode, # type: ignore - ) = get_filepath_or_buffer( + (self.path_or_buf, _, _, self.should_close, mode,) = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=self.compression, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 5ecdd312103a9..5c7f7dcb4819a 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -645,7 +645,7 @@ def read_json( result = json_reader.read() if should_close: - filepath_or_buffer.close() # type: ignore + filepath_or_buffer.close() return result diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c898c0ef78b15..bbdc01e22213d 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -462,7 +462,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): if should_close: try: - fp_or_buf.close() # type: ignore + fp_or_buf.close() except ValueError: pass diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 638b034077600..06e371fa1a011 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -107,7 +107,7 @@ def to_pickle( _f.close() if should_close: try: - fp_or_buf.close() # type: ignore + fp_or_buf.close() except ValueError: pass From 475e8e8595370085d64a8c5e10a1f020f19f7ec7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Torsten=20W=C3=B6rtwein?= Date: Tue, 25 Aug 2020 17:07:34 -0400 Subject: [PATCH 3/3] use named tuple; remove some unused variables; closed some file handles; refine type for filepath_or_buffer --- pandas/_typing.py | 13 ++++++----- pandas/core/frame.py | 12 ++++------ pandas/core/generic.py | 2 ++ pandas/io/common.py | 42 ++++++++++++++++------------------ pandas/io/excel/_base.py | 2 +- pandas/io/feather_format.py | 23 +++++++++++-------- pandas/io/formats/csvs.py | 8 ++++--- pandas/io/json/_json.py | 19 +++++++++------ pandas/io/orc.py | 4 ++-- pandas/io/parquet.py | 14 +++++++----- pandas/io/parsers.py | 11 +++++---- pandas/io/pickle.py | 28 +++++++++++++++-------- pandas/io/sas/sas7bdat.py | 2 +- pandas/io/sas/sas_xport.py | 10 +++----- pandas/io/sas/sasreader.py | 16 ++++++++----- pandas/io/stata.py | 15 ++++++++---- pandas/tests/io/test_common.py | 26 ++++++++++----------- 17 files changed, 135 insertions(+), 112 deletions(-) diff --git a/pandas/_typing.py b/pandas/_typing.py index 859e959ceb756..f8af92e07c674 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -1,5 +1,6 @@ from dataclasses import dataclass from datetime import datetime, timedelta, tzinfo +from io import IOBase from pathlib import Path from typing import ( IO, @@ -13,7 +14,6 @@ Hashable, List, Mapping, - NamedTuple, Optional, Type, TypeVar, @@ -65,7 +65,8 @@ "ExtensionDtype", str, np.dtype, Type[Union[str, float, int, complex, bool]] ] DtypeObj = Union[np.dtype, "ExtensionDtype"] -FilePathOrBuffer = Union[str, Path, IO[AnyStr]] +FilePathOrBuffer = Union[str, Path, IO[AnyStr], IOBase] +FileOrBuffer = Union[str, IO[AnyStr], IOBase] # FrameOrSeriesUnion means either a DataFrame or a Series. E.g. 
# `def func(a: FrameOrSeriesUnion) -> FrameOrSeriesUnion: ...` means that if a Series @@ -119,9 +120,9 @@ CompressionOptions = Optional[Union[str, CompressionDict]] -# lets us bind types -ModeVar = TypeVar("ModeVar", str, None) -EncodingVar = TypeVar("EncodingVar", str, None) +# let's bind types +ModeVar = TypeVar("ModeVar", str, None, Optional[str]) +EncodingVar = TypeVar("EncodingVar", str, None, Optional[str]) @dataclass @@ -135,7 +136,7 @@ class IOargs(Generic[ModeVar, EncodingVar]): See https://github.com/python/mypy/issues/1297 """ - filepath_or_buffer: FilePathOrBuffer + filepath_or_buffer: FileOrBuffer encoding: EncodingVar compression: CompressionOptions should_close: bool diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 9ac6d8620874c..eaa27d3f2a857 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2281,13 +2281,11 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, should_close, _ = get_filepath_or_buffer( - buf, mode=mode, storage_options=storage_options - ) - assert not isinstance(buf, str) - buf.writelines(result) - if should_close: - buf.close() + ioargs = get_filepath_or_buffer(buf, mode=mode, storage_options=storage_options) + assert not isinstance(ioargs.filepath_or_buffer, str) + ioargs.filepath_or_buffer.writelines(result) + if ioargs.should_close: + ioargs.filepath_or_buffer.close() return None @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3bad2d6dd18b9..94eef26e57592 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2,6 +2,7 @@ from datetime import timedelta import functools import gc +from io import StringIO import json import operator import pickle @@ -3249,6 +3250,7 @@ def to_csv( formatter.save() if path_or_buf is None: + assert isinstance(formatter.path_or_buf, StringIO) return formatter.path_or_buf.getvalue() return None diff --git a/pandas/io/common.py b/pandas/io/common.py index 8905bb5bd1ec3..97dbc7f1031a2 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -34,8 +34,8 @@ CompressionDict, CompressionOptions, EncodingVar, + FileOrBuffer, FilePathOrBuffer, - FilePathOrBufferVar, IOargs, ModeVar, StorageOptions, @@ -74,9 +74,7 @@ def is_url(url) -> bool: return parse_url(url).scheme in _VALID_URLS -def _expand_user( - filepath_or_buffer: FilePathOrBuffer[AnyStr], -) -> FilePathOrBuffer[AnyStr]: +def _expand_user(filepath_or_buffer: FileOrBuffer[AnyStr]) -> FileOrBuffer[AnyStr]: """ Return the argument with an initial component of ~ or ~user replaced by that user's home directory. @@ -106,7 +104,7 @@ def validate_header_arg(header) -> None: def stringify_path( filepath_or_buffer: FilePathOrBuffer[AnyStr], -) -> FilePathOrBuffer[AnyStr]: +) -> FileOrBuffer[AnyStr]: """ Attempt to convert a path-like object to a string. 
@@ -139,9 +137,9 @@ def stringify_path(
     # "__fspath__"  [union-attr]
     # error: Item "IO[bytes]" of "Union[str, Path, IO[bytes]]" has no
     # attribute "__fspath__"  [union-attr]
-        return filepath_or_buffer.__fspath__()  # type: ignore[union-attr]
+        filepath_or_buffer = filepath_or_buffer.__fspath__()  # type: ignore[union-attr]
     elif isinstance(filepath_or_buffer, pathlib.Path):
-        return str(filepath_or_buffer)
+        filepath_or_buffer = str(filepath_or_buffer)
     return _expand_user(filepath_or_buffer)
@@ -167,11 +165,11 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool:
     )

-def get_filepath_or_buffer(
+def get_filepath_or_buffer(  # type: ignore[assignment]
     filepath_or_buffer: FilePathOrBuffer,
-    encoding: EncodingVar = None,  # type: ignore[assignment]
+    encoding: EncodingVar = None,
     compression: CompressionOptions = None,
-    mode: ModeVar = None,  # type: ignore[assignment]
+    mode: ModeVar = None,
     storage_options: StorageOptions = None,
 ) -> IOargs[ModeVar, EncodingVar]:
     """
@@ -198,9 +196,7 @@ def get_filepath_or_buffer(
     .. versionchanged:: 1.2.0

-      A named tuple is returned. In addition to the previously returned values,
-      it also returns `mode`. If a path-like object is converted to a file-like
-      object, the returned mode is binary, otherwise it is the provided `mode`.
+        Returns the dataclass IOargs.
     """
     filepath_or_buffer = stringify_path(filepath_or_buffer)
@@ -307,7 +303,7 @@ def get_filepath_or_buffer(
     if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
         return IOargs(
             filepath_or_buffer=_expand_user(filepath_or_buffer),
-            encoding=None,
+            encoding=encoding,
             compression=compression,
             should_close=False,
             mode=mode,
         )
@@ -319,7 +315,7 @@ def get_filepath_or_buffer(

     return IOargs(
         filepath_or_buffer=filepath_or_buffer,
-        encoding=None,
+        encoding=encoding,
         compression=compression,
         should_close=False,
         mode=mode,
     )
@@ -505,12 +501,14 @@ def get_handle(
     except ImportError:
         need_text_wrapping = (BufferedIOBase, RawIOBase)
     # fsspec is an optional dependency. If it is available, add its file-object
-    # class to the list of classes that need text wrapping.
-    fsspec = import_optional_dependency("fsspec", raise_on_missing=False)
-    if fsspec is not None:
-        need_text_wrapping = tuple(
-            list(need_text_wrapping) + [fsspec.spec.AbstractFileSystem]
-        )
+    # class to the list of classes that need text wrapping. If fsspec is too old and is
+    # needed, get_filepath_or_buffer would already have thrown an exception.
+ try: + from fsspec.spec import AbstractFileSystem + + need_text_wrapping = (*need_text_wrapping, AbstractFileSystem) + except ImportError: + pass handles: List[Union[IO, _MMapWrapper]] = list() f = path_or_buf @@ -642,7 +640,7 @@ def __init__( self.archive_name = archive_name kwargs_zip: Dict[str, Any] = {"compression": zipfile.ZIP_DEFLATED} kwargs_zip.update(kwargs) - super().__init__(file, mode, **kwargs_zip) + super().__init__(file, mode, **kwargs_zip) # type: ignore[arg-type] def write(self, data): archive_name = self.filename diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 79f2ee9a060a8..9bc1d7fedcb31 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -354,7 +354,7 @@ def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): filepath_or_buffer = get_filepath_or_buffer( filepath_or_buffer, storage_options=storage_options - )[0] + ).filepath_or_buffer if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 937eda21aee12..a98eebe1c6a2a 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -34,9 +34,7 @@ def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kw import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close, _ = get_filepath_or_buffer( - path, mode="wb", storage_options=storage_options - ) + ioargs = get_filepath_or_buffer(path, mode="wb", storage_options=storage_options) if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -74,7 +72,11 @@ def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kw if df.columns.inferred_type not in valid_types: raise ValueError("feather must have string column names") - feather.write_feather(df, path, **kwargs) + feather.write_feather(df, ioargs.filepath_or_buffer, **kwargs) + + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) + ioargs.filepath_or_buffer.close() def read_feather( @@ -122,14 +124,15 @@ def read_feather( import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close, _ = get_filepath_or_buffer( - path, storage_options=storage_options - ) + ioargs = get_filepath_or_buffer(path, storage_options=storage_options) - df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) + df = feather.read_feather( + ioargs.filepath_or_buffer, columns=columns, use_threads=bool(use_threads) + ) # s3fs only validates the credentials when the file is closed. 
- if should_close: - path.close() + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) + ioargs.filepath_or_buffer.close() return df diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 0081794def4d0..270caec022fef 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -64,14 +64,17 @@ def __init__( compression, self.compression_args = get_compression_method(compression) self.compression = infer_compression(path_or_buf, compression) - (self.path_or_buf, _, _, self.should_close, mode,) = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=self.compression, mode=mode, storage_options=storage_options, ) - assert self.path_or_buf is not None + self.path_or_buf = ioargs.filepath_or_buffer + self.should_close = ioargs.should_close + self.mode = ioargs.mode + self.sep = sep self.na_rep = na_rep self.float_format = float_format @@ -80,7 +83,6 @@ def __init__( self.header = header self.index = index self.index_label = index_label - self.mode = mode if encoding is None: encoding = "utf-8" self.encoding = encoding diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 5c7f7dcb4819a..7a3b76ff7e3d0 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -58,12 +58,14 @@ def to_json( ) if path_or_buf is not None: - path_or_buf, _, _, should_close, _ = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( path_or_buf, compression=compression, mode="wt", storage_options=storage_options, ) + path_or_buf = ioargs.filepath_or_buffer + should_close = ioargs.should_close if lines and orient != "records": raise ValueError("'lines' keyword only valid when 'orient' is records") @@ -102,6 +104,8 @@ def to_json( fh.write(s) finally: fh.close() + for handle in handles: + handle.close() elif path_or_buf is None: return s else: @@ -615,7 +619,7 @@ def read_json( compression_method, compression = get_compression_method(compression) compression_method = infer_compression(path_or_buf, compression_method) compression = dict(compression, method=compression_method) - filepath_or_buffer, _, compression, should_close, _ = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( path_or_buf, encoding=encoding, compression=compression, @@ -623,7 +627,7 @@ def read_json( ) json_reader = JsonReader( - filepath_or_buffer, + ioargs.filepath_or_buffer, orient=orient, typ=typ, dtype=dtype, @@ -633,10 +637,10 @@ def read_json( numpy=numpy, precise_float=precise_float, date_unit=date_unit, - encoding=encoding, + encoding=ioargs.encoding, lines=lines, chunksize=chunksize, - compression=compression, + compression=ioargs.compression, nrows=nrows, ) @@ -644,8 +648,9 @@ def read_json( return json_reader result = json_reader.read() - if should_close: - filepath_or_buffer.close() + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) + ioargs.filepath_or_buffer.close() return result diff --git a/pandas/io/orc.py b/pandas/io/orc.py index 0d87cbadd5225..f1b1aa6a43cb5 100644 --- a/pandas/io/orc.py +++ b/pandas/io/orc.py @@ -50,7 +50,7 @@ def read_orc( import pyarrow.orc - path = get_filepath_or_buffer(path)[0] - orc_file = pyarrow.orc.ORCFile(path) + ioargs = get_filepath_or_buffer(path) + orc_file = pyarrow.orc.ORCFile(ioargs.filepath_or_buffer) result = orc_file.read(columns=columns, **kwargs).to_pandas() return result diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 4036256f3d509..e5d6ac006e251 100644 --- a/pandas/io/parquet.py +++ 
b/pandas/io/parquet.py @@ -9,7 +9,7 @@ from pandas import DataFrame, get_option -from pandas.io.common import _expand_user, get_filepath_or_buffer, is_fsspec_url +from pandas.io.common import get_filepath_or_buffer, is_fsspec_url, stringify_path def get_engine(engine: str) -> "BaseImpl": @@ -113,7 +113,7 @@ def write( raise ValueError( "storage_options passed with file object or non-fsspec file path" ) - path = _expand_user(path) + path = stringify_path(path) if partition_cols is not None: # writes to multiple files under the given path self.api.parquet.write_to_dataset( @@ -143,10 +143,12 @@ def read( ) fs = kwargs.pop("filesystem", None) should_close = False - path = _expand_user(path) + path = stringify_path(path) if not fs: - path, _, _, should_close, _ = get_filepath_or_buffer(path) + ioargs = get_filepath_or_buffer(path) + path = ioargs.filepath_or_buffer + should_close = ioargs.should_close kwargs["use_pandas_metadata"] = True result = self.api.parquet.read_table( @@ -205,7 +207,7 @@ def write( raise ValueError( "storage_options passed with file object or non-fsspec file path" ) - path = get_filepath_or_buffer(path)[0] + path = get_filepath_or_buffer(path).filepath_or_buffer with catch_warnings(record=True): self.api.write( @@ -228,7 +230,7 @@ def read( ).open() parquet_file = self.api.ParquetFile(path, open_with=open_with) else: - path = get_filepath_or_buffer(path)[0] + path = get_filepath_or_buffer(path).filepath_or_buffer parquet_file = self.api.ParquetFile(path) return parquet_file.to_pandas(columns=columns, **kwargs) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index bbdc01e22213d..a917bff9d7ca7 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -432,10 +432,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # Union[FilePathOrBuffer, s3fs.S3File, gcsfs.GCSFile] # though mypy handling of conditional imports is difficult. # See https://github.com/python/mypy/issues/1297 - fp_or_buf, _, compression, should_close, _ = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( filepath_or_buffer, encoding, compression, storage_options=storage_options ) - kwds["compression"] = compression + kwds["compression"] = ioargs.compression if kwds.get("date_parser", None) is not None: if isinstance(kwds["parse_dates"], bool): @@ -450,7 +450,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): _validate_names(kwds.get("names", None)) # Create the parser. 
- parser = TextFileReader(fp_or_buf, **kwds) + parser = TextFileReader(ioargs.filepath_or_buffer, **kwds) if chunksize or iterator: return parser @@ -460,9 +460,10 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): finally: parser.close() - if should_close: + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) try: - fp_or_buf.close() + ioargs.filepath_or_buffer.close() except ValueError: pass diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 06e371fa1a011..857a2d1b69be4 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -86,15 +86,18 @@ def to_pickle( >>> import os >>> os.remove("./dummy.pkl") """ - fp_or_buf, _, compression, should_close, _ = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( filepath_or_buffer, compression=compression, mode="wb", storage_options=storage_options, ) - if not isinstance(fp_or_buf, str) and compression == "infer": + compression = ioargs.compression + if not isinstance(ioargs.filepath_or_buffer, str) and compression == "infer": compression = None - f, fh = get_handle(fp_or_buf, "wb", compression=compression, is_text=False) + f, fh = get_handle( + ioargs.filepath_or_buffer, "wb", compression=compression, is_text=False + ) if protocol < 0: protocol = pickle.HIGHEST_PROTOCOL try: @@ -105,9 +108,10 @@ def to_pickle( f.close() for _f in fh: _f.close() - if should_close: + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) try: - fp_or_buf.close() + ioargs.filepath_or_buffer.close() except ValueError: pass @@ -189,12 +193,15 @@ def read_pickle( >>> import os >>> os.remove("./dummy.pkl") """ - fp_or_buf, _, compression, should_close, _ = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( filepath_or_buffer, compression=compression, storage_options=storage_options ) - if not isinstance(fp_or_buf, str) and compression == "infer": + compression = ioargs.compression + if not isinstance(ioargs.filepath_or_buffer, str) and compression == "infer": compression = None - f, fh = get_handle(fp_or_buf, "rb", compression=compression, is_text=False) + f, fh = get_handle( + ioargs.filepath_or_buffer, "rb", compression=compression, is_text=False + ) # 1) try standard library Pickle # 2) try pickle_compat (older pandas version) to handle subclass changes @@ -222,8 +229,9 @@ def read_pickle( f.close() for _f in fh: _f.close() - if should_close: + if ioargs.should_close: + assert not isinstance(ioargs.filepath_or_buffer, str) try: - fp_or_buf.close() # type: ignore + ioargs.filepath_or_buffer.close() except ValueError: pass diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 4292016edf689..76dac39d1889f 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -137,7 +137,7 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf = get_filepath_or_buffer(path_or_buf)[0] + self._path_or_buf = get_filepath_or_buffer(path_or_buf).filepath_or_buffer if isinstance(self._path_or_buf, str): self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 5a8b20ed2742a..e4d9324ce5130 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -253,13 +253,9 @@ def __init__( self._chunksize = chunksize if isinstance(filepath_or_buffer, str): - ( - filepath_or_buffer, - encoding, - compression, - should_close, - _, - ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) + 
filepath_or_buffer = get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding + ).filepath_or_buffer if isinstance(filepath_or_buffer, (str, bytes)): self.filepath_or_buffer = open(filepath_or_buffer, "rb") diff --git a/pandas/io/sas/sasreader.py b/pandas/io/sas/sasreader.py index 8292d2d0fe8b0..ae9457a8e3147 100644 --- a/pandas/io/sas/sasreader.py +++ b/pandas/io/sas/sasreader.py @@ -109,22 +109,26 @@ def read_sas( else: raise ValueError("unable to infer format of SAS file") - filepath_or_buffer, _, _, should_close, _ = get_filepath_or_buffer( - filepath_or_buffer, encoding - ) + ioargs = get_filepath_or_buffer(filepath_or_buffer, encoding) reader: ReaderBase if format.lower() == "xport": from pandas.io.sas.sas_xport import XportReader reader = XportReader( - filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize + ioargs.filepath_or_buffer, + index=index, + encoding=ioargs.encoding, + chunksize=chunksize, ) elif format.lower() == "sas7bdat": from pandas.io.sas.sas7bdat import SAS7BDATReader reader = SAS7BDATReader( - filepath_or_buffer, index=index, encoding=encoding, chunksize=chunksize + ioargs.filepath_or_buffer, + index=index, + encoding=ioargs.encoding, + chunksize=chunksize, ) else: raise ValueError("unknown SAS format") @@ -134,6 +138,6 @@ def read_sas( data = reader.read() - if should_close: + if ioargs.should_close: reader.close() return data diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 24cb92c8a26bb..0074ebc4decb0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1069,9 +1069,9 @@ def __init__( self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _, should_close, _ = get_filepath_or_buffer( + path_or_buf = get_filepath_or_buffer( path_or_buf, storage_options=storage_options - ) + ).filepath_or_buffer if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") @@ -1979,11 +1979,16 @@ def _open_file_binary_write( compression_typ, compression_args = get_compression_method(compression) compression_typ = infer_compression(fname, compression_typ) compression = dict(compression_args, method=compression_typ) - path_or_buf, _, compression, _, _ = get_filepath_or_buffer( + ioargs = get_filepath_or_buffer( fname, mode="wb", compression=compression, storage_options=storage_options, ) - f, _ = get_handle(path_or_buf, "wb", compression=compression, is_text=False) - return f, True, compression + f, _ = get_handle( + ioargs.filepath_or_buffer, + "wb", + compression=ioargs.compression, + is_text=False, + ) + return f, True, ioargs.compression else: raise TypeError("fname must be a binary file, buffer or path-like.") diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index e4d5db444e186..85a12a13d19fb 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -105,23 +105,21 @@ def test_infer_compression_from_path(self, extension, expected, path_type): compression = icom.infer_compression(path, compression="infer") assert compression == expected - def test_get_filepath_or_buffer_with_path(self): - filename = "~/sometest" - filepath_or_buffer, _, _, should_close, _ = icom.get_filepath_or_buffer( - filename - ) - assert filepath_or_buffer != filename - assert os.path.isabs(filepath_or_buffer) - assert os.path.expanduser(filename) == filepath_or_buffer - assert not should_close + @pytest.mark.parametrize("path_type", [str, CustomFSPath, Path]) + def 
test_get_filepath_or_buffer_with_path(self, path_type): + # ignore LocalPath: it creates strange paths: /absolute/~/sometest + filename = path_type("~/sometest") + ioargs = icom.get_filepath_or_buffer(filename) + assert ioargs.filepath_or_buffer != filename + assert os.path.isabs(ioargs.filepath_or_buffer) + assert os.path.expanduser(filename) == ioargs.filepath_or_buffer + assert not ioargs.should_close def test_get_filepath_or_buffer_with_buffer(self): input_buffer = StringIO() - filepath_or_buffer, _, _, should_close, _ = icom.get_filepath_or_buffer( - input_buffer - ) - assert filepath_or_buffer == input_buffer - assert not should_close + ioargs = icom.get_filepath_or_buffer(input_buffer) + assert ioargs.filepath_or_buffer == input_buffer + assert not ioargs.should_close def test_iterator(self): reader = pd.read_csv(StringIO(self.data1), chunksize=1)
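
Taken together, the three patches change two user-visible behaviors: zip compression works when writing to a filename-less binary buffer, and compression/encoding are honored when a path-like object is internally converted to a file-like object. A hedged sketch of the expected behavior on the patched branch (the fallback archive name "zip" comes from the _BytesZipFile change in patch 1; the fsspec line is commented out because it additionally assumes gcsfs and a reachable bucket):

    from io import BytesIO
    import zipfile

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})

    # GH 35058: zip compression into a binary buffer that has no filename.
    # _BytesZipFile falls back to the archive name "zip" when neither the
    # target nor the archive_name compression argument supplies one.
    buffer = BytesIO()
    df.to_csv(buffer, mode="wb", compression={"method": "zip"})
    buffer.seek(0)
    with zipfile.ZipFile(buffer) as zf:
        assert zf.namelist() == ["zip"]

    # GH 35677 / GH 26124 / GH 32392: compression and encoding are honored
    # when a path-like object is internally converted to a file-like object:
    # df.to_csv("gs://bucket/data.csv.gz", encoding="cp1251")  # needs gcsfs
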