From 3a54dde1fb4d5fce731114e437786df2981161ef Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 10 Jul 2020 11:20:55 -0400 Subject: [PATCH 01/27] Pepper storage_options --- pandas/core/frame.py | 8 ++++++-- pandas/io/common.py | 4 ++++ pandas/io/excel/_base.py | 5 +++-- pandas/io/feather_format.py | 5 +++-- pandas/io/formats/csvs.py | 6 ++++-- pandas/io/json/_json.py | 8 +++++--- pandas/io/parquet.py | 7 ++++++- pandas/io/parsers.py | 3 ++- pandas/io/pickle.py | 11 +++++++---- pandas/io/sas/sas7bdat.py | 4 +++- pandas/io/sas/sas_xport.py | 6 ++++-- pandas/io/stata.py | 6 ++++-- 12 files changed, 51 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 87041341ac3a6..09bd24b5543a1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2217,7 +2217,8 @@ def to_feather(self, path, **kwargs) -> None: """, ) def to_markdown( - self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, + storage_options: Optional[Dict[str, Any]] = None, **kwargs ) -> Optional[str]: kwargs.setdefault("headers", "keys") kwargs.setdefault("tablefmt", "pipe") @@ -2225,7 +2226,8 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode) + buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode, + storage_options=storage_options) assert buf is not None # Help mypy. buf.writelines(result) return None @@ -2238,6 +2240,7 @@ def to_parquet( compression="snappy", index=None, partition_cols=None, + storage_options: Optional[Dict[str, Any]] = None, **kwargs, ) -> None: """ @@ -2328,6 +2331,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, + storage_options=storage_options, **kwargs, ) diff --git a/pandas/io/common.py b/pandas/io/common.py index 51323c5ff3ef5..336de46279325 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -180,6 +180,8 @@ def get_filepath_or_buffer( if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged + if storage_options: + raise ValueError("storage_options passed with non-fsspec URL") req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": @@ -206,6 +208,8 @@ def get_filepath_or_buffer( filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) ).open() return file_obj, encoding, compression, True + elif storage_options: + raise ValueError("storage_options passed with non-fsspec URL") if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 4fa4f158e9c3c..0b102af3e7ed8 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -336,12 +336,13 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer): + def __init__(self, filepath_or_buffer, storage_options=None): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) + filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer, + 
storage_options=storage_options) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index dfa43942fc8b3..e1d72e31b0bcf 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -64,7 +64,8 @@ def to_feather(df: DataFrame, path, **kwargs): feather.write_feather(df, path, **kwargs) -def read_feather(path, columns=None, use_threads: bool = True): +def read_feather(path, columns=None, use_threads: bool = True, + storage_options=None): """ Load a feather-format object from the file path. @@ -98,7 +99,7 @@ def read_feather(path, columns=None, use_threads: bool = True): import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close = get_filepath_or_buffer(path) + path, _, _, should_close = get_filepath_or_buffer(path, storage_options=storage_options) df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 5bd51dc8351f6..61f58ff9579ea 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -12,7 +12,7 @@ import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, Dict, Any from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -54,6 +54,7 @@ def __init__( doublequote: bool = True, escapechar: Optional[str] = None, decimal=".", + storage_options: Optional[Dict[str, Any]] = None ): self.obj = obj @@ -64,7 +65,8 @@ def __init__( compression, self.compression_args = get_compression_method(compression) self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, mode=mode + path_or_buf, encoding=encoding, compression=compression, mode=mode, + storage_options=storage_options ) self.sep = sep self.na_rep = na_rep diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ff37c36962aec..93a9fa700e701 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -3,7 +3,7 @@ from io import BytesIO, StringIO from itertools import islice import os -from typing import Any, Callable, Optional, Type +from typing import Any, Callable, Optional, Type, Dict import numpy as np @@ -44,6 +44,7 @@ def to_json( compression: Optional[str] = "infer", index: bool = True, indent: int = 0, + storage_options: Optional[Dict[str, Any]] = None ): if not index and orient not in ["split", "table"]: @@ -53,7 +54,7 @@ def to_json( if path_or_buf is not None: path_or_buf, _, _, _ = get_filepath_or_buffer( - path_or_buf, compression=compression, mode="w" + path_or_buf, compression=compression, mode="w", storage_options=storage_options ) if lines and orient != "records": @@ -364,6 +365,7 @@ def read_json( chunksize: Optional[int] = None, compression="infer", nrows: Optional[int] = None, + storage_options: Optional[Dict[str, Any]] = None ): """ Convert a JSON string to pandas object. 
@@ -591,7 +593,7 @@ def read_json( compression = infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression + path_or_buf, encoding=encoding, compression=compression, storage_options=storage_options ) json_reader = JsonReader( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index a0c9242684f0f..0a5b9ab77fd96 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -89,6 +89,7 @@ def write( compression="snappy", index: Optional[bool] = None, partition_cols=None, + storage_options=None, **kwargs, ): self.validate_dataframe(df) @@ -104,9 +105,11 @@ def write( import_optional_dependency("fsspec") import fsspec.core - fs, path = fsspec.core.url_to_fs(path) + fs, path = fsspec.core.url_to_fs(path, **(storage_options or {})) kwargs["filesystem"] = fs else: + if storage_options: + raise ValueError("storage_options passed with non-fsspec URL") path = _expand_user(path) if partition_cols is not None: # writes to multiple files under the given path @@ -218,6 +221,7 @@ def to_parquet( compression="snappy", index: Optional[bool] = None, partition_cols=None, + storage_options: Optional[Dict[str, Any]] = None, **kwargs, ): """ @@ -268,6 +272,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, + storage_options=storage_options, **kwargs, ) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c427d3a198b10..f8f912a01d742 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -420,6 +420,7 @@ def _validate_names(names): def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" encoding = kwds.get("encoding", None) + storage_options = kwds.get('storage_options', None) if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding @@ -432,7 +433,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # though mypy handling of conditional imports is difficult. # See https://github.com/python/mypy/issues/1297 fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression + filepath_or_buffer, encoding, compression, storage_options=storage_options ) kwds["compression"] = compression diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 3b35b54a6dc16..5f591d50bdd6b 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -3,7 +3,7 @@ from typing import Any, Optional import warnings -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, Dict from pandas.compat import pickle_compat as pc from pandas.io.common import get_filepath_or_buffer, get_handle @@ -14,6 +14,7 @@ def to_pickle( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, + storage_options: Optional[Dict[str, Any]] = None ): """ Pickle (serialize) object to file. 
@@ -76,7 +77,8 @@ def to_pickle( >>> os.remove("./dummy.pkl") """ fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, compression=compression, mode="wb" + filepath_or_buffer, compression=compression, mode="wb", + storage_options=storage_options ) if not isinstance(fp_or_buf, str) and compression == "infer": compression = None @@ -97,7 +99,8 @@ def to_pickle( def read_pickle( - filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer" + filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer", + storage_options: Optional[Dict[str, Any]] = None ): """ Load pickled pandas object (or any object) from file. @@ -162,7 +165,7 @@ def read_pickle( >>> os.remove("./dummy.pkl") """ fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, compression=compression + filepath_or_buffer, compression=compression, storage_options=storage_options ) if not isinstance(fp_or_buf, str) and compression == "infer": compression = None diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 3d9be7c15726b..7f788abceb5c0 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -110,6 +110,7 @@ def __init__( encoding=None, convert_text=True, convert_header_text=True, + storage_options=None ): self.index = index @@ -137,7 +138,8 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) + self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf, + storage_options=storage_options) if isinstance(self._path_or_buf, str): self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 7fc1bc6d3eb6c..56c827a944efa 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -244,7 +244,8 @@ class XportReader(ReaderBase, abc.Iterator): __doc__ = _xport_reader_doc def __init__( - self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None + self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None, + storage_options=None, ): self._encoding = encoding @@ -258,7 +259,8 @@ def __init__( encoding, compression, should_close, - ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) + ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, + storage_options=storage_options) if isinstance(filepath_or_buffer, (str, bytes)): self.filepath_or_buffer = open(filepath_or_buffer, "rb") diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7677d8a94d521..39a3755a3bc5c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1035,6 +1035,7 @@ def __init__( columns: Optional[Sequence[str]] = None, order_categoricals: bool = True, chunksize: Optional[int] = None, + storage_options: Optional[Dict[str, Any]] = None, ): super().__init__() self.col_sizes: List[int] = [] @@ -1068,11 +1069,12 @@ def __init__( self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf) + path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf, + storage_options=storage_options) if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") - elif isinstance(path_or_buf, IOBase): + elif hasattr(path_or_buf, "read"): # Copy to BytesIO, and ensure no encoding contents = 
path_or_buf.read() self.path_or_buf = BytesIO(contents) From e549f8dde28c7c6ebd81516cc0a2fb28d3a3a1f0 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 10:09:29 -0400 Subject: [PATCH 02/27] Add feather test --- pandas/conftest.py | 18 ++++++++++++++++++ pandas/io/feather_format.py | 5 +++-- pandas/tests/io/test_feather.py | 9 +++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index e0adb37e7d2f5..0979c80f3f0c9 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1224,3 +1224,21 @@ def sort_by_key(request): Tests None (no key) and the identity key. """ return request.param + + +@pytest.fixture() +def fsspectest(): + pytest.importorskip('fsspec') + from fsspec.implementations.memory import MemoryFileSystem + from fsspec import register_implementation + + class TestMemoryFS(MemoryFileSystem): + protocol = 'testmem' + test = [None] + + def __init__(self, **kwargs): + self.test[0] = kwargs.pop('test', None) + super().__init__(**kwargs) + + register_implementation('testmem', TestMemoryFS, True) + return TestMemoryFS() diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index e1d72e31b0bcf..653e3c51fab45 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -7,7 +7,7 @@ from pandas.io.common import get_filepath_or_buffer, stringify_path -def to_feather(df: DataFrame, path, **kwargs): +def to_feather(df: DataFrame, path, storage_options=None, **kwargs): """ Write a DataFrame to the binary Feather format. @@ -23,7 +23,8 @@ def to_feather(df: DataFrame, path, **kwargs): import_optional_dependency("pyarrow") from pyarrow import feather - path = stringify_path(path) + path, _, _, should_close = get_filepath_or_buffer( + path, mode='wb', storage_options=storage_options) if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a8a5c8f00e6bf..767050a00ea7c 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -186,3 +186,12 @@ def test_http_path(self, feather_file): expected = pd.read_feather(feather_file) res = pd.read_feather(url) tm.assert_frame_equal(expected, res) + + +def test_fsspec_options(fsspectest): + df = pd.DataFrame({'a': [0]}) + df.to_feather('testmem://afile', storage_options={'test': 'feather_write'}) + assert fsspectest.test[0] == "feather_write" + out = pd.read_feather('testmem://afile', storage_options={'test': 'feather_read'}) + assert fsspectest.test[0] == "feather_read" + tm.assert_frame_equal(df, out) From 0034bff3cf8d04b9e4c969d0ea77bf10aad2b5e2 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 11:47:00 -0400 Subject: [PATCH 03/27] Add CSV and parquet options tests; lint --- pandas/conftest.py | 8 +++--- pandas/core/frame.py | 5 ++-- pandas/core/generic.py | 2 ++ pandas/io/excel/_base.py | 5 ++-- pandas/io/feather_format.py | 12 +++++---- pandas/io/formats/csvs.py | 9 ++++--- pandas/io/json/_json.py | 14 +++++++--- pandas/io/parquet.py | 30 ++++++++++++++++----- pandas/io/parsers.py | 4 ++- pandas/io/pickle.py | 13 +++++---- pandas/io/sas/sas7bdat.py | 7 ++--- pandas/io/sas/sas_xport.py | 11 +++++--- pandas/io/stata.py | 7 ++--- pandas/tests/io/test_feather.py | 6 ++--- pandas/tests/io/test_fsspec.py | 48 +++++++++++++++++++++++++++++++++ pandas/tests/io/test_s3.py | 2 +- 16 files changed, 137 insertions(+), 46 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 
0979c80f3f0c9..52a1a2678b6f6 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1228,17 +1228,17 @@ def sort_by_key(request): @pytest.fixture() def fsspectest(): - pytest.importorskip('fsspec') + pytest.importorskip("fsspec") from fsspec.implementations.memory import MemoryFileSystem from fsspec import register_implementation class TestMemoryFS(MemoryFileSystem): - protocol = 'testmem' + protocol = "testmem" test = [None] def __init__(self, **kwargs): - self.test[0] = kwargs.pop('test', None) + self.test[0] = kwargs.pop("test", None) super().__init__(**kwargs) - register_implementation('testmem', TestMemoryFS, True) + register_implementation("testmem", TestMemoryFS, True) return TestMemoryFS() diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 141a24f3dbcb7..d3f07c0ec3ff2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2260,8 +2260,9 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode, - storage_options=storage_options) + buf, _, _, _ = get_filepath_or_buffer( + buf, mode=mode, storage_options=storage_options + ) assert buf is not None # Help mypy. buf.writelines(result) return None diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e46fde1f59f16..a129787ad6adf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3010,6 +3010,7 @@ def to_csv( escapechar: Optional[str] = None, decimal: Optional[str] = ".", errors: str = "strict", + storage_options: Optional[Dict[str, Any]] = None, ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3163,6 +3164,7 @@ def to_csv( doublequote=doublequote, escapechar=escapechar, decimal=decimal, + storage_options=storage_options, ) formatter.save() diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index e1eb8bf5f05ea..962d974e353e1 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -341,8 +341,9 @@ def __init__(self, filepath_or_buffer, storage_options=None): if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer, - storage_options=storage_options) + filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer, storage_options=storage_options + ) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 653e3c51fab45..80a3ee1496a2e 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -4,7 +4,7 @@ from pandas import DataFrame, Int64Index, RangeIndex -from pandas.io.common import get_filepath_or_buffer, stringify_path +from pandas.io.common import get_filepath_or_buffer def to_feather(df: DataFrame, path, storage_options=None, **kwargs): @@ -24,7 +24,8 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs): from pyarrow import feather path, _, _, should_close = get_filepath_or_buffer( - path, mode='wb', storage_options=storage_options) + path, mode="wb", storage_options=storage_options + ) if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -65,8 +66,7 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs): feather.write_feather(df, path, **kwargs) -def read_feather(path, columns=None, use_threads: bool = True, - 
storage_options=None): +def read_feather(path, columns=None, use_threads: bool = True, storage_options=None): """ Load a feather-format object from the file path. @@ -100,7 +100,9 @@ def read_feather(path, columns=None, use_threads: bool = True, import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close = get_filepath_or_buffer(path, storage_options=storage_options) + path, _, _, should_close = get_filepath_or_buffer( + path, storage_options=storage_options + ) df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 61f58ff9579ea..dba307c0fad95 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -54,7 +54,7 @@ def __init__( doublequote: bool = True, escapechar: Optional[str] = None, decimal=".", - storage_options: Optional[Dict[str, Any]] = None + storage_options: Optional[Dict[str, Any]] = None, ): self.obj = obj @@ -65,8 +65,11 @@ def __init__( compression, self.compression_args = get_compression_method(compression) self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, mode=mode, - storage_options=storage_options + path_or_buf, + encoding=encoding, + compression=compression, + mode=mode, + storage_options=storage_options, ) self.sep = sep self.na_rep = na_rep diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 93a9fa700e701..429edf323cb0e 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -44,7 +44,7 @@ def to_json( compression: Optional[str] = "infer", index: bool = True, indent: int = 0, - storage_options: Optional[Dict[str, Any]] = None + storage_options: Optional[Dict[str, Any]] = None, ): if not index and orient not in ["split", "table"]: @@ -54,7 +54,10 @@ def to_json( if path_or_buf is not None: path_or_buf, _, _, _ = get_filepath_or_buffer( - path_or_buf, compression=compression, mode="w", storage_options=storage_options + path_or_buf, + compression=compression, + mode="w", + storage_options=storage_options, ) if lines and orient != "records": @@ -365,7 +368,7 @@ def read_json( chunksize: Optional[int] = None, compression="infer", nrows: Optional[int] = None, - storage_options: Optional[Dict[str, Any]] = None + storage_options: Optional[Dict[str, Any]] = None, ): """ Convert a JSON string to pandas object. 
@@ -593,7 +596,10 @@ def read_json( compression = infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, storage_options=storage_options + path_or_buf, + encoding=encoding, + compression=compression, + storage_options=storage_options, ) json_reader = JsonReader( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 31af355c55bf5..7a66ca1b19587 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -89,8 +89,7 @@ def write( path: FilePathOrBuffer[AnyStr], compression: Optional[str] = "snappy", index: Optional[bool] = None, - partition_cols=None, - storage_options=None, + storage_options: Optional[Dict[str, Any]] = None, partition_cols: Optional[List[str]] = None, **kwargs, ): @@ -126,12 +125,18 @@ def write( # write to single output file self.api.parquet.write_table(table, path, compression=compression, **kwargs) - def read(self, path, columns=None, **kwargs): + def read( + self, + path, + columns=None, + storage_options: Optional[Dict[str, Any]] = None, + **kwargs, + ): if is_fsspec_url(path) and "filesystem" not in kwargs: import_optional_dependency("fsspec") import fsspec.core - fs, path = fsspec.core.url_to_fs(path) + fs, path = fsspec.core.url_to_fs(path, **(storage_options or {})) should_close = False else: fs = kwargs.pop("filesystem", None) @@ -167,6 +172,7 @@ def write( compression="snappy", index=None, partition_cols=None, + storage_options: Optional[Dict[str, Any]] = None, **kwargs, ): self.validate_dataframe(df) @@ -189,7 +195,9 @@ def write( fsspec = import_optional_dependency("fsspec") # if filesystem is provided by fsspec, file must be opened in 'wb' mode. - kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open() + kwargs["open_with"] = lambda path, _: fsspec.open( + path, "wb", **(storage_options or {}) + ).open() else: path, _, _, _ = get_filepath_or_buffer(path) @@ -203,11 +211,19 @@ def write( **kwargs, ) - def read(self, path, columns=None, **kwargs): + def read( + self, + path, + columns=None, + storage_options: Optional[Dict[str, Any]] = None, + **kwargs, + ): if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") - open_with = lambda path, _: fsspec.open(path, "rb").open() + open_with = lambda path, _: fsspec.open( + path, "rb", **(storage_options or {}) + ).open() parquet_file = self.api.ParquetFile(path, open_with=open_with) else: path, _, _, _ = get_filepath_or_buffer(path) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8f26870c356a2..9dc0e1f71d13b 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -420,7 +420,7 @@ def _validate_names(names): def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" encoding = kwds.get("encoding", None) - storage_options = kwds.get('storage_options', None) + storage_options = kwds.get("storage_options", None) if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding @@ -596,6 +596,7 @@ def read_csv( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, + storage_options=None, ): # gh-23761 # @@ -682,6 +683,7 @@ def read_csv( mangle_dupe_cols=mangle_dupe_cols, infer_datetime_format=infer_datetime_format, skip_blank_lines=skip_blank_lines, + storage_options=storage_options, ) return _read(filepath_or_buffer, kwds) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 5f591d50bdd6b..c845d898dfae9 100644 --- a/pandas/io/pickle.py 
+++ b/pandas/io/pickle.py @@ -14,7 +14,7 @@ def to_pickle( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, - storage_options: Optional[Dict[str, Any]] = None + storage_options: Optional[Dict[str, Any]] = None, ): """ Pickle (serialize) object to file. @@ -77,8 +77,10 @@ def to_pickle( >>> os.remove("./dummy.pkl") """ fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, compression=compression, mode="wb", - storage_options=storage_options + filepath_or_buffer, + compression=compression, + mode="wb", + storage_options=storage_options, ) if not isinstance(fp_or_buf, str) and compression == "infer": compression = None @@ -99,8 +101,9 @@ def to_pickle( def read_pickle( - filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer", - storage_options: Optional[Dict[str, Any]] = None + filepath_or_buffer: FilePathOrBuffer, + compression: Optional[str] = "infer", + storage_options: Optional[Dict[str, Any]] = None, ): """ Load pickled pandas object (or any object) from file. diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 7f788abceb5c0..32ef0b20f8a2a 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -110,7 +110,7 @@ def __init__( encoding=None, convert_text=True, convert_header_text=True, - storage_options=None + storage_options=None, ): self.index = index @@ -138,8 +138,9 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf, - storage_options=storage_options) + self._path_or_buf, _, _, _ = get_filepath_or_buffer( + path_or_buf, storage_options=storage_options + ) if isinstance(self._path_or_buf, str): self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 56c827a944efa..a768589ae6f9a 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -244,7 +244,11 @@ class XportReader(ReaderBase, abc.Iterator): __doc__ = _xport_reader_doc def __init__( - self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None, + self, + filepath_or_buffer, + index=None, + encoding="ISO-8859-1", + chunksize=None, storage_options=None, ): @@ -259,8 +263,9 @@ def __init__( encoding, compression, should_close, - ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, - storage_options=storage_options) + ) = get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding, storage_options=storage_options + ) if isinstance(filepath_or_buffer, (str, bytes)): self.filepath_or_buffer = open(filepath_or_buffer, "rb") diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 39a3755a3bc5c..4f6a13ed867d0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -11,7 +11,7 @@ """ from collections import abc import datetime -from io import BytesIO, IOBase +from io import BytesIO import os from pathlib import Path import struct @@ -1069,8 +1069,9 @@ def __init__( self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf, - storage_options=storage_options) + path_or_buf, encoding, _, should_close = get_filepath_or_buffer( + path_or_buf, storage_options=storage_options + ) if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") diff --git 
a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 767050a00ea7c..2356e9c87ea05 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -189,9 +189,9 @@ def test_http_path(self, feather_file): def test_fsspec_options(fsspectest): - df = pd.DataFrame({'a': [0]}) - df.to_feather('testmem://afile', storage_options={'test': 'feather_write'}) + df = pd.DataFrame({"a": [0]}) + df.to_feather("testmem://afile", storage_options={"test": "feather_write"}) assert fsspectest.test[0] == "feather_write" - out = pd.read_feather('testmem://afile', storage_options={'test': 'feather_read'}) + out = pd.read_feather("testmem://afile", storage_options={"test": "feather_read"}) assert fsspectest.test[0] == "feather_read" tm.assert_frame_equal(df, out) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index c397a61616c1c..98a942e7af5fc 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -62,6 +62,16 @@ def test_to_csv(cleared_fs): tm.assert_frame_equal(df1, df2) +def test_csv_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_csv( + "testmem://test/test.csv", storage_options={"test": "csv_write"}, index=False + ) + assert fsspectest.test[0] == "csv_write" + read_csv("testmem://test/test.csv", storage_options={"test": "csv_read"}) + assert fsspectest.test[0] == "csv_read" + + @td.skip_if_no("fastparquet") def test_to_parquet_new_file(monkeypatch, cleared_fs): """Regression test for writing to a not-yet-existent GCS Parquet file.""" @@ -70,6 +80,44 @@ def test_to_parquet_new_file(monkeypatch, cleared_fs): ) +@td.skip_if_no("pyarrow") +def test_arrowparquet_options(fsspectest): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" + df = DataFrame({"a": [0]}) + df.to_parquet( + "testmem://test/test.csv", + engine="pyarrow", + compression=None, + storage_options={"test": "parquet_write"}, + ) + assert fsspectest.test[0] == "parquet_write" + read_parquet( + "testmem://test/test.csv", + engine="pyarrow", + storage_options={"test": "parquet_read"}, + ) + assert fsspectest.test[0] == "parquet_read" + + +@td.skip_if_no("fastparquet") +def test_fastparquet_options(fsspectest): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" + df = DataFrame({"a": [0]}) + df.to_parquet( + "testmem://test/test.csv", + engine="fastparquet", + compression=None, + storage_options={"test": "parquet_write"}, + ) + assert fsspectest.test[0] == "parquet_write" + read_parquet( + "testmem://test/test.csv", + engine="fastparquet", + storage_options={"test": "parquet_read"}, + ) + assert fsspectest.test[0] == "parquet_read" + + @td.skip_if_no("s3fs") def test_from_s3_csv(s3_resource, tips_file): tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file)) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 5e0f7edf4d8ae..a137e76b1696b 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -32,7 +32,7 @@ def test_read_without_creds_from_pub_bucket(): @tm.network @td.skip_if_no("s3fs") -def test_read_with_creds_from_pub_bucke(): +def test_read_with_creds_from_pub_bucket(): # Ensure we can read from a public bucket with credentials # GH 34626 # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt From 19f041dbcf02cad68a664e94d10b48d40cfd1c70 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 12:09:08 -0400 Subject: [PATCH 04/27] deeper lint --- pandas/io/formats/csvs.py | 4 ++-- 
pandas/io/json/_json.py | 2 +- pandas/io/pickle.py | 3 +-- pandas/io/stata.py | 1 + 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index dba307c0fad95..4d8fb9270f5c9 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,14 +5,14 @@ import csv as csvlib from io import StringIO import os -from typing import Hashable, List, Mapping, Optional, Sequence, Union +from typing import Any, Dict, Hashable, List, Mapping, Optional, Sequence, Union import warnings from zipfile import ZipFile import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import FilePathOrBuffer, Dict, Any +from pandas._typing import FilePathOrBuffer from pandas.core.dtypes.generic import ( ABCDatetimeIndex, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 429edf323cb0e..ea37ec4c50a7c 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -3,7 +3,7 @@ from io import BytesIO, StringIO from itertools import islice import os -from typing import Any, Callable, Optional, Type, Dict +from typing import Any, Callable, Dict, Optional, Type import numpy as np diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index c845d898dfae9..3c55dd4fe043e 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -3,9 +3,8 @@ from typing import Any, Optional import warnings -from pandas._typing import FilePathOrBuffer, Dict +from pandas._typing import Dict, FilePathOrBuffer from pandas.compat import pickle_compat as pc - from pandas.io.common import get_filepath_or_buffer, get_handle diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 4f6a13ed867d0..39c7e37ae0f6e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1076,6 +1076,7 @@ def __init__( if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") elif hasattr(path_or_buf, "read"): + assert not isinstance(path_or_buf, str) # appease typing # Copy to BytesIO, and ensure no encoding contents = path_or_buf.read() self.path_or_buf = BytesIO(contents) From f9e1e692da85a0e4d97a2bf8835bc75525b85eed Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 15:21:50 -0400 Subject: [PATCH 05/27] more tests --- pandas/core/generic.py | 11 +++++++++- pandas/io/json/_json.py | 4 +++- pandas/io/pickle.py | 4 ++-- pandas/io/stata.py | 4 ++-- pandas/tests/io/test_feather.py | 9 -------- pandas/tests/io/test_fsspec.py | 38 ++++++++++++++++++++++++++++++++- 6 files changed, 54 insertions(+), 16 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a129787ad6adf..315ffc4a84fe3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2042,6 +2042,7 @@ def to_json( compression: Optional[str] = "infer", index: bool_t = True, indent: Optional[int] = None, + storage_options: Optional[Dict[str, Any]] = None, ) -> Optional[str]: """ Convert the object to a JSON string. @@ -2303,6 +2304,7 @@ def to_json( compression=compression, index=index, indent=indent, + storage_options=storage_options, ) def to_hdf( @@ -2617,6 +2619,7 @@ def to_pickle( path, compression: Optional[str] = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, + storage_options: Optional[Dict[str, Any]] = None, ) -> None: """ Pickle (serialize) object to file. 
@@ -2670,7 +2673,13 @@ def to_pickle( """ from pandas.io.pickle import to_pickle - to_pickle(self, path, compression=compression, protocol=protocol) + to_pickle( + self, + path, + compression=compression, + protocol=protocol, + storage_options=storage_options, + ) def to_clipboard( self, excel: bool_t = True, sep: Optional[str] = None, **kwargs diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ea37ec4c50a7c..3f10c4694d10f 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -53,7 +53,7 @@ def to_json( ) if path_or_buf is not None: - path_or_buf, _, _, _ = get_filepath_or_buffer( + path_or_buf, _, _, should_close = get_filepath_or_buffer( path_or_buf, compression=compression, mode="w", @@ -101,6 +101,8 @@ def to_json( return s else: path_or_buf.write(s) + if should_close: + path_or_buf.close() class Writer: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 3c55dd4fe043e..01fd357d03763 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,9 +1,9 @@ """ pickle compat """ import pickle -from typing import Any, Optional +from typing import Any, Dict, Optional import warnings -from pandas._typing import Dict, FilePathOrBuffer +from pandas._typing import FilePathOrBuffer from pandas.compat import pickle_compat as pc from pandas.io.common import get_filepath_or_buffer, get_handle diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 39c7e37ae0f6e..d3fc5f55ce7ac 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -35,7 +35,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import FilePathOrBuffer, Label +from pandas._typing import FilePathOrBuffer, IO, Label from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -1076,8 +1076,8 @@ def __init__( if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") elif hasattr(path_or_buf, "read"): - assert not isinstance(path_or_buf, str) # appease typing # Copy to BytesIO, and ensure no encoding + assert isinstance(path_or_buf, IO) contents = path_or_buf.read() self.path_or_buf = BytesIO(contents) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 2356e9c87ea05..a8a5c8f00e6bf 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -186,12 +186,3 @@ def test_http_path(self, feather_file): expected = pd.read_feather(feather_file) res = pd.read_feather(url) tm.assert_frame_equal(expected, res) - - -def test_fsspec_options(fsspectest): - df = pd.DataFrame({"a": [0]}) - df.to_feather("testmem://afile", storage_options={"test": "feather_write"}) - assert fsspectest.test[0] == "feather_write" - out = pd.read_feather("testmem://afile", storage_options={"test": "feather_read"}) - assert fsspectest.test[0] == "feather_read" - tm.assert_frame_equal(df, out) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 98a942e7af5fc..4289e1bf84461 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -1,7 +1,15 @@ import numpy as np import pytest -from pandas import DataFrame, date_range, read_csv, read_parquet +from pandas import ( + DataFrame, + date_range, + read_csv, + read_parquet, + read_feather, + read_pickle, + read_json, +) import pandas._testing as tm from pandas.util import _test_decorators as td @@ -148,3 +156,31 @@ def test_not_present_exception(): with pytest.raises(ImportError) as e: read_csv("memory://test/test.csv") assert "fsspec library is 
required" in str(e.value) + + +@td.skip_if_no("pyarrow") +def test_feather_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_feather("testmem://afile", storage_options={"test": "feather_write"}) + assert fsspectest.test[0] == "feather_write" + out = read_feather("testmem://afile", storage_options={"test": "feather_read"}) + assert fsspectest.test[0] == "feather_read" + tm.assert_frame_equal(df, out) + + +def test_pickle_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_pickle("testmem://afile", storage_options={"test": "pickle_write"}) + assert fsspectest.test[0] == "pickle_write" + out = read_pickle("testmem://afile", storage_options={"test": "pickle_read"}) + assert fsspectest.test[0] == "pickle_read" + tm.assert_frame_equal(df, out) + + +def test_json_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_json("testmem://afile", storage_options={"test": "json_write"}) + assert fsspectest.test[0] == "json_write" + out = read_json("testmem://afile", storage_options={"test": "json_read"}) + assert fsspectest.test[0] == "json_read" + tm.assert_frame_equal(df, out) From 7f69afea9835f187d70c0f38913822c653d680e3 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 15:31:03 -0400 Subject: [PATCH 06/27] blank line --- pandas/io/pickle.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 01fd357d03763..44b15391da6a4 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -5,6 +5,7 @@ from pandas._typing import FilePathOrBuffer from pandas.compat import pickle_compat as pc + from pandas.io.common import get_filepath_or_buffer, get_handle From cc0e4c30625caf4a4ed120482900496588c80e81 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 15:58:21 -0400 Subject: [PATCH 07/27] attempt relint --- pandas/io/stata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d3fc5f55ce7ac..5508bb759c9b2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -35,7 +35,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import FilePathOrBuffer, IO, Label +from pandas._typing import IO, FilePathOrBuffer, Label from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -1077,8 +1077,8 @@ def __init__( self.path_or_buf = open(path_or_buf, "rb") elif hasattr(path_or_buf, "read"): # Copy to BytesIO, and ensure no encoding - assert isinstance(path_or_buf, IO) - contents = path_or_buf.read() + pb: Any = path_or_buf + contents = pb.read() self.path_or_buf = BytesIO(contents) self._read_header() From e356e9351d93d637b7c6058a96147442d6a8f2ec Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 16:44:06 -0400 Subject: [PATCH 08/27] unused import --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 5508bb759c9b2..feb76e6c37bda 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -35,7 +35,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import IO, FilePathOrBuffer, Label +from pandas._typing import FilePathOrBuffer, Label from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( From c7170dd8adac757cd997e16ef4cd21cbcced88be Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 17:18:48 -0400 Subject: [PATCH 09/27] more order --- 
pandas/tests/io/test_fsspec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 4289e1bf84461..4b3f5b87e6583 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -5,10 +5,10 @@ DataFrame, date_range, read_csv, - read_parquet, read_feather, - read_pickle, read_json, + read_parquet, + read_pickle, ) import pandas._testing as tm from pandas.util import _test_decorators as td @@ -45,8 +45,8 @@ def test_read_csv(cleared_fs): def test_reasonable_error(monkeypatch, cleared_fs): - from fsspec.registry import known_implementations from fsspec import registry + from fsspec.registry import known_implementations registry.target.clear() with pytest.raises(ValueError) as e: From b96778dcc69871d33252fd3c9d10c34f5836f4ef Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 11:23:51 -0400 Subject: [PATCH 10/27] plumb stata and test --- pandas/core/frame.py | 2 ++ pandas/io/stata.py | 19 ++++++++++++++++--- pandas/tests/io/test_fsspec.py | 12 ++++++++++++ 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d3f07c0ec3ff2..344f35b01bce7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2055,6 +2055,7 @@ def to_stata( version: Optional[int] = 114, convert_strl: Optional[Sequence[Label]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: Optional[Dict[str, Any]] = None, ) -> None: """ Export DataFrame object to Stata dta format. @@ -2187,6 +2188,7 @@ def to_stata( write_index=write_index, variable_labels=variable_labels, compression=compression, + storage_options=storage_options, **kwargs, ) writer.write_file() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index feb76e6c37bda..cc59343fc0534 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1911,6 +1911,7 @@ def read_stata( order_categoricals: bool = True, chunksize: Optional[int] = None, iterator: bool = False, + storage_options: Optional[Dict[str, Any]] = None, ) -> Union[DataFrame, StataReader]: reader = StataReader( @@ -1923,6 +1924,7 @@ def read_stata( columns=columns, order_categoricals=order_categoricals, chunksize=chunksize, + storage_options=storage_options, ) if iterator or chunksize: @@ -1936,7 +1938,9 @@ def read_stata( def _open_file_binary_write( - fname: FilePathOrBuffer, compression: Union[str, Mapping[str, str], None], + fname: FilePathOrBuffer, + compression: Union[str, Mapping[str, str], None], + storage_options: Optional[Dict[str, Any]] = None, ) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]: """ Open a binary file or no-op if file-like. 
@@ -1963,7 +1967,10 @@ def _open_file_binary_write( compression_typ, compression_args = get_compression_method(compression) compression_typ = infer_compression(fname, compression_typ) path_or_buf, _, compression_typ, _ = get_filepath_or_buffer( - fname, compression=compression_typ + fname, + mode="wb", + compression=compression_typ, + storage_options=storage_options, ) if compression_typ is not None: compression = compression_args @@ -2209,6 +2216,7 @@ def __init__( data_label: Optional[str] = None, variable_labels: Optional[Dict[Label, str]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: Optional[Dict[str, Any]] = None, ): super().__init__() self._convert_dates = {} if convert_dates is None else convert_dates @@ -2221,6 +2229,7 @@ def __init__( self._output_file: Optional[BinaryIO] = None # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) + self.storage_options = storage_options if byteorder is None: byteorder = sys.byteorder @@ -2507,7 +2516,7 @@ def _encode_strings(self) -> None: def write_file(self) -> None: self._file, self._own_file, compression = _open_file_binary_write( - self._fname, self._compression + self._fname, self._compression, storage_options=self.storage_options ) if compression is not None: self._output_file = self._file @@ -3090,6 +3099,7 @@ def __init__( variable_labels: Optional[Dict[Label, str]] = None, convert_strl: Optional[Sequence[Label]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: Optional[Dict[str, Any]] = None, ): # Copy to new list since convert_strl might be modified later self._convert_strl: List[Label] = [] @@ -3106,6 +3116,7 @@ def __init__( data_label=data_label, variable_labels=variable_labels, compression=compression, + storage_options=storage_options, ) self._map: Dict[str, int] = {} self._strl_blob = b"" @@ -3493,6 +3504,7 @@ def __init__( convert_strl: Optional[Sequence[Label]] = None, version: Optional[int] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: Optional[Dict[str, Any]] = None, ): if version is None: version = 118 if data.shape[1] <= 32767 else 119 @@ -3515,6 +3527,7 @@ def __init__( variable_labels=variable_labels, convert_strl=convert_strl, compression=compression, + storage_options=storage_options, ) # Override version set in StataWriter117 init self._dta_version = version diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 4b3f5b87e6583..86288255fc566 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -9,6 +9,7 @@ read_json, read_parquet, read_pickle, + read_stata, ) import pandas._testing as tm from pandas.util import _test_decorators as td @@ -184,3 +185,14 @@ def test_json_options(fsspectest): out = read_json("testmem://afile", storage_options={"test": "json_read"}) assert fsspectest.test[0] == "json_read" tm.assert_frame_equal(df, out) + + +def test_stata_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_stata( + "testmem://afile", storage_options={"test": "stata_write"}, write_index=False + ) + assert fsspectest.test[0] == "stata_write" + out = read_stata("testmem://afile", storage_options={"test": "stata_read"}) + assert fsspectest.test[0] == "stata_read" + tm.assert_frame_equal(df, out.astype("int64")) From 1dc41b1acaa48bb56dcbd3a25d80b3cb740985c5 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 11:50:21 -0400 Subject: [PATCH 11/27] Add note about storage_options in whatsnew --- 
doc/source/whatsnew/v1.1.0.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 55e2a810e6fc3..72687608f64b6 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -265,6 +265,12 @@ SSH, FTP, dropbox and github. For docs and capabilities, see the `fsspec docs`_. The existing capability to interface with S3 and GCS will be unaffected by this change, as ``fsspec`` will still bring in the same packages as before. +Many read/write functions have acquired the `storage_options` optional argument, +to pass a dictionary of parameters to the storage backend. This allows, for +example, for passing credentials to S3 and GCS storage. The details of what +parameters can be passed to which backends can be found in the documentation +of the individual storage backends. + .. _Azure Datalake and Blob: https://github.com/dask/adlfs .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/ From d8829841586a4a3433d5d85aad12f84b99b6df82 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 12:00:24 -0400 Subject: [PATCH 12/27] Plumb and test markdown --- pandas/core/frame.py | 9 ++++++--- pandas/tests/io/test_fsspec.py | 7 +++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 344f35b01bce7..4354979fcfe3c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2241,8 +2241,8 @@ def to_feather(self, path, **kwargs) -> None: ) def to_markdown( self, - buf: Optional[IO[str]] = None, - mode: Optional[str] = None, + buf: Optional[Union[IO[str], str]] = None, + mode: Optional[str] = "wt", index: bool = True, storage_options: Optional[Dict[str, Any]] = None, **kwargs, @@ -2262,11 +2262,14 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, _ = get_filepath_or_buffer( + buf, _, _, should_close = get_filepath_or_buffer( buf, mode=mode, storage_options=storage_options ) assert buf is not None # Help mypy. 
+ assert not isinstance(buf, str) buf.writelines(result) + if should_close: + buf.close() return None @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 86288255fc566..58b8332920da2 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -196,3 +196,10 @@ def test_stata_options(fsspectest): out = read_stata("testmem://afile", storage_options={"test": "stata_read"}) assert fsspectest.test[0] == "stata_read" tm.assert_frame_equal(df, out.astype("int64")) + + +def test_markdown_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_markdown("testmem://afile", storage_options={"test": "md_write"}) + assert fsspectest.test[0] == "md_write" + assert fsspectest.cat("afile") From f1e455dbd78d129dbf40fdd9dd155ccd1e012831 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 13:37:57 -0400 Subject: [PATCH 13/27] optional markdown --- pandas/tests/io/test_fsspec.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 58b8332920da2..09d82244ca13f 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -198,6 +198,7 @@ def test_stata_options(fsspectest): tm.assert_frame_equal(df, out.astype("int64")) +@td.skip_if_no("tabulate") def test_markdown_options(fsspectest): df = DataFrame({"a": [0]}) df.to_markdown("testmem://afile", storage_options={"test": "md_write"}) From c88b75f2e3038a4115570be775db7d0fa6114309 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 13:52:42 -0400 Subject: [PATCH 14/27] remove extraneous --- pandas/io/excel/_base.py | 6 ++---- pandas/io/sas/sas7bdat.py | 5 +---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 962d974e353e1..2a12f779230b2 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -336,14 +336,12 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer, storage_options=None): + def __init__(self, filepath_or_buffer): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer( - filepath_or_buffer, storage_options=storage_options - ) + filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 32ef0b20f8a2a..3d9be7c15726b 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -110,7 +110,6 @@ def __init__( encoding=None, convert_text=True, convert_header_text=True, - storage_options=None, ): self.index = index @@ -138,9 +137,7 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf, _, _, _ = get_filepath_or_buffer( - path_or_buf, storage_options=storage_options - ) + self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) if isinstance(self._path_or_buf, str): self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf From 58481a4bef9ff2abf7ade0b9022929fce2591f4b Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 13:54:35 -0400 Subject: [PATCH 15/27] more extraneous --- 
pandas/io/sas/sas_xport.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index a768589ae6f9a..6cf248b748107 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -244,12 +244,7 @@ class XportReader(ReaderBase, abc.Iterator): __doc__ = _xport_reader_doc def __init__( - self, - filepath_or_buffer, - index=None, - encoding="ISO-8859-1", - chunksize=None, - storage_options=None, + self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None, ): self._encoding = encoding @@ -263,9 +258,7 @@ def __init__( encoding, compression, should_close, - ) = get_filepath_or_buffer( - filepath_or_buffer, encoding=encoding, storage_options=storage_options - ) + ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) if isinstance(filepath_or_buffer, (str, bytes)): self.filepath_or_buffer = open(filepath_or_buffer, "rb") From 704770b3598b0c5dc6ecac6a984ea514ac9682d1 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 24 Jul 2020 13:33:22 -0400 Subject: [PATCH 16/27] Add fsspec options error and docstrings --- pandas/core/frame.py | 22 +++++++++++++++++++++- pandas/core/generic.py | 28 ++++++++++++++++++++++++++++ pandas/io/common.py | 20 ++++++++++++++++---- pandas/io/feather_format.py | 10 ++++++++++ pandas/io/json/_json.py | 10 ++++++++++ pandas/io/parquet.py | 14 +++++++++++++- pandas/io/pickle.py | 20 ++++++++++++++++++++ pandas/io/stata.py | 20 ++++++++++++++++++++ pandas/tests/io/test_fsspec.py | 13 +++++++++++++ 9 files changed, 151 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4354979fcfe3c..f089bdfea3a4a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2132,6 +2132,16 @@ def to_stata( .. versionadded:: 1.1.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.1.0 + Raises ------ NotImplementedError @@ -2242,7 +2252,7 @@ def to_feather(self, path, **kwargs) -> None: def to_markdown( self, buf: Optional[Union[IO[str], str]] = None, - mode: Optional[str] = "wt", + mode: str = "wt", index: bool = True, storage_options: Optional[Dict[str, Any]] = None, **kwargs, @@ -2329,6 +2339,16 @@ def to_parquet( .. versionadded:: 0.24.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.1.0 + **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io ` for more details. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 315ffc4a84fe3..f4854ece0dea4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2126,6 +2126,16 @@ def to_json( .. versionadded:: 1.0.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. 
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index 315ffc4a84fe3..f4854ece0dea4 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2126,6 +2126,16 @@ def to_json(
 
             .. versionadded:: 1.0.0
 
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc., if using a URL that will
+            be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+            will be raised if providing this argument with a local path or
+            a file-like buffer. See the fsspec and backend storage implementation
+            docs for the set of allowed keys and values
+
+            .. versionadded:: 1.1.0
+
         Returns
         -------
         None or str
@@ -2640,6 +2650,16 @@ def to_pickle(
 
         .. [1] https://docs.python.org/3/library/pickle.html.
 
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc., if using a URL that will
+            be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+            will be raised if providing this argument with a local path or
+            a file-like buffer. See the fsspec and backend storage implementation
+            docs for the set of allowed keys and values
+
+            .. versionadded:: 1.1.0
+
         See Also
         --------
         read_pickle : Load pickled pandas object (or any object) from file.
@@ -3119,6 +3139,14 @@ def to_csv(
             See the errors argument for :func:`open` for a full
             list of options.
 
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc., if using a URL that will
+            be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+            will be raised if providing this argument with a local path or
+            a file-like buffer. See the fsspec and backend storage implementation
+            docs for the set of allowed keys and values
+
             .. versionadded:: 1.1.0
 
         Returns
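The next file, ``pandas/io/common.py``, holds the internal helper all of these
methods funnel through. Its contract at this point in the series, sketched
below; this is private API, shown only to make the hunks easier to follow:

.. code-block:: python

    from pandas.io.common import get_filepath_or_buffer

    # returns (filepath_or_buffer, encoding, compression, should_close);
    # should_close tells the caller it owns the handle and must close it
    obj, encoding, compression, should_close = get_filepath_or_buffer(
        "data.csv", mode="rb"
    )
    try:
        data = obj.read() if hasattr(obj, "read") else open(obj, "rb").read()
    finally:
        if should_close:
            obj.close()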
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 56041481a9d34..152d09f569371 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -167,8 +167,16 @@ def get_filepath_or_buffer(
     compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
     encoding : the encoding to use to decode bytes, default is 'utf-8'
     mode : str, optional
-    storage_options: dict, optional
-        passed on to fsspec, if using it; this is not yet accessed by the public API
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc., if using a URL that will
+        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+        will be raised if providing this argument with a local path or
+        a file-like buffer. See the fsspec and backend storage implementation
+        docs for the set of allowed keys and values
+
+        .. versionadded:: 1.1.0
 
     Returns
     -------
@@ -181,7 +189,9 @@ def get_filepath_or_buffer(
     if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
         # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged
         if storage_options:
-            raise ValueError("storage_options passed with non-fsspec URL")
+            raise ValueError(
+                "storage_options passed with file object or non-fsspec file path"
+            )
         req = urlopen(filepath_or_buffer)
         content_encoding = req.headers.get("Content-Encoding", None)
         if content_encoding == "gzip":
@@ -237,7 +247,9 @@ def get_filepath_or_buffer(
             filepath_or_buffer, mode=mode or "rb", **(storage_options or {})
         ).open()
         return file_obj, encoding, compression, True
     elif storage_options:
-        raise ValueError("storage_options passed with non-fsspec URL")
+        raise ValueError(
+            "storage_options passed with file object or non-fsspec file path"
+        )
 
     if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
         return _expand_user(filepath_or_buffer), None, compression, False
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
index 80a3ee1496a2e..f6760bb3c772f 100644
--- a/pandas/io/feather_format.py
+++ b/pandas/io/feather_format.py
@@ -15,6 +15,16 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs):
     ----------
     df : DataFrame
     path : string file path, or file-like object
+
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc., if using a URL that will
+        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+        will be raised if providing this argument with a local path or
+        a file-like buffer. See the fsspec and backend storage implementation
+        docs for the set of allowed keys and values
+
+        .. versionadded:: 1.1.0
     **kwargs :
         Additional keywords passed to `pyarrow.feather.write_feather`.
 
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 3f10c4694d10f..119ba47096df9 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -516,6 +516,16 @@ def read_json(
 
         .. versionadded:: 1.1
 
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc., if using a URL that will
+        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+        will be raised if providing this argument with a local path or
+        a file-like buffer. See the fsspec and backend storage implementation
+        docs for the set of allowed keys and values
+
+        .. versionadded:: 1.1.0
+
     Returns
     -------
     Series or DataFrame
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 7a66ca1b19587..6f0c6caaeb385 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -110,7 +110,9 @@ def write(
             kwargs["filesystem"] = fs
         else:
             if storage_options:
-                raise ValueError("storage_options passed with non-fsspec URL")
+                raise ValueError(
+                    "storage_options passed with file object or non-fsspec file path"
+                )
             path = _expand_user(path)
         if partition_cols is not None:
             # writes to multiple files under the given path
@@ -282,6 +284,16 @@ def to_parquet(
 
         .. versionadded:: 0.24.0
 
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc., if using a URL that will
+        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+        will be raised if providing this argument with a local path or
+        a file-like buffer. See the fsspec and backend storage implementation
+        docs for the set of allowed keys and values
+
+        .. versionadded:: 1.1.0
+
     kwargs
         Additional keyword arguments passed to the engine
     """
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index 44b15391da6a4..6e91389d567a8 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -43,6 +43,16 @@ def to_pickle(
         protocol parameter is equivalent to setting its value to
         HIGHEST_PROTOCOL.
 
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc., if using a URL that will
+        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+        will be raised if providing this argument with a local path or
+        a file-like buffer. See the fsspec and backend storage implementation
+        docs for the set of allowed keys and values
+
+        .. versionadded:: 1.1.0
+
     .. [1] https://docs.python.org/3/library/pickle.html
 
     See Also
@@ -127,6 +137,16 @@ def read_pickle(
         compression) If 'infer' and 'path_or_url' is not path-like, then use
         None (= no decompression).
 
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc., if using a URL that will
+        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+        will be raised if providing this argument with a local path or
+        a file-like buffer. See the fsspec and backend storage implementation
+        docs for the set of allowed keys and values
+
+        .. versionadded:: 1.1.0
+
     Returns
     -------
     unpickled : same type as object stored in file
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index cc59343fc0534..7bb83a07e5f16 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1952,6 +1952,16 @@ def _open_file_binary_write(
     compression : {str, dict, None}
         The compression method to use.
 
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc., if using a URL that will
+        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+        will be raised if providing this argument with a local path or
+        a file-like buffer. See the fsspec and backend storage implementation
+        docs for the set of allowed keys and values
+
+        .. versionadded:: 1.1.0
+
     Returns
     -------
     file : file-like object
@@ -2167,6 +2177,16 @@ class StataWriter(StataParser):
 
         .. versionadded:: 1.1.0
 
+    storage_options : dict, optional
+        Extra options that make sense for a particular storage connection, e.g.
+        host, port, username, password, etc., if using a URL that will
+        be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+        will be raised if providing this argument with a local path or
+        a file-like buffer. See the fsspec and backend storage implementation
+        docs for the set of allowed keys and values
+
+        .. versionadded:: 1.1.0
+
     Returns
     -------
     writer : StataWriter instance
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index 09d82244ca13f..4282884885f4d 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -1,3 +1,4 @@
+import io
 import numpy as np
 import pytest
 
@@ -204,3 +205,15 @@ def test_markdown_options(fsspectest):
     df.to_markdown("testmem://afile", storage_options={"test": "md_write"})
     assert fsspectest.test[0] == "md_write"
     assert fsspectest.cat("afile")
+
+
+def test_non_fsspec_options():
+    with pytest.raises(ValueError, match="storage_options"):
+        read_csv("localfile", storage_options={"a": True})
+    with pytest.raises(ValueError, match="storage_options"):
+        # separate test for parquet, which has a different code path
+        read_parquet("localfile", storage_options={"a": True})
+    by = io.BytesIO()
+
+    with pytest.raises(ValueError, match="storage_options"):
+        read_csv(by, storage_options={"a": True})
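The user-visible behavior the new guards produce, sketched (this mirrors the
``test_non_fsspec_options`` test added above; the error text matches the
strings introduced in this patch):

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [0]})
    try:
        df.to_csv("local.csv", storage_options={"anon": True})
    except ValueError as err:
        # "storage_options passed with file object or non-fsspec file path"
        print(err)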

From 1b8637e78dfb8041d523939bac93794872245a42 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 24 Jul 2020 14:04:08 -0400
Subject: [PATCH 17/27] fix that

---
 pandas/io/parquet.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 6f0c6caaeb385..46fc18868b1ab 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -141,6 +141,8 @@ def read(
             fs, path = fsspec.core.url_to_fs(path, **(storage_options or {}))
             should_close = False
         else:
+            if storage_options:
+                raise ValueError("storage_options passed with buffer or non-fsspec filepath")
             fs = kwargs.pop("filesystem", None)
             should_close = False
             path = _expand_user(path)

From bbcef17f0f6d68afe39487b43fb17bce4a8ca098 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 24 Jul 2020 14:04:21 -0400
Subject: [PATCH 18/27] black

---
 pandas/io/parquet.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 46fc18868b1ab..1188574519a5c 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -142,7 +142,9 @@ def read(
             should_close = False
         else:
             if storage_options:
-                raise ValueError("storage_options passed with buffer or non-fsspec filepath")
+                raise ValueError(
+                    "storage_options passed with buffer or non-fsspec filepath"
+                )
             fs = kwargs.pop("filesystem", None)
             should_close = False
             path = _expand_user(path)

From a18686cba287750e6aa260bbdaee8d09e89ae785 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 24 Jul 2020 14:32:51 -0400
Subject: [PATCH 19/27] fix it again

---
 pandas/tests/io/test_fsspec.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index 4282884885f4d..c05aa8c0db4d4 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -207,6 +207,7 @@ def test_markdown_options(fsspectest):
     assert fsspectest.cat("afile")
 
 
+@td.skip_if_no("pyarrow")
 def test_non_fsspec_options():
     with pytest.raises(ValueError, match="storage_options"):
         read_csv("localfile", storage_options={"a": True})

From fa656cb14b9d92fbda14788247d0cad2a7c99a47 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 24 Jul 2020 15:08:10 -0400
Subject: [PATCH 20/27] more lint

---
 pandas/core/series.py          | 2 +-
 pandas/tests/io/test_fsspec.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/core/series.py b/pandas/core/series.py
index ef3be854bc3bb..b6755364bf5e6 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1421,7 +1421,7 @@ def to_string(
     def to_markdown(
         self,
         buf: Optional[IO[str]] = None,
-        mode: Optional[str] = None,
+        mode: str = "wt",
         index: bool = True,
         **kwargs,
     ) -> Optional[str]:
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index c05aa8c0db4d4..577822dd90bae 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -1,4 +1,5 @@
 import io
+
 import numpy as np
 import pytest
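With the default mode now "wt" on both DataFrame and Series, ``to_markdown``
either returns a string or writes text to the given path. A sketch of both
paths (requires the optional ``tabulate`` dependency, as the test markers
above note; ``animals.md`` is an illustrative local filename):

.. code-block:: python

    import pandas as pd

    s = pd.Series(["elk", "pig", "dog"], name="animal")
    print(s.to_markdown())        # buf=None: the table is returned as a string
    s.to_markdown("animals.md")   # a path: opened with mode="wt" and written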

From a79a274be7b8c70048a035c089a6445bff2b1978 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Mon, 27 Jul 2020 11:12:00 -0400
Subject: [PATCH 21/27] Requested changes

---
 doc/source/whatsnew/v1.1.0.rst | 2 +-
 pandas/core/series.py          | 2 +-
 pandas/io/json/_json.py        | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 788a1465bde04..ef6a5ad92904e 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -269,7 +269,7 @@ Many read/write functions have acquired the `storage_options` optional argument,
 to pass a dictionary of parameters to the storage backend. This allows, for
 example, for passing credentials to S3 and GCS storage. The details of what
 parameters can be passed to which backends can be found in the documentation
-of the individual storage backends.
+of the individual storage backends (linked from the fsspec docs).
 
 .. _Azure Datalake and Blob: https://github.com/dask/adlfs
 
 .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/
diff --git a/pandas/core/series.py b/pandas/core/series.py
index b6755364bf5e6..2d7a571335510 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -1435,7 +1435,7 @@ def to_markdown(
         buf : str, Path or StringIO-like, optional, default None
             Buffer to write to. If None, the output is returned as a string.
         mode : str, optional
-            Mode in which file is opened.
+            Mode in which file is opened, "wt" by default.
         index : bool, optional, default True
             Add index (row) labels.
 
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 119ba47096df9..22ff5957e5413 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -56,7 +56,7 @@ def to_json(
     path_or_buf, _, _, should_close = get_filepath_or_buffer(
         path_or_buf,
         compression=compression,
-        mode="w",
+        mode="wt",
         storage_options=storage_options,
     )

From e99f8ed37f977403e5bfae32efc8d3e4fde5f444 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Wed, 29 Jul 2020 10:06:30 -0400
Subject: [PATCH 22/27] Update versions

---
 doc/source/whatsnew/v1.1.0.rst |  6 ------
 doc/source/whatsnew/v1.2.0.rst | 12 +++++++++++-
 pandas/core/frame.py           |  4 ++--
 pandas/core/generic.py         |  8 +++++---
 pandas/io/common.py            |  2 +-
 pandas/io/feather_format.py    |  3 ++-
 pandas/io/json/_json.py        |  2 +-
 pandas/io/parquet.py           |  2 +-
 pandas/io/pickle.py            |  4 ++--
 pandas/io/stata.py             |  4 ++--
 10 files changed, 27 insertions(+), 20 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index bf8b09839f442..a49b29d691692 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -265,12 +265,6 @@ SSH, FTP, dropbox and github. For docs and capabilities, see the `fsspec docs`_.
 The existing capability to interface with S3 and GCS will be unaffected by this
 change, as ``fsspec`` will still bring in the same packages as before.
 
-Many read/write functions have acquired the `storage_options` optional argument,
-to pass a dictionary of parameters to the storage backend. This allows, for
-example, for passing credentials to S3 and GCS storage. The details of what
-parameters can be passed to which backends can be found in the documentation
-of the individual storage backends (linked from the fsspec docs).
-
 .. _Azure Datalake and Blob: https://github.com/dask/adlfs
 
 .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 2066858e5de86..d7116df8904f3 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -13,6 +13,16 @@ including other versions of pandas.
 Enhancements
 ~~~~~~~~~~~~
 
+Passing arguments to fsspec backends
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Many read/write functions have acquired the `storage_options` optional argument,
+to pass a dictionary of parameters to the storage backend. This allows, for
+example, for passing credentials to S3 and GCS storage. The details of what
+parameters can be passed to which backends can be found in the documentation
+of the individual storage backends (linked from the fsspec docs).
+
+
 .. _whatsnew_120.enhancements.other:
 
 Other enhancements
@@ -165,4 +175,4 @@ Other
 .. _whatsnew_120.contributors:
 
 Contributors
-~~~~~~~~~~~~
\ No newline at end of file
+~~~~~~~~~~~~
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index ddd2f7956e699..987e89693c1bb 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2140,7 +2140,7 @@ def to_stata(
             a file-like buffer. See the fsspec and backend storage implementation
             docs for the set of allowed keys and values
 
-            .. versionadded:: 1.1.0
+            .. versionadded:: 1.2.0
 
         Raises
         ------
@@ -2347,7 +2347,7 @@ def to_parquet(
             a file-like buffer. See the fsspec and backend storage implementation
             docs for the set of allowed keys and values
 
-            .. versionadded:: 1.1.0
+            .. versionadded:: 1.2.0
 
         **kwargs
             Additional arguments passed to the parquet library. See
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index f4854ece0dea4..544477cc10a59 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -2134,7 +2134,7 @@ def to_json(
             a file-like buffer. See the fsspec and backend storage implementation
             docs for the set of allowed keys and values
 
-            .. versionadded:: 1.1.0
+            .. versionadded:: 1.2.0
 
         Returns
         -------
@@ -2658,7 +2658,7 @@ def to_pickle(
             a file-like buffer. See the fsspec and backend storage implementation
             docs for the set of allowed keys and values
 
-            .. versionadded:: 1.1.0
+            .. versionadded:: 1.2.0
 
         See Also
         --------
@@ -3139,6 +3139,8 @@ def to_csv(
             See the errors argument for :func:`open` for a full
             list of options.
 
+            .. versionadded:: 1.1.0
+
         storage_options : dict, optional
             Extra options that make sense for a particular storage connection, e.g.
             host, port, username, password, etc., if using a URL that will
@@ -3147,7 +3149,7 @@ def to_csv(
             a file-like buffer. See the fsspec and backend storage implementation
             docs for the set of allowed keys and values
 
-            .. versionadded:: 1.1.0
+            .. versionadded:: 1.2.0
 
         Returns
         -------
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 880a4f01be175..7387867603b12 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -176,7 +176,7 @@ def get_filepath_or_buffer(
         a file-like buffer. See the fsspec and backend storage implementation
         docs for the set of allowed keys and values
 
-        .. versionadded:: 1.1.0
+        .. versionadded:: 1.2.0
 
     Returns
     -------
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
index f6760bb3c772f..2c664e73b9463 100644
--- a/pandas/io/feather_format.py
+++ b/pandas/io/feather_format.py
@@ -24,7 +24,8 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs):
         a file-like buffer. See the fsspec and backend storage implementation
         docs for the set of allowed keys and values
 
-        .. versionadded:: 1.1.0
+        .. versionadded:: 1.2.0
+
     **kwargs :
         Additional keywords passed to `pyarrow.feather.write_feather`.
 
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 22ff5957e5413..080d6ced677b2 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -524,7 +524,7 @@ def read_json(
         a file-like buffer. See the fsspec and backend storage implementation
         docs for the set of allowed keys and values
 
-        .. versionadded:: 1.1.0
+        .. versionadded:: 1.2.0
 
     Returns
     -------
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 1188574519a5c..1101261f25184 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -296,7 +296,7 @@ def to_parquet(
         a file-like buffer. See the fsspec and backend storage implementation
         docs for the set of allowed keys and values
 
-        .. versionadded:: 1.1.0
+        .. versionadded:: 1.2.0
 
     kwargs
         Additional keyword arguments passed to the engine
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index 6e91389d567a8..3aecc9c91d189 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -51,7 +51,7 @@ def to_pickle(
         a file-like buffer. See the fsspec and backend storage implementation
         docs for the set of allowed keys and values
 
-        .. versionadded:: 1.1.0
+        .. versionadded:: 1.2.0
 
     .. [1] https://docs.python.org/3/library/pickle.html
 
@@ -145,7 +145,7 @@ def read_pickle(
         a file-like buffer. See the fsspec and backend storage implementation
         docs for the set of allowed keys and values
 
-        .. versionadded:: 1.1.0
+        .. versionadded:: 1.2.0
 
     Returns
     -------
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index 7bb83a07e5f16..ef5c9b7e1ab83 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -1960,7 +1960,7 @@ def _open_file_binary_write(
         a file-like buffer. See the fsspec and backend storage implementation
         docs for the set of allowed keys and values
 
-        .. versionadded:: 1.1.0
+        .. versionadded:: 1.2.0
 
     Returns
     -------
@@ -2185,7 +2185,7 @@ class StataWriter(StataParser):
         a file-like buffer. See the fsspec and backend storage implementation
         docs for the set of allowed keys and values
 
-        .. versionadded:: 1.1.0
+        .. versionadded:: 1.2.0
 
     Returns
     -------

From 38a8330ac8fd12610cd7f29dce4672325cd23eef Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Thu, 6 Aug 2020 10:53:59 -0400
Subject: [PATCH 23/27] Add docs, links and ensure fsspec test teardown

---
 doc/source/user_guide/io.rst   | 49 +++++++++++++++++++++++++++++-----
 doc/source/whatsnew/v1.2.0.rst |  6 +++--
 pandas/conftest.py             |  8 ++++--
 pandas/core/frame.py           |  2 +-
 pandas/core/series.py          | 16 ++++++++++-
 pandas/io/parquet.py           |  4 +++
 pandas/tests/io/test_fsspec.py |  4 +++
 7 files changed, 76 insertions(+), 13 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index d4be9d802d697..4079165ab22fc 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1635,26 +1635,61 @@ python engine is selected explicitly using ``engine='python'``.
 Reading remote files
 ''''''''''''''''''''
 
-You can pass in a URL to a CSV file:
+You can pass in a URL to read or write remote files to many of Pandas' IO
+functions - the following example shows reading a CSV file:
 
 .. code-block:: python
 
     df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item',
                      sep='\t')
 
-S3 URLs are handled as well but require installing the `S3Fs
+All URLs which are not local files or HTTP(s) are handled by
+`fsspec`_, if installed, and its various filesystem implementations
+(including Amazon S3, Google Cloud, SSH, FTP, webHDFS...).
+Some of these implementations will require additional packages to be
+installed, for example
+S3 URLs require the `s3fs
 <https://pypi.org/project/s3fs/>`_ library:
 
 .. code-block:: python
 
-    df = pd.read_csv('s3://pandas-test/tips.csv')
+    df = pd.read_json('s3://pandas-test/adatafile.json')
 
-If your S3 bucket requires credentials you will need to set them as environment
-variables or in the ``~/.aws/credentials`` config file, refer to the `S3Fs
-documentation on credentials
-<https://s3fs.readthedocs.io/en/latest/#credentials>`_.
+When dealing with remote storage systems, you might need
+extra configuration with environment variables or config files in
+special locations. For example, to access data in your S3 bucket,
+you will need to define credentials in one of the several ways listed in
+the `S3Fs documentation
+<https://s3fs.readthedocs.io/en/latest/#credentials>`_. The same is true
+for several of the storage backends, and you should follow the links
+at `fsimpl`_ for implementations not included in the main ``fsspec``
+distribution.
+
+You can also pass parameters directly to the backend driver. For example,
+if you do *not* have S3 credentials, you can still access public data by
+specifying an anonymous connection, such as
+
+.. code-block:: python
+
+    pd.read_csv("s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013"
+                "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv",
+                storage_options={"anon": True})
+
+``fsspec`` also allows complex URLs, for accessing data in compressed
+archives, local caching of files, and more. To locally cache the above
+example, you would modify the call to
+
+    pd.read_csv("simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013"
+                "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv",
+                storage_options={"s3": {"anon": True}})
+
+where we specify that the "anon" parameter is meant for the "s3" part of
+the implementation, not to the caching implementation. Note that this caches to a temporary
+directory for the duration of the session only, but you can also specify
+a permanent store.
+
+.. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/
+.. _fsimpl:: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
 
 Writing out data
 ''''''''''''''''
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index d7116df8904f3..d994263efedc5 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -20,9 +20,11 @@ Many read/write functions have acquired the `storage_options` optional argument,
 to pass a dictionary of parameters to the storage backend. This allows, for
 example, for passing credentials to S3 and GCS storage. The details of what
 parameters can be passed to which backends can be found in the documentation
-of the individual storage backends (linked from the fsspec docs).
-
+of the individual storage backends (detailed from the fsspec docs for
+`builtin implementations`_ and linked to `external ones`_).
 
+.. _builtin implementations: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
+.. _external ones: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
 
 .. _whatsnew_120.enhancements.other:
 
 Other enhancements
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 52a1a2678b6f6..620f5686a319d 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1231,6 +1231,7 @@ def fsspectest():
     pytest.importorskip("fsspec")
     from fsspec.implementations.memory import MemoryFileSystem
     from fsspec import register_implementation
+    from fsspec.registry import _registry as registry
 
     class TestMemoryFS(MemoryFileSystem):
         protocol = "testmem"
@@ -1240,5 +1241,8 @@ def __init__(self, **kwargs):
         self.test[0] = kwargs.pop("test", None)
         super().__init__(**kwargs)
 
-    register_implementation("testmem", TestMemoryFS, True)
-    return TestMemoryFS()
+    register_implementation("testmem", TestMemoryFS, clobber=True)
+    yield TestMemoryFS()
+    registry.pop("testmem", None)
+    TestMemoryFS.test[0] = None
+    TestMemoryFS.store.clear()
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index 987e89693c1bb..1cc1790939e19 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -2138,7 +2138,7 @@ def to_stata(
             be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
             will be raised if providing this argument with a local path or
             a file-like buffer. See the fsspec and backend storage implementation
-            docs for the set of allowed keys and values
+            docs for the set of allowed keys and values.
 
             .. versionadded:: 1.2.0
 
diff --git a/pandas/core/series.py b/pandas/core/series.py
index 2d7a571335510..2d613f9c65bd5 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -9,6 +9,7 @@
     TYPE_CHECKING,
     Any,
     Callable,
+    Dict,
     Iterable,
     List,
     Optional,
@@ -1423,6 +1424,7 @@ def to_markdown(
         buf: Optional[IO[str]] = None,
         mode: str = "wt",
         index: bool = True,
+        storage_options: Optional[Dict[str, Any]] = None,
         **kwargs,
     ) -> Optional[str]:
         """
@@ -1441,6 +1443,16 @@ def to_markdown(
 
             .. versionadded:: 1.1.0
 
+        storage_options : dict, optional
+            Extra options that make sense for a particular storage connection, e.g.
+            host, port, username, password, etc., if using a URL that will
+            be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+            will be raised if providing this argument with a local path or
+            a file-like buffer. See the fsspec and backend storage implementation
+            docs for the set of allowed keys and values
+
+            .. versionadded:: 1.2.0
+
         **kwargs
             These parameters will be passed to `tabulate \
                 <https://pypi.org/project/tabulate>`_.
@@ -1476,7 +1488,9 @@ def to_markdown(
         | 3  | quetzal  |
         +----+----------+
         """
-        return self.to_frame().to_markdown(buf, mode, index, **kwargs)
+        return self.to_frame().to_markdown(
+            buf, mode, index, storage_options=storage_options, **kwargs
+        )
 
     # ----------------------------------------------------------------------
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 1101261f25184..27563c59dab8b 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -205,6 +205,10 @@ def write(
                 path, "wb", **(storage_options or {})
             ).open()
         else:
+            if storage_options:
+                raise ValueError(
+                    "storage_options passed with file object or non-fsspec file path"
+                )
             path, _, _, _ = get_filepath_or_buffer(path)
 
         with catch_warnings(record=True):
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index 577822dd90bae..406d529cfdc20 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -219,3 +219,7 @@ def test_non_fsspec_options():
 
     with pytest.raises(ValueError, match="storage_options"):
         read_csv(by, storage_options={"a": True})
+
+    df = DataFrame({"a": [0]})
+    with pytest.raises(ValueError, match="storage_options"):
+        df.to_parquet("nonfsspecpath", storage_options={"a": True})
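The nested-dict routing the new docs describe comes straight from fsspec's
chained-URL machinery and can be exercised outside pandas. A sketch, assuming
only that fsspec is installed; "memory" stands in for a real remote backend:

.. code-block:: python

    import fsspec

    # with "::"-chained URLs, options are grouped under the protocol's name
    with fsspec.open("memory://demo.txt", mode="wt") as f:
        f.write("hello")

    with fsspec.open("simplecache::memory://demo.txt", mode="rt") as f:
        print(f.read())  # read back through a local caching layer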

From 32cf2043cc3da8790cc63ec468351b44d7859e1c Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Thu, 6 Aug 2020 12:03:03 -0400
Subject: [PATCH 24/27] sort and remove colon

---
 doc/source/user_guide/io.rst | 2 +-
 pandas/conftest.py           | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 4079165ab22fc..44753554a60cf 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1689,7 +1689,7 @@ directory for the duration of the session only, but you can also specify
 a permanent store.
 
 .. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/
-.. _fsimpl:: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
+.. _fsimpl: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
 
 Writing out data
 ''''''''''''''''
diff --git a/pandas/conftest.py b/pandas/conftest.py
index 620f5686a319d..10124b09ae249 100644
--- a/pandas/conftest.py
+++ b/pandas/conftest.py
@@ -1229,8 +1229,8 @@ def sort_by_key(request):
 @pytest.fixture()
 def fsspectest():
     pytest.importorskip("fsspec")
-    from fsspec.implementations.memory import MemoryFileSystem
     from fsspec import register_implementation
+    from fsspec.implementations.memory import MemoryFileSystem
     from fsspec.registry import _registry as registry
 
     class TestMemoryFS(MemoryFileSystem):

From f7f086c398c63044d4bfba3904de07403fdcc618 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 7 Aug 2020 16:07:11 -0400
Subject: [PATCH 25/27] More suggestions

Mainly defining the StorageOptions type

---
 doc/source/user_guide/io.rst   | 10 ++++++++--
 doc/source/whatsnew/v1.2.0.rst |  3 ++-
 pandas/_typing.py              |  3 +++
 pandas/core/frame.py           |  7 ++++---
 pandas/core/generic.py         |  7 ++++---
 pandas/core/series.py          |  4 ++--
 pandas/io/common.py            |  4 ++--
 pandas/io/formats/csvs.py      |  6 +++---
 pandas/io/json/_json.py        |  8 ++++----
 pandas/io/parquet.py           | 20 ++++++--------------
 pandas/io/pickle.py            |  8 ++++----
 pandas/io/stata.py             | 14 +++++++-------
 12 files changed, 49 insertions(+), 45 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index 3d3ecbab95fc6..a172d18b20d50 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1649,8 +1649,10 @@ options include:
 Specifying any of the above options will produce a ``ParserWarning`` unless the
 python engine is selected explicitly using ``engine='python'``.
 
-Reading remote files
-''''''''''''''''''''
+.. _io.remote:
+
+Reading/writing remote files
+''''''''''''''''''''''''''''
 
 You can pass in a URL to read or write remote files to many of Pandas' IO
 functions - the following example shows reading a CSV file:
@@ -1686,6 +1688,8 @@ You can also pass parameters directly to the backend driver. For example,
 if you do *not* have S3 credentials, you can still access public data by
 specifying an anonymous connection, such as
 
+.. versionadded:: 1.2.0
+
 .. code-block:: python
 
     pd.read_csv("s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013"
@@ -1696,6 +1700,8 @@ specifying an anonymous connection, such as
 archives, local caching of files, and more. To locally cache the above
 example, you would modify the call to
 
+.. code-block:: python
+
     pd.read_csv("simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013"
                 "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv",
                 storage_options={"s3": {"anon": True}})
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 023da9dfeb9d4..c0182654e97fd 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -21,7 +21,8 @@ to pass a dictionary of parameters to the storage backend. This allows, for
 example, for passing credentials to S3 and GCS storage. The details of what
 parameters can be passed to which backends can be found in the documentation
 of the individual storage backends (detailed from the fsspec docs for
-`builtin implementations`_ and linked to `external ones`_).
+`builtin implementations`_ and linked to `external ones`_). See
+Section :ref:`io.remote`.
 
 .. _builtin implementations: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
 .. _external ones: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
diff --git a/pandas/_typing.py b/pandas/_typing.py
index 76ec527e6e258..47a102ddc70e0 100644
--- a/pandas/_typing.py
+++ b/pandas/_typing.py
@@ -106,3 +106,6 @@
     List[AggFuncTypeBase],
     Dict[Label, Union[AggFuncTypeBase, List[AggFuncTypeBase]]],
 ]
+
+# for arbitrary kwargs passed during reading/writing files
+StorageOptions = Optional[Dict[str, Any]]
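The alias gives every IO signature a single shared spelling. Downstream code
can lean on the same trick; a sketch where ``read_thing`` is a made-up
function, not pandas API:

.. code-block:: python

    from typing import Any, Dict, Optional

    StorageOptions = Optional[Dict[str, Any]]

    def read_thing(path: str, storage_options: StorageOptions = None) -> bytes:
        # None means "local filesystem, no extra connection options"
        opts = storage_options or {}
        # a real reader would dispatch on the URL scheme using opts
        with open(path, "rb") as f:
            return f.read()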
diff --git a/pandas/core/frame.py b/pandas/core/frame.py
index e302d8e72f33a..39fcf1a3fd573 100644
--- a/pandas/core/frame.py
+++ b/pandas/core/frame.py
@@ -55,6 +55,7 @@
     Label,
     Level,
     Renamer,
+    StorageOptions,
     ValueKeyFunc,
 )
 from pandas.compat import PY37
@@ -2056,7 +2057,7 @@ def to_stata(
         version: Optional[int] = 114,
         convert_strl: Optional[Sequence[Label]] = None,
         compression: Union[str, Mapping[str, str], None] = "infer",
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
     ) -> None:
         """
         Export DataFrame object to Stata dta format.
@@ -2259,7 +2260,7 @@ def to_markdown(
         buf: Optional[Union[IO[str], str]] = None,
         mode: str = "wt",
         index: bool = True,
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
         **kwargs,
     ) -> Optional[str]:
         if "showindex" in kwargs:
@@ -2295,7 +2296,7 @@ def to_parquet(
         compression: Optional[str] = "snappy",
         index: Optional[bool] = None,
         partition_cols: Optional[List[str]] = None,
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
         **kwargs,
     ) -> None:
         """
diff --git a/pandas/core/generic.py b/pandas/core/generic.py
index af077e5cd04e3..5ac8a6b165171 100644
--- a/pandas/core/generic.py
+++ b/pandas/core/generic.py
@@ -40,6 +40,7 @@
     Label,
     Level,
     Renamer,
+    StorageOptions,
     TimedeltaConvertibleTypes,
     TimestampConvertibleTypes,
     ValueKeyFunc,
@@ -2042,7 +2043,7 @@ def to_json(
         compression: Optional[str] = "infer",
         index: bool_t = True,
         indent: Optional[int] = None,
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
     ) -> Optional[str]:
         """
         Convert the object to a JSON string.
@@ -2629,7 +2630,7 @@ def to_pickle(
         path,
         compression: Optional[str] = "infer",
         protocol: int = pickle.HIGHEST_PROTOCOL,
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
     ) -> None:
         """
         Pickle (serialize) object to file.
@@ -3044,7 +3045,7 @@ def to_csv(
         escapechar: Optional[str] = None,
         decimal: Optional[str] = ".",
         errors: str = "strict",
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
     ) -> Optional[str]:
         r"""
         Write object to a comma-separated values (csv) file.
diff --git a/pandas/core/series.py b/pandas/core/series.py
index ddeaf7fe9ec05..37558ed5e99a3 100644
--- a/pandas/core/series.py
+++ b/pandas/core/series.py
@@ -9,7 +9,6 @@
     TYPE_CHECKING,
     Any,
     Callable,
-    Dict,
     Iterable,
     List,
     Optional,
@@ -32,6 +31,7 @@
     FrameOrSeriesUnion,
     IndexKeyFunc,
     Label,
+    StorageOptions,
     ValueKeyFunc,
 )
 from pandas.compat.numpy import function as nv
@@ -1425,7 +1425,7 @@ def to_markdown(
         buf: Optional[IO[str]] = None,
         mode: str = "wt",
         index: bool = True,
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
         **kwargs,
     ) -> Optional[str]:
         """
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 2129fca1f0897..9ac642e58b544 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -29,7 +29,7 @@
 )
 import zipfile
 
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import FilePathOrBuffer, StorageOptions
 from pandas.compat import _get_lzma_file, _import_lzma
 from pandas.compat._optional import import_optional_dependency
 
@@ -162,7 +162,7 @@ def get_filepath_or_buffer(
     encoding: Optional[str] = None,
     compression: Optional[str] = None,
     mode: Optional[str] = None,
-    storage_options: Optional[Dict[str, Any]] = None,
+    storage_options: StorageOptions = None,
 ):
     """
     If the filepath_or_buffer is a url, translate and return the buffer.
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 86b3f4801aed7..6eceb94387171 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -5,13 +5,13 @@
 import csv as csvlib
 from io import StringIO, TextIOWrapper
 import os
-from typing import Any, Dict, Hashable, List, Mapping, Optional, Sequence, Union
+from typing import Hashable, List, Mapping, Optional, Sequence, Union
 import warnings
 
 import numpy as np
 
 from pandas._libs import writers as libwriters
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import FilePathOrBuffer, StorageOptions
 
 from pandas.core.dtypes.generic import (
     ABCDatetimeIndex,
@@ -53,7 +53,7 @@ def __init__(
         doublequote: bool = True,
         escapechar: Optional[str] = None,
         decimal=".",
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
     ):
 
         self.obj = obj
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 86d37ecaeaf6f..0d2b351926343 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -3,13 +3,13 @@
 from io import BytesIO, StringIO
 from itertools import islice
 import os
-from typing import Any, Callable, Dict, Optional, Type
+from typing import Any, Callable, Optional, Type
 
 import numpy as np
 
 import pandas._libs.json as json
 from pandas._libs.tslibs import iNaT
-from pandas._typing import JSONSerializable
+from pandas._typing import JSONSerializable, StorageOptions
 from pandas.errors import AbstractMethodError
 
 from pandas.util._decorators import deprecate_kwarg, deprecate_nonkeyword_arguments
@@ -44,7 +44,7 @@ def to_json(
     compression: Optional[str] = "infer",
     index: bool = True,
     indent: int = 0,
-    storage_options: Optional[Dict[str, Any]] = None,
+    storage_options: StorageOptions = None,
 ):
 
     if not index and orient not in ["split", "table"]:
@@ -371,7 +371,7 @@ def read_json(
     chunksize: Optional[int] = None,
    compression="infer",
     nrows: Optional[int] = None,
-    storage_options: Optional[Dict[str, Any]] = None,
+    storage_options: StorageOptions = None,
 ):
     """
     Convert a JSON string to pandas object.
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 27563c59dab8b..7f0eef039a1e8 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -3,7 +3,7 @@
 from typing import Any, AnyStr, Dict, List, Optional
 from warnings import catch_warnings
 
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import FilePathOrBuffer, StorageOptions
 from pandas.compat._optional import import_optional_dependency
 from pandas.errors import AbstractMethodError
 
@@ -89,7 +89,7 @@ def write(
         path: FilePathOrBuffer[AnyStr],
         compression: Optional[str] = "snappy",
         index: Optional[bool] = None,
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
         partition_cols: Optional[List[str]] = None,
         **kwargs,
     ):
@@ -128,11 +128,7 @@ def write(
         self.api.parquet.write_table(table, path, compression=compression, **kwargs)
 
     def read(
-        self,
-        path,
-        columns=None,
-        storage_options: Optional[Dict[str, Any]] = None,
-        **kwargs,
+        self, path, columns=None, storage_options: StorageOptions = None, **kwargs,
     ):
         if is_fsspec_url(path) and "filesystem" not in kwargs:
             import_optional_dependency("fsspec")
@@ -178,7 +174,7 @@ def write(
         compression="snappy",
         index=None,
         partition_cols=None,
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
         **kwargs,
     ):
         self.validate_dataframe(df)
@@ -222,11 +218,7 @@ def write(
         )
 
     def read(
-        self,
-        path,
-        columns=None,
-        storage_options: Optional[Dict[str, Any]] = None,
-        **kwargs,
+        self, path, columns=None, storage_options: StorageOptions = None, **kwargs,
     ):
         if is_fsspec_url(path):
             fsspec = import_optional_dependency("fsspec")
@@ -248,7 +240,7 @@ def to_parquet(
     engine: str = "auto",
     compression: Optional[str] = "snappy",
     index: Optional[bool] = None,
-    storage_options: Optional[Dict[str, Any]] = None,
+    storage_options: StorageOptions = None,
     partition_cols: Optional[List[str]] = None,
     **kwargs,
 ):
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index 3aecc9c91d189..549d55e65546d 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -1,9 +1,9 @@
 """ pickle compat """
 import pickle
-from typing import Any, Dict, Optional
+from typing import Any, Optional
 import warnings
 
-from pandas._typing import FilePathOrBuffer
+from pandas._typing import FilePathOrBuffer, StorageOptions
 from pandas.compat import pickle_compat as pc
 
 from pandas.io.common import get_filepath_or_buffer, get_handle
@@ -14,7 +14,7 @@ def to_pickle(
     filepath_or_buffer: FilePathOrBuffer,
     compression: Optional[str] = "infer",
     protocol: int = pickle.HIGHEST_PROTOCOL,
-    storage_options: Optional[Dict[str, Any]] = None,
+    storage_options: StorageOptions = None,
 ):
     """
     Pickle (serialize) object to file.
@@ -113,7 +113,7 @@ def to_pickle(
 def read_pickle(
     filepath_or_buffer: FilePathOrBuffer,
     compression: Optional[str] = "infer",
-    storage_options: Optional[Dict[str, Any]] = None,
+    storage_options: StorageOptions = None,
 ):
     """
     Load pickled pandas object (or any object) from file.
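End to end, the typed plumbing above supports roundtrips through any fsspec
URL; the in-memory filesystem makes a dependency-light demo. A sketch that
assumes pandas with this series applied plus fsspec:

.. code-block:: python

    import pandas as pd

    df = pd.DataFrame({"a": [1, 2]})
    df.to_pickle("memory://tmp/df.pkl")      # fsspec's built-in in-memory backend
    out = pd.read_pickle("memory://tmp/df.pkl")
    pd.testing.assert_frame_equal(df, out)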
diff --git a/pandas/io/stata.py b/pandas/io/stata.py
index ab6517b07f4c4..7a25617885839 100644
--- a/pandas/io/stata.py
+++ b/pandas/io/stata.py
@@ -35,7 +35,7 @@
 
 from pandas._libs.lib import infer_dtype
 from pandas._libs.writers import max_len_string_array
-from pandas._typing import FilePathOrBuffer, Label
+from pandas._typing import FilePathOrBuffer, Label, StorageOptions
 from pandas.util._decorators import Appender
 
 from pandas.core.dtypes.common import (
@@ -1035,7 +1035,7 @@ def __init__(
         columns: Optional[Sequence[str]] = None,
         order_categoricals: bool = True,
         chunksize: Optional[int] = None,
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
     ):
         super().__init__()
         self.col_sizes: List[int] = []
@@ -1910,7 +1910,7 @@ def read_stata(
     order_categoricals: bool = True,
     chunksize: Optional[int] = None,
     iterator: bool = False,
-    storage_options: Optional[Dict[str, Any]] = None,
+    storage_options: StorageOptions = None,
 ) -> Union[DataFrame, StataReader]:
 
     reader = StataReader(
@@ -1939,7 +1939,7 @@ def read_stata(
 def _open_file_binary_write(
     fname: FilePathOrBuffer,
     compression: Union[str, Mapping[str, str], None],
-    storage_options: Optional[Dict[str, Any]] = None,
+    storage_options: StorageOptions = None,
 ) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]:
     """
     Open a binary file or no-op if file-like.
@@ -2238,7 +2238,7 @@ def __init__(
         data_label: Optional[str] = None,
         variable_labels: Optional[Dict[Label, str]] = None,
         compression: Union[str, Mapping[str, str], None] = "infer",
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
     ):
         super().__init__()
         self._convert_dates = {} if convert_dates is None else convert_dates
@@ -3121,7 +3121,7 @@ def __init__(
         variable_labels: Optional[Dict[Label, str]] = None,
         convert_strl: Optional[Sequence[Label]] = None,
         compression: Union[str, Mapping[str, str], None] = "infer",
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
     ):
         # Copy to new list since convert_strl might be modified later
        self._convert_strl: List[Label] = []
@@ -3526,7 +3526,7 @@ def __init__(
         convert_strl: Optional[Sequence[Label]] = None,
         version: Optional[int] = None,
         compression: Union[str, Mapping[str, str], None] = "infer",
-        storage_options: Optional[Dict[str, Any]] = None,
+        storage_options: StorageOptions = None,
     ):
         if version is None:
             version = 118 if data.shape[1] <= 32767 else 119

From b5138f0290e4728f2bc96de777958ae23e69807a Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 7 Aug 2020 16:24:11 -0400
Subject: [PATCH 26/27] More IO link

---
 doc/source/user_guide/io.rst | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index a172d18b20d50..b25ad8defdac8 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1681,7 +1681,8 @@ you will need to define credentials in one of the several ways listed in
 the `S3Fs documentation
 <https://s3fs.readthedocs.io/en/latest/#credentials>`_. The same is true
 for several of the storage backends, and you should follow the links
-at `fsimpl`_ for implementations not included in the main ``fsspec``
+at `fsimpl1`_ for implementations built into ``fsspec`` and `fsimpl2`_
+for those not included in the main ``fsspec``
 distribution.
 
 You can also pass parameters directly to the backend driver. For example,
@@ -1712,7 +1713,8 @@ directory for the duration of the session only, but you can also specify
 a permanent store.
 
 .. _fsspec: https://filesystem-spec.readthedocs.io/en/latest/
-.. _fsimpl: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
+.. _fsimpl1: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
+.. _fsimpl2: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
 
 Writing out data
 ''''''''''''''''

From afdc0309d15e28dcd88ba4646d29a3e91413a948 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Mon, 10 Aug 2020 09:44:13 -0400
Subject: [PATCH 27/27] doc code lint fix

---
 doc/source/user_guide/io.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst
index b25ad8defdac8..35403b5c8b66f 100644
--- a/doc/source/user_guide/io.rst
+++ b/doc/source/user_guide/io.rst
@@ -1703,8 +1703,8 @@ example, you would modify the call to
 
 .. code-block:: python
 
-    pd.read_csv("simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/SaKe2013"
-                "-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv",
+    pd.read_csv("simplecache::s3://ncei-wcsd-archive/data/processed/SH1305/18kHz/"
+                "SaKe2013-D20130523-T080854_to_SaKe2013-D20130523-T085643.csv",
                 storage_options={"s3": {"anon": True}})
 
 where we specify that the "anon" parameter is meant for the "s3" part of