Storage options #35381


Merged: 33 commits, Aug 10, 2020
Changes from 17 commits
6 changes: 6 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -265,6 +265,12 @@ SSH, FTP, dropbox and github. For docs and capabilities, see the `fsspec docs`_.
The existing capability to interface with S3 and GCS will be unaffected by this
change, as ``fsspec`` will still bring in the same packages as before.

Many read/write functions have acquired the ``storage_options`` optional argument,
to pass a dictionary of parameters to the storage backend. This allows, for
example, passing credentials to S3 and GCS storage. The details of which
parameters can be passed to which backends can be found in the documentation
Contributor: Is there a link from the fsspec docs to the storage backends?


of the individual storage backends.

.. _Azure Datalake and Blob: https://github.com/dask/adlfs

.. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/
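
To illustrate the whatsnew entry above, a minimal sketch of the new argument in use (not part of the diff; the bucket and credential placeholders are hypothetical, and "key"/"secret" are the parameter names accepted by s3fs):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
# Hypothetical bucket and credentials; the storage_options dict is
# forwarded to the fsspec backend (here s3fs) that opens the file.
df.to_csv(
    "s3://some-bucket/data.csv",
    storage_options={"key": "<access-key>", "secret": "<secret-key>"},
)
```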
18 changes: 18 additions & 0 deletions pandas/conftest.py
@@ -1224,3 +1224,21 @@ def sort_by_key(request):
    Tests None (no key) and the identity key.
    """
    return request.param


@pytest.fixture()
def fsspectest():
    pytest.importorskip("fsspec")
    from fsspec.implementations.memory import MemoryFileSystem
    from fsspec import register_implementation

    class TestMemoryFS(MemoryFileSystem):
        protocol = "testmem"
        test = [None]

        def __init__(self, **kwargs):
            self.test[0] = kwargs.pop("test", None)
            super().__init__(**kwargs)

    register_implementation("testmem", TestMemoryFS, True)
Contributor: Keyword for True?

Contributor: Do you need to deregister the implementation as a teardown? Does any state leak between tests?

Contributor Author: Not really, but done now anyway. The tests only check that .test[0] changes, always with a new value.

    return TestMemoryFS()
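
For context, a sketch of how a test might consume this fixture (the test name is illustrative, not part of this diff; the pattern mirrors the PR's fsspec tests):

```python
import pandas as pd


def test_to_csv_storage_options(fsspectest):
    # The fixture registers the "testmem" protocol, so to_csv routes the
    # storage_options dict to TestMemoryFS.__init__, which records the
    # "test" value in the class-level list inspected by the assertion.
    df = pd.DataFrame({"a": [0]})
    df.to_csv("testmem://afile.csv", index=False, storage_options={"test": "csv_write"})
    assert fsspectest.test[0] == "csv_write"
```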
16 changes: 13 additions & 3 deletions pandas/core/frame.py
@@ -2055,6 +2055,7 @@ def to_stata(
        version: Optional[int] = 114,
        convert_strl: Optional[Sequence[Label]] = None,
        compression: Union[str, Mapping[str, str], None] = "infer",
+        storage_options: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Export DataFrame object to Stata dta format.
@@ -2187,6 +2188,7 @@ def to_stata(
            write_index=write_index,
            variable_labels=variable_labels,
            compression=compression,
+            storage_options=storage_options,
            **kwargs,
        )
        writer.write_file()
@@ -2239,9 +2241,10 @@ def to_feather(self, path, **kwargs) -> None:
    )
    def to_markdown(
        self,
-        buf: Optional[IO[str]] = None,
-        mode: Optional[str] = None,
+        buf: Optional[Union[IO[str], str]] = None,
+        mode: Optional[str] = "wt",
Contributor: Why did this need to change?

Contributor Author: mode=None doesn't actually make sense. This is for writing, and most open commands (including the Python builtin open) would default to "rt" or "rb".

Member: If changing this, you could also remove Optional from the type annotation.

        index: bool = True,
+        storage_options: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> Optional[str]:
        if "showindex" in kwargs:
@@ -2259,9 +2262,14 @@ def to_markdown
        result = tabulate.tabulate(self, **kwargs)
        if buf is None:
            return result
-        buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode)
+        buf, _, _, should_close = get_filepath_or_buffer(
+            buf, mode=mode, storage_options=storage_options
+        )
        assert buf is not None  # Help mypy.
+        assert not isinstance(buf, str)
        buf.writelines(result)
+        if should_close:
+            buf.close()
        return None
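
To illustrate the signature change discussed above, a sketch (assuming the optional tabulate dependency is installed; the URL is illustrative): buf may now be a path string such as an fsspec URL, opened in text write mode ("wt"), and a handle opened here is closed again via should_close.

```python
import pandas as pd

df = pd.DataFrame({"animal": ["elk", "pig"], "count": [2, 3]})
# "memory://" is fsspec's built-in in-memory filesystem; the handle is
# opened by get_filepath_or_buffer and closed after writing.
df.to_markdown("memory://table.md")
```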

    @deprecate_kwarg(old_arg_name="fname", new_arg_name="path")
@@ -2272,6 +2280,7 @@ def to_parquet(
compression: Optional[str] = "snappy",
index: Optional[bool] = None,
partition_cols: Optional[List[str]] = None,
storage_options: Optional[Dict[str, Any]] = None,
**kwargs,
) -> None:
"""
@@ -2366,6 +2375,7 @@ def to_parquet(
            compression=compression,
            index=index,
            partition_cols=partition_cols,
+            storage_options=storage_options,
            **kwargs,
        )

13 changes: 12 additions & 1 deletion pandas/core/generic.py
@@ -2042,6 +2042,7 @@ def to_json(
        compression: Optional[str] = "infer",
        index: bool_t = True,
        indent: Optional[int] = None,
+        storage_options: Optional[Dict[str, Any]] = None,
    ) -> Optional[str]:
        """
        Convert the object to a JSON string.
@@ -2303,6 +2304,7 @@ def to_json(
            compression=compression,
            index=index,
            indent=indent,
+            storage_options=storage_options,
        )

    def to_hdf(
@@ -2617,6 +2619,7 @@ def to_pickle(
        path,
        compression: Optional[str] = "infer",
        protocol: int = pickle.HIGHEST_PROTOCOL,
+        storage_options: Optional[Dict[str, Any]] = None,
    ) -> None:
        """
        Pickle (serialize) object to file.
@@ -2670,7 +2673,13 @@ def to_pickle(
        """
        from pandas.io.pickle import to_pickle

-        to_pickle(self, path, compression=compression, protocol=protocol)
+        to_pickle(
+            self,
+            path,
+            compression=compression,
+            protocol=protocol,
+            storage_options=storage_options,
+        )

    def to_clipboard(
        self, excel: bool_t = True, sep: Optional[str] = None, **kwargs
@@ -3010,6 +3019,7 @@ def to_csv(
        escapechar: Optional[str] = None,
        decimal: Optional[str] = ".",
        errors: str = "strict",
+        storage_options: Optional[Dict[str, Any]] = None,
    ) -> Optional[str]:
        r"""
        Write object to a comma-separated values (csv) file.
@@ -3163,6 +3173,7 @@ def to_csv(
            doublequote=doublequote,
            escapechar=escapechar,
            decimal=decimal,
+            storage_options=storage_options,
        )
        formatter.save()
4 changes: 4 additions & 0 deletions pandas/io/common.py
@@ -180,6 +180,8 @@ def get_filepath_or_buffer(

    if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
        # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged
+        if storage_options:
+            raise ValueError("storage_options passed with non-fsspec URL")
        req = urlopen(filepath_or_buffer)
        content_encoding = req.headers.get("Content-Encoding", None)
        if content_encoding == "gzip":
@@ -234,6 +236,8 @@ def get_filepath_or_buffer(
        ).open()

        return file_obj, encoding, compression, True
+    elif storage_options:
+        raise ValueError("storage_options passed with non-fsspec URL")
Contributor: Is the "URL" here actually a user-provided buffer?

Contributor Author: A local file or HTTP URL; but I guess an already-open file instance would end up here too, worth checking.


    if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
        return _expand_user(filepath_or_buffer), None, compression, False
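
A sketch of the new guard's behavior (the file name is hypothetical): storage_options only make sense for fsspec-handled paths, so combining them with a plain local file raises.

```python
import pandas as pd

try:
    # read_json gains storage_options in this PR; a local path is not
    # handled by fsspec, so the new guard rejects the combination.
    pd.read_json("local_file.json", storage_options={"anon": True})
except ValueError as err:
    print(err)  # storage_options passed with non-fsspec URL
```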
14 changes: 9 additions & 5 deletions pandas/io/feather_format.py
@@ -4,10 +4,10 @@

from pandas import DataFrame, Int64Index, RangeIndex

-from pandas.io.common import get_filepath_or_buffer, stringify_path
+from pandas.io.common import get_filepath_or_buffer


-def to_feather(df: DataFrame, path, **kwargs):
+def to_feather(df: DataFrame, path, storage_options=None, **kwargs):
"""
Write a DataFrame to the binary Feather format.

@@ -23,7 +23,9 @@ def to_feather(df: DataFrame, path, **kwargs):
    import_optional_dependency("pyarrow")
    from pyarrow import feather

-    path = stringify_path(path)
+    path, _, _, should_close = get_filepath_or_buffer(
+        path, mode="wb", storage_options=storage_options
+    )

    if not isinstance(df, DataFrame):
        raise ValueError("feather only support IO with DataFrames")
@@ -64,7 +66,7 @@ def to_feather(df: DataFrame, path, **kwargs):
    feather.write_feather(df, path, **kwargs)


-def read_feather(path, columns=None, use_threads: bool = True):
+def read_feather(path, columns=None, use_threads: bool = True, storage_options=None):
"""
Load a feather-format object from the file path.

Expand Down Expand Up @@ -98,7 +100,9 @@ def read_feather(path, columns=None, use_threads: bool = True):
    import_optional_dependency("pyarrow")
    from pyarrow import feather

-    path, _, _, should_close = get_filepath_or_buffer(path)
+    path, _, _, should_close = get_filepath_or_buffer(
+        path, storage_options=storage_options
+    )

    df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads))
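Since the path now goes through get_filepath_or_buffer instead of stringify_path, fsspec URLs work for feather as well. A sketch, assuming pyarrow is installed ("memory://" is fsspec's built-in in-memory filesystem):

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})
df.to_feather("memory://frame.feather")  # opened via fsspec, mode "wb"
roundtrip = pd.read_feather("memory://frame.feather")
```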
9 changes: 7 additions & 2 deletions pandas/io/formats/csvs.py
@@ -5,7 +5,7 @@

import csv as csvlib
from io import StringIO
import os
-from typing import Hashable, List, Mapping, Optional, Sequence, Union
+from typing import Any, Dict, Hashable, List, Mapping, Optional, Sequence, Union
import warnings
from zipfile import ZipFile
@@ -54,6 +54,7 @@ def __init__(
        doublequote: bool = True,
        escapechar: Optional[str] = None,
        decimal=".",
+        storage_options: Optional[Dict[str, Any]] = None,
Contributor: Maybe add this to typing.py, e.g. StorageOptions = ....

Contributor Author: Up to you.

Contributor Author: I am leaving it for now, but can do as you suggest if you request it.

Contributor: Seeing as we are doing this in lots of places, I would add it.

    ):
        self.obj = obj

@@ -64,7 +65,11 @@ def __init__(
        compression, self.compression_args = get_compression_method(compression)

        self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer(
-            path_or_buf, encoding=encoding, compression=compression, mode=mode
+            path_or_buf,
+            encoding=encoding,
+            compression=compression,
+            mode=mode,
+            storage_options=storage_options,
        )
        self.sep = sep
        self.na_rep = na_rep
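A sketch of the alias suggested in the thread above (pandas did later add a StorageOptions alias to pandas._typing along these lines; the exact placement here is an assumption):

```python
# pandas/_typing.py (sketch)
from typing import Any, Dict, Optional

StorageOptions = Optional[Dict[str, Any]]
```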
18 changes: 14 additions & 4 deletions pandas/io/json/_json.py
@@ -3,7 +3,7 @@

from io import BytesIO, StringIO
from itertools import islice
import os
-from typing import Any, Callable, Optional, Type
+from typing import Any, Callable, Dict, Optional, Type

import numpy as np
@@ -44,6 +44,7 @@ def to_json(
    compression: Optional[str] = "infer",
    index: bool = True,
    indent: int = 0,
+    storage_options: Optional[Dict[str, Any]] = None,
):

    if not index and orient not in ["split", "table"]:
@@ -52,8 +53,11 @@
        )

    if path_or_buf is not None:
-        path_or_buf, _, _, _ = get_filepath_or_buffer(
-            path_or_buf, compression=compression, mode="w"
+        path_or_buf, _, _, should_close = get_filepath_or_buffer(
+            path_or_buf,
+            compression=compression,
+            mode="w",
Contributor: wt?

Contributor Author: I suppose? I didn't write this; it's due to style reformatting.

Contributor Author: Done.

+            storage_options=storage_options,
        )

    if lines and orient != "records":
@@ -97,6 +101,8 @@ def to_json(
        return s
    else:
        path_or_buf.write(s)
+        if should_close:
+            path_or_buf.close()
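
To illustrate the should_close bookkeeping (a sketch; the URL is illustrative): when to_json opens the target itself, for example from an fsspec URL, the handle is closed after writing, while a caller-supplied buffer is left open.

```python
import pandas as pd
from io import StringIO

df = pd.DataFrame({"a": [1]})
df.to_json("memory://frame.json")  # opened internally, closed via should_close

buf = StringIO()
df.to_json(buf)  # caller-owned buffer, should_close is False
assert not buf.closed
```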


class Writer:
@@ -364,6 +370,7 @@ def read_json(
    chunksize: Optional[int] = None,
    compression="infer",
    nrows: Optional[int] = None,
+    storage_options: Optional[Dict[str, Any]] = None,
):
    """
    Convert a JSON string to pandas object.
@@ -591,7 +598,10 @@ def read_json(

    compression = infer_compression(path_or_buf, compression)
    filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer(
-        path_or_buf, encoding=encoding, compression=compression
+        path_or_buf,
+        encoding=encoding,
+        compression=compression,
+        storage_options=storage_options,
    )

    json_reader = JsonReader(