
ENH: add fsspec support #34266


Merged

39 commits
94e717f
Add remote file io using fsspec.
Apr 14, 2020
fd7e072
Attempt refactor and clean
May 19, 2020
302ba13
Merge branch 'master' into feature/add-fsspec-support
May 20, 2020
9e6d3b2
readd and adapt s3/gcs tests
May 21, 2020
4564c8d
remove gc from test
May 21, 2020
0654537
Simpler is_fsspec
May 21, 2020
8d45cbb
add test
May 21, 2020
006e736
Answered most points
May 28, 2020
724ebd8
Implemented suggestions
May 28, 2020
9da1689
lint
May 28, 2020
a595411
Add versions info
May 29, 2020
6dd1e92
Update some deps
May 29, 2020
6e13df7
issue link syntax
May 29, 2020
3262063
More specific test versions
Jun 2, 2020
4bc2411
Account for alternate S3 protocols, and ignore type error
Jun 2, 2020
68644ab
Add comment to mypy ignore instruction
Jun 2, 2020
32bc586
more mypy
Jun 2, 2020
037ef2c
more black
Jun 2, 2020
c3c3075
Make storage_options a dict rather than swallowing kwargs
Jun 3, 2020
85d6452
More requested changes
Jun 5, 2020
263dd3b
Remove fsspec from locale tests
Jun 10, 2020
d0afbc3
tweak
Jun 10, 2020
6a587a5
Merge branch 'master' into feature/add-fsspec-support
Jun 10, 2020
b2992c1
Merge branch 'master' into feature/add-fsspec-support
Jun 11, 2020
9c03745
requested changes
Jun 11, 2020
7982e7b
add gcsfs to environment.yml
Jun 12, 2020
946297b
rerun deps script
Jun 12, 2020
145306e
Merge branch 'master' into feature/add-fsspec-support
Jun 12, 2020
06e5a3a
account for passed filesystem again
Jun 12, 2020
8f3854c
specify should_close
Jun 12, 2020
50c08c8
lint
Jun 12, 2020
9b20dc6
Except http passed to fsspec in parquet
Jun 12, 2020
eb90fe8
lint
Jun 12, 2020
b3e2cd2
Merge branch 'master' into feature/add-fsspec-support
Jun 16, 2020
4977a00
redo whatsnew
Jun 16, 2020
29a9785
simplify parquet write
Jun 18, 2020
565031b
Retry S3 file probe with timeout, in test_to_s3
Jun 18, 2020
606ce11
expand user in non-fsspec paths for parquet; add test for this
Jun 19, 2020
60b80a6
reorder imports!
Jun 19, 2020
5 changes: 3 additions & 2 deletions doc/source/getting_started/install.rst
@@ -267,8 +267,9 @@ SQLAlchemy 1.1.4 SQL support for databases other than sqlite
SciPy 0.19.0 Miscellaneous statistical functions
XLsxWriter 0.9.8 Excel writing
blosc Compression for HDF5
fsspec 0.7.4 File operations handling
Member:

I would clarify the note a bit further, as it now sounds like you need this for any kind of file operation. Something like "for remote filesystems"?

Contributor Author:

Hm, it handles other things too, just not "local" or "http(s)". That would make for a long comment.

Member:

Then something like "filesystems other than local or http(s)" isn't too long, I would say.

fastparquet 0.3.2 Parquet reading / writing
gcsfs 0.2.2 Google Cloud Storage access
gcsfs 0.6.0 Google Cloud Storage access
html5lib HTML parser for read_html (see :ref:`note <optional_html>`)
lxml 3.8.0 HTML parser for read_html (see :ref:`note <optional_html>`)
matplotlib 2.2.2 Visualization
@@ -282,7 +283,7 @@ pyreadstat SPSS files (.sav) reading
pytables 3.4.3 HDF5 reading / writing
pyxlsb 1.0.6 Reading for xlsb files
qtpy Clipboard I/O
s3fs 0.3.0 Amazon S3 access
s3fs 0.4.0 Amazon S3 access
tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_)
xarray 0.8.2 pandas-like API for N-dimensional data
xclip Clipboard I/O on linux
14 changes: 14 additions & 0 deletions doc/source/whatsnew/v1.1.0.rst
@@ -197,6 +197,20 @@ For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`.

.. _whatsnew_110.enhancements.other:

fsspec now used for filesystem handling
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

For reading and writing to filesystems other than local and reading from HTTP(S),
the optional dependency ``fsspec`` will be used to dispatch operations. This will give unchanged
functionality for S3 and GCS storage, which were already supported, but also add
support for several other storage implementations such as Azure Datalake and Blob,
SSH, FTP, Dropbox and GitHub. For docs and capabilities, see the `fsspec docs`_.

In the future, we will implement a way to pass parameters to the invoked
filesystem instances.

.. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/

Contributor:

Link to the original issue at the end of the first sentence.

Contributor:

Also not fixed yet.
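As a quick illustration of the dispatch described in the note above (a sketch, not text from the PR; the bucket and container names are placeholders, and the matching fsspec backend such as gcsfs or adlfs must be installed):

import pandas as pd

# any fsspec-recognized URL scheme now routes through fsspec
df = pd.read_csv("gcs://my-bucket/data.csv")  # dispatched to gcsfs
df.to_csv("abfs://my-container/out.csv")  # Azure Blob Storage via adlfs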

Other enhancements
^^^^^^^^^^^^^^^^^^

3 changes: 2 additions & 1 deletion environment.yml
@@ -98,7 +98,8 @@ dependencies:

- pyqt>=5.9.2 # pandas.read_clipboard
- pytables>=3.4.3 # pandas.read_hdf, DataFrame.to_hdf
- s3fs>=0.4.0 # pandas.read_csv... when using 's3://...' path (also brings in fsspec)
- s3fs>=0.4.0 # pandas.read_csv... when using 's3://...' path
- fsspec>=0.7.4 # for generic remote file operations
- sqlalchemy # pandas.read_sql, DataFrame.to_sql
- xarray # DataFrame.to_xarray
- cftime # Needed for downstream xarray.CFTimeIndex test
5 changes: 3 additions & 2 deletions pandas/compat/_optional.py
@@ -8,8 +8,9 @@
VERSIONS = {
"bs4": "4.6.0",
"bottleneck": "1.2.1",
"fsspec": "0.7.4",
"fastparquet": "0.3.2",
"gcsfs": "0.2.2",
"gcsfs": "0.6.0",
"lxml.etree": "3.8.0",
"matplotlib": "2.2.2",
"numexpr": "2.6.2",
@@ -20,7 +21,7 @@
"pytables": "3.4.3",
"pytest": "5.0.1",
"pyxlsb": "1.0.6",
"s3fs": "0.3.0",
"s3fs": "0.4.0",
"scipy": "1.2.0",
"sqlalchemy": "1.1.4",
"tables": "3.4.3",
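For context, a simplified sketch of how this VERSIONS table is consumed; the real helper in pandas/compat/_optional.py handles more cases (module renames, extra error-handling options), so treat this as an approximation:

import importlib

from distutils.version import LooseVersion

VERSIONS = {"fsspec": "0.7.4", "gcsfs": "0.6.0", "s3fs": "0.4.0"}  # excerpt


def import_optional_dependency(name: str):
    # import the module, raising a helpful error if it is absent or too old
    try:
        module = importlib.import_module(name)
    except ImportError:
        raise ImportError(f"Missing optional dependency '{name}'.") from None
    minimum = VERSIONS.get(name)
    version = getattr(module, "__version__", None)
    if (
        minimum is not None
        and version is not None
        and LooseVersion(version) < LooseVersion(minimum)
    ):
        raise ImportError(f"pandas requires version '{minimum}' or newer of '{name}'.")
    return module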
4 changes: 2 additions & 2 deletions pandas/io/common.py
@@ -31,6 +31,7 @@

from pandas._typing import FilePathOrBuffer
from pandas.compat import _get_lzma_file, _import_lzma
from pandas.compat._optional import import_optional_dependency

from pandas.core.dtypes.common import is_file_like

@@ -185,12 +186,11 @@ def get_filepath_or_buffer(
return reader, encoding, compression, True

if is_fsspec_url(filepath_or_buffer):
import fsspec
fsspec = import_optional_dependency("fsspec")

file_obj = fsspec.open(
filepath_or_buffer, mode=mode or "rb", **storage_options
).open()
# TODO: both fsspec and pandas handle compression and encoding
return file_obj, encoding, compression, True

if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
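For reference, the is_fsspec_url helper these hunks rely on boils down to a scheme check. A minimal sketch consistent with the discussion in this PR (the actual implementation lives in pandas/io/common.py and may differ in detail):

def is_fsspec_url(url) -> bool:
    # anything with a URL scheme other than http(s) is handed to fsspec,
    # e.g. "s3://", "gcs://", "memory://"; plain local paths have no "://"
    return (
        isinstance(url, str)
        and "://" in url
        and not url.startswith(("http://", "https://"))
    )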
10 changes: 8 additions & 2 deletions pandas/io/parquet.py
@@ -102,7 +102,10 @@ def write(
# write_to_dataset does not support a file-like object when
# a directory path is used, so just pass the path string.
if partition_cols is not None:
# user may provide filesystem= with an instance, in which case it takes priority
# and fsspec need not analyse the path
if is_fsspec_url(path) and "filesystem" not in kwargs:
Contributor:

Can you leave a comment explaining this "filesystem" not in kwargs check? It's not obvious to me why it's needed.

Contributor Author:

In fsspec, you can specify the exact protocol you would like beyond that inferred from the URL. Given that we don't pass storage_options through yet, perhaps this gives more flexibility than required and I can remove it.

Contributor Author:

Sorry, edit on that: this is the filesystem parameter (i.e., an actual instance) to pyarrow. I have no idea if people might currently be using that.

Contributor:

Ah, you're saying the user could pass a filesystem like

df.to_parquet(..., filesystem=filesystem)

That certainly seems possible. Could you ensure that we have a test for that?
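For concreteness, a sketch of that explicit-filesystem call shape (editorial illustration; the bucket name is a placeholder, s3fs must be installed, and the test added below in test_parquet.py exercises this path):

import pandas as pd
import s3fs

df = pd.DataFrame({"a": [1, 2, 3]})
fs = s3fs.S3FileSystem()
# the instance bypasses fsspec URL inference and is handed to pyarrow as-is
df.to_parquet("pandas-test/out.parquet", filesystem=fs)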

import_optional_dependency("fsspec")
import fsspec.core

fs, path = fsspec.core.url_to_fs(path)
@@ -123,12 +126,15 @@

def read(self, path, columns=None, **kwargs):
if is_fsspec_url(path) and "filesystem" not in kwargs:
Member:

Can you additionally check that use_legacy_dataset=False is not in the kwargs? As long as fsspec/filesystem_spec#295 is not solved, converting a string URI into a path + fsspec filesystem would make that option unusable.

Member:

Can you check this comment?
(I will see whether I can write a test that would catch it.)

Member:

ping for this one

Member:

(I am also fine with doing this as a follow-up myself)

Contributor Author:

I thought you meant that you intended to handle it; and yes please, you are in the best place to check the finer details of the calls to pyarrow.
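A sketch of the extra guard being requested (hypothetical; the thread leaves this to a follow-up, so the exact form is not settled):

# only convert the URI ourselves on pyarrow's legacy dataset path; with
# use_legacy_dataset=False, pass the string through and let pyarrow resolve it
if (
    is_fsspec_url(path)
    and "filesystem" not in kwargs
    and kwargs.get("use_legacy_dataset", True)
):
    ...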

import_optional_dependency("fsspec")
import fsspec.core

fs, path = fsspec.core.url_to_fs(path)
parquet_ds = self.api.parquet.ParquetDataset(path, filesystem=fs, **kwargs)
else:
parquet_ds = self.api.parquet.ParquetDataset(path, **kwargs)
# this key is valid for ParquetDataset but not read_pandas
kwargs.pop("filesystem", None)

kwargs["columns"] = columns
result = parquet_ds.read_pandas(**kwargs).to_pandas()
@@ -170,7 +176,7 @@ def write(
kwargs["file_scheme"] = "hive"

if is_fsspec_url(path):
import fsspec
fsspec = import_optional_dependency("fsspec")

# if filesystem is provided by fsspec, file must be opened in 'wb' mode.
kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open()
@@ -189,7 +195,7 @@

def read(self, path, columns=None, **kwargs):
if is_fsspec_url(path):
import fsspec
fsspec = import_optional_dependency("fsspec")

open_with = lambda path, _: fsspec.open(path, "rb").open()
parquet_file = self.api.ParquetFile(path, open_with=open_with)
16 changes: 5 additions & 11 deletions pandas/tests/io/test_fsspec.py
@@ -18,16 +18,13 @@

@pytest.fixture
def cleared_fs():
import fsspec
fsspec = pytest.importorskip("fsspec")

memfs = fsspec.filesystem("memory")
try:
yield memfs
finally:
memfs.store.clear()
yield memfs
memfs.store.clear()
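For anyone unfamiliar with the backend these tests use: fsspec's "memory" filesystem is a process-global in-memory store, which is why the fixture clears it between tests. A quick sketch of its behavior (assumes fsspec is installed):

import fsspec

fs = fsspec.filesystem("memory")  # global in-memory filesystem
with fs.open("/test/hello.txt", "wb") as f:
    f.write(b"hi")
assert fs.cat("/test/hello.txt") == b"hi"
fs.store.clear()  # state persists across instances, so tests must reset it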


@td.skip_if_no("fsspec")
def test_read_csv(cleared_fs):
from fsspec.implementations.memory import MemoryFile

@@ -37,8 +34,7 @@ def test_read_csv(cleared_fs):
tm.assert_frame_equal(df1, df2)


@td.skip_if_no("fsspec")
def test_reasonable_error(monkeypatch):
def test_reasonable_error(monkeypatch, cleared_fs):
from fsspec.registry import known_implementations
from fsspec import registry

@@ -57,7 +53,6 @@ def test_reasonable_error(monkeypatch):
assert err_mgs in str(e.value)


@td.skip_if_no("fsspec")
def test_to_csv(cleared_fs):
df1.to_csv("memory://test/test.csv", index=True)
df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0)
@@ -66,8 +61,7 @@ def test_to_csv(cleared_fs):


@td.skip_if_no("fastparquet")
@td.skip_if_no("fsspec")
def test_to_parquet_new_file(monkeypatch):
def test_to_parquet_new_file(monkeypatch, cleared_fs):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
df1.to_parquet(
"memory://test/test.csv", index=True, engine="fastparquet", compression=None
7 changes: 7 additions & 0 deletions pandas/tests/io/test_parquet.py
@@ -537,6 +537,13 @@ def test_categorical(self, pa):
check_round_trip(df, pa, expected=expected)

def test_s3_roundtrip(self, df_compat, s3_resource, pa):
s3fs = pytest.importorskip("s3fs")
s3 = s3fs.S3FileSystem()
kw = dict(filesystem=s3)
check_round_trip(df_compat, pa, path="pandas-test/pyarrow.parquet",
read_kwargs=kw, write_kwargs=kw)

def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa):
Member:

Naming nitpick: I think it's the test above that has the explicit filesystem passed?

# GH #19134
check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet")
