ENH: add fsspec support #34266


Merged
Changes from 8 commits (39 commits total)
94e717f
Add remote file io using fsspec.
Apr 14, 2020
fd7e072
Attempt refactor and clean
May 19, 2020
302ba13
Merge branch 'master' into feature/add-fsspec-support
May 20, 2020
9e6d3b2
readd and adapt s3/gcs tests
May 21, 2020
4564c8d
remove gc from test
May 21, 2020
0654537
Simpler is_fsspec
May 21, 2020
8d45cbb
add test
May 21, 2020
006e736
Answered most points
May 28, 2020
724ebd8
Implemented suggestions
May 28, 2020
9da1689
lint
May 28, 2020
a595411
Add versions info
May 29, 2020
6dd1e92
Update some deps
May 29, 2020
6e13df7
issue link syntax
May 29, 2020
3262063
More specific test versions
Jun 2, 2020
4bc2411
Account for alternate S3 protocols, and ignore type error
Jun 2, 2020
68644ab
Add comment to mypy ignore instruction
Jun 2, 2020
32bc586
more mypy
Jun 2, 2020
037ef2c
more black
Jun 2, 2020
c3c3075
Make storage_options a dict rather than swallowing kwargs
Jun 3, 2020
85d6452
More requested changes
Jun 5, 2020
263dd3b
Remove fsspec from locale tests
Jun 10, 2020
d0afbc3
tweak
Jun 10, 2020
6a587a5
Merge branch 'master' into feature/add-fsspec-support
Jun 10, 2020
b2992c1
Merge branch 'master' into feature/add-fsspec-support
Jun 11, 2020
9c03745
requested changes
Jun 11, 2020
7982e7b
add gcsfs to environment.yml
Jun 12, 2020
946297b
rerun deps script
Jun 12, 2020
145306e
Merge branch 'master' into feature/add-fsspec-support
Jun 12, 2020
06e5a3a
account for passed filesystem again
Jun 12, 2020
8f3854c
specify should_close
Jun 12, 2020
50c08c8
lint
Jun 12, 2020
9b20dc6
Except http passed to fsspec in parquet
Jun 12, 2020
eb90fe8
lint
Jun 12, 2020
b3e2cd2
Merge branch 'master' into feature/add-fsspec-support
Jun 16, 2020
4977a00
redo whatsnew
Jun 16, 2020
29a9785
simplify parquet write
Jun 18, 2020
565031b
Retry S3 file probe with timeout, in test_to_s3
Jun 18, 2020
606ce11
expand user in non-fsspec paths for parquet; add test for this
Jun 19, 2020
60b80a6
reorder imports!
Jun 19, 2020
2 changes: 1 addition & 1 deletion environment.yml
@@ -98,7 +98,7 @@ dependencies:

- pyqt>=5.9.2 # pandas.read_clipboard
- pytables>=3.4.3 # pandas.read_hdf, DataFrame.to_hdf
- s3fs # pandas.read_csv... when using 's3://...' path
- s3fs>=0.4.0 # pandas.read_csv... when using 's3://...' path (also brings in fsspec)
- sqlalchemy # pandas.read_sql, DataFrame.to_sql
- xarray # DataFrame.to_xarray
- cftime # Needed for downstream xarray.CFTimeIndex test
64 changes: 15 additions & 49 deletions pandas/io/common.py
@@ -126,20 +126,6 @@ def stringify_path(
return _expand_user(filepath_or_buffer)


def is_s3_url(url) -> bool:
"""Check for an s3, s3n, or s3a url"""
if not isinstance(url, str):
return False
return parse_url(url).scheme in ["s3", "s3n", "s3a"]


def is_gcs_url(url) -> bool:
"""Check for a gcs url"""
if not isinstance(url, str):
return False
return parse_url(url).scheme in ["gcs", "gs"]


def urlopen(*args, **kwargs):
"""
Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
@@ -150,38 +136,20 @@ def urlopen(*args, **kwargs):
return urllib.request.urlopen(*args, **kwargs)


def get_fs_for_path(filepath: str):
def is_fsspec_url(url: FilePathOrBuffer) -> bool:
"""
Get appropriate filesystem given a filepath.
Supports s3fs, gcs and local file system.

Parameters
----------
filepath : str
File path. e.g s3://bucket/object, /local/path, gcs://pandas/obj

Returns
-------
s3fs.S3FileSystem, gcsfs.GCSFileSystem, None
Appropriate FileSystem to use. None for local filesystem.
Returns true if fsspec is installed and the given URL looks like
something fsspec can handle
"""
if is_s3_url(filepath):
from pandas.io import s3

return s3.get_fs()
elif is_gcs_url(filepath):
from pandas.io import gcs

return gcs.get_fs()
else:
return None
return isinstance(url, str) and ("::" in url or "://" in url)
Contributor: What's the :: for? If it's necessary, can you ensure that we have tests for it?

Contributor Author: That's for compound URLs, e.g., to enable local caching like "simplecache::s3://bucket/path" (or indeed via dask workers).
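For context, a minimal sketch of such a chained URL (the protocol names "simplecache" and "memory" ship with fsspec; the paths are made up):

import fsspec

# Put a small file on fsspec's built-in in-memory filesystem.
with fsspec.open("memory://bucket/data.csv", mode="wb") as f:
    f.write(b"a,b\n1,2\n")

# Chained URL: the part before "::" wraps the part after it,
# here caching the (in-memory) target locally on first access.
with fsspec.open("simplecache::memory://bucket/data.csv", mode="rb") as f:
    print(f.read())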

Contributor: Is there ever one of those that doesn't also include a ://? I'd prefer to keep this check as narrow as possible, just to avoid surprises.

Contributor Author: We do special-case to assume "file://" where there is no protocol, but I'm happy to drop that possibility in this use case.

Contributor: You're saying that something like simplecache::foo/bar.csv will be converted to simplecache::file://foo/bar.csv?

I think for now I'd prefer to avoid that.

Contributor Author: ok



def get_filepath_or_buffer(
filepath_or_buffer: FilePathOrBuffer,
encoding: Optional[str] = None,
compression: Optional[str] = None,
mode: Optional[str] = None,
**storage_options: Dict[str, Any],
Member: Do we want callers to pass a dict or collect additional kwargs here? The docstring implies the former.

Contributor Author: We are not passing anything at all yet, so I don't mind whether it's kwargs or a dict keyword. I imagine that in a user function like read_csv, there would be a storage_options={...} argument, which is the common usage in Dask and Intake.
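To illustrate the call shape under discussion (hypothetical at this point — storage_options is not yet exposed through any pandas reader in this PR):

import pandas as pd

# Hypothetical usage mirroring the Dask/Intake convention; the dict
# would be forwarded to fsspec untouched.
df = pd.read_csv(
    "s3://some-bucket/data.csv",
    storage_options={"anon": True},  # e.g. anonymous S3 access
)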

):
"""
If the filepath_or_buffer is a url, translate and return the buffer.
@@ -194,6 +162,8 @@ def get_filepath_or_buffer(
compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
encoding : the encoding to use to decode bytes, default is 'utf-8'
mode : str, optional
storage_options: dict
passed on to fsspec, if using it; this is not yet accessed by the public API

Returns
-------
@@ -204,6 +174,7 @@
filepath_or_buffer = stringify_path(filepath_or_buffer)

if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
# TODO: fsspec can also handle HTTP via requests, but leaving this unchanged
req = urlopen(filepath_or_buffer)
content_encoding = req.headers.get("Content-Encoding", None)
if content_encoding == "gzip":
@@ -213,19 +184,14 @@
req.close()
return reader, encoding, compression, True

if is_s3_url(filepath_or_buffer):
from pandas.io import s3

return s3.get_filepath_or_buffer(
filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
)
if is_fsspec_url(filepath_or_buffer):
import fsspec

if is_gcs_url(filepath_or_buffer):
from pandas.io import gcs

return gcs.get_filepath_or_buffer(
filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
)
file_obj = fsspec.open(
filepath_or_buffer, mode=mode or "rb", **storage_options
).open()
# TODO: both fsspec and pandas handle compression and encoding
Contributor: What would resolve the TODO here? To not handle compression or encoding in pandas? Can you update the comment to indicate that?

Contributor Author: Given that pandas must still handle compression and encoding for local and HTTP paths, that code will not be deprecated. Therefore, I think it's fine that we don't advertise the fact that fsspec can do that part too, and open everything on the backend as "rb"/"wb", uncompressed. The TODO would be resolved if at some point we decided that fsspec should handle all file ops, which is not likely in the near term.

Contributor: Agreed, so I think the TODO can be removed.
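A small runnable illustration of that division of labour, using fsspec's in-memory filesystem (paths arbitrary): fsspec only moves raw bytes, and any decompression stays on the pandas side.

import fsspec

# fsspec opens everything as plain "rb"/"wb"; no compression handling here.
with fsspec.open("memory://demo/blob.bin", mode="wb") as f:
    f.write(b"raw bytes; pandas would decompress these if needed")

with fsspec.open("memory://demo/blob.bin", mode="rb") as f:
    data = f.read()  # exactly the bytes written above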

return file_obj, encoding, compression, True

if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
return _expand_user(filepath_or_buffer), None, compression, False
22 changes: 0 additions & 22 deletions pandas/io/gcs.py

This file was deleted.

47 changes: 22 additions & 25 deletions pandas/io/parquet.py
@@ -8,12 +8,7 @@

from pandas import DataFrame, get_option

from pandas.io.common import (
get_filepath_or_buffer,
get_fs_for_path,
is_gcs_url,
is_s3_url,
)
from pandas.io.common import get_filepath_or_buffer, is_fsspec_url


def get_engine(engine: str) -> "BaseImpl":
@@ -107,6 +102,11 @@ def write(
# write_to_dataset does not support a file-like object when
# a directory path is used, so just pass the path string.
if partition_cols is not None:
if is_fsspec_url(path) and "filesystem" not in kwargs:
Contributor: Can you leave a comment explaining this "filesystem" not in kwargs check? It's not obvious to me why it's needed.

Contributor Author: In fsspec, you can specify the exact protocol you would like beyond what is inferred from the URL. Given that we don't pass storage_options through yet, perhaps this gives more flexibility than required, and I can remove it.

Contributor Author: Sorry, a correction on that: this is the filesystem parameter (i.e., an actual instance) passed to pyarrow. I have no idea if people might currently be using that.

Contributor: Ah, you're saying the user could pass a filesystem like

df.to_parquet(..., filesystem=filesystem)

That certainly seems possible. Could you ensure that we have a test for that?
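A hedged sketch of what that test could look like (untested; df1 and the td helper follow the test module added later in this PR, and it assumes pyarrow accepts an fsspec filesystem instance here):

@td.skip_if_no("fsspec")
def test_to_parquet_passed_filesystem(tmp_path):
    import fsspec

    # An explicitly passed filesystem should be respected, not replaced
    # by the one inferred from the path.
    fs = fsspec.filesystem("file")
    df1.to_parquet(
        str(tmp_path / "out"),
        engine="pyarrow",
        partition_cols=["str"],
        filesystem=fs,
        compression=None,
    )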

import fsspec.core

fs, path = fsspec.core.url_to_fs(path)
kwargs["filesystem"] = fs
Member: Could this also be useful for the partition_cols is None case using write_table? (That would keep the abilities consistent regardless of the partition_cols keyword.) Also, write_table takes a filesystem keyword, it seems.

Contributor Author: For the non-partitioned case, we pass a file-like object directly.

Contributor Author: We could instead not process the path into a file object, and pass the filesystem in both cases, if preferred.

Member: > For the non-partitioned case, we pass a file-like object directly.

If that's what we do right now, fine to leave it like that in this PR.

self.api.parquet.write_to_dataset(
table,
path,
@@ -122,9 +122,14 @@ def write(
file_obj_or_path.close()

def read(self, path, columns=None, **kwargs):
parquet_ds = self.api.parquet.ParquetDataset(
path, filesystem=get_fs_for_path(path), **kwargs
)
if is_fsspec_url(path) and "filesystem" not in kwargs:
Member: Can you additionally check that use_legacy_dataset=False is not in the kwargs? As long as fsspec/filesystem_spec#295 is not solved, converting a string URI into a path + fsspec filesystem would make that option unusable.

Member: Can you check this comment? (I will see if I can write a test that would catch it.)

Member: Ping for this one.

Member: (I am also fine with doing this as a follow-up myself.)

Contributor Author: I thought you meant that you intended to handle it; and yes please, you are in the best place to check the finer details of the calls to pyarrow.

import fsspec.core

fs, path = fsspec.core.url_to_fs(path)
parquet_ds = self.api.parquet.ParquetDataset(path, filesystem=fs, **kwargs)
else:
parquet_ds = self.api.parquet.ParquetDataset(path, **kwargs)

kwargs["columns"] = columns
result = parquet_ds.read_pandas(**kwargs).to_pandas()
return result
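For readers unfamiliar with the helper used above: fsspec.core.url_to_fs splits a URL into a filesystem instance plus a protocol-less path, e.g. (memory protocol shown; the exact path form can vary by filesystem):

import fsspec.core

fs, path = fsspec.core.url_to_fs("memory://bucket/data.parquet")
print(type(fs).__name__)  # MemoryFileSystem
print(path)               # the URL's path with the protocol stripped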
@@ -164,13 +169,11 @@ def write(
if partition_cols is not None:
kwargs["file_scheme"] = "hive"

if is_s3_url(path) or is_gcs_url(path):
# if path is s3:// or gs:// we need to open the file in 'wb' mode.
# TODO: Support 'ab'
if is_fsspec_url(path):
import fsspec

path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
# And pass the opened file to the fastparquet internal impl.
kwargs["open_with"] = lambda path, _: path
# if filesystem is provided by fsspec, file must be opened in 'wb' mode.
kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open()
else:
path, _, _, _ = get_filepath_or_buffer(path)

@@ -185,17 +188,11 @@
)

def read(self, path, columns=None, **kwargs):
if is_s3_url(path):
from pandas.io.s3 import get_file_and_filesystem
if is_fsspec_url(path):
import fsspec

# When path is s3:// an S3File is returned.
# We need to retain the original path(str) while also
# pass the S3File().open function to fastparquet impl.
s3, filesystem = get_file_and_filesystem(path)
try:
parquet_file = self.api.ParquetFile(path, open_with=filesystem.open)
finally:
s3.close()
open_with = lambda path, _: fsspec.open(path, "rb").open()
parquet_file = self.api.ParquetFile(path, open_with=open_with)
else:
path, _, _, _ = get_filepath_or_buffer(path)
parquet_file = self.api.ParquetFile(path)
53 changes: 0 additions & 53 deletions pandas/io/s3.py

This file was deleted.

6 changes: 6 additions & 0 deletions pandas/tests/io/test_common.py
@@ -367,3 +367,9 @@ def test_unknown_engine(self):
df.to_csv(path)
with pytest.raises(ValueError, match="Unknown engine"):
pd.read_csv(path, engine="pyt")


def test_is_fsspec_url():
assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")
assert icom.is_fsspec_url("gs://pandas/somethingelse.com")
assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
98 changes: 98 additions & 0 deletions pandas/tests/io/test_fsspec.py
@@ -0,0 +1,98 @@
import numpy as np
import pytest

from pandas import DataFrame, date_range, read_csv, read_parquet
import pandas._testing as tm
from pandas.util import _test_decorators as td

df1 = DataFrame(
{
"int": [1, 3],
"float": [2.0, np.nan],
"str": ["t", "s"],
"dt": date_range("2018-06-18", periods=2),
}
)
text = df1.to_csv(index=False).encode()


@pytest.fixture
def cleared_fs():
import fsspec

memfs = fsspec.filesystem("memory")
try:
yield memfs
finally:
memfs.store.clear()


@td.skip_if_no("fsspec")
Contributor: Can you pytest.importorskip inside the cleared_fs fixture? Then you don't need to repeat this.
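A sketch of that suggestion — pytest.importorskip returns the imported module, so the per-test skip decorators become redundant:

@pytest.fixture
def cleared_fs():
    fsspec = pytest.importorskip("fsspec")

    memfs = fsspec.filesystem("memory")
    try:
        yield memfs
    finally:
        memfs.store.clear()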

def test_read_csv(cleared_fs):
from fsspec.implementations.memory import MemoryFile

cleared_fs.store["test/test.csv"] = MemoryFile(data=text)
df2 = read_csv("memory://test/test.csv", parse_dates=["dt"])

tm.assert_frame_equal(df1, df2)


@td.skip_if_no("fsspec")
def test_reasonable_error(monkeypatch):
from fsspec.registry import known_implementations
from fsspec import registry

registry.target.clear()
with pytest.raises(ValueError) as e:
read_csv("nosuchprotocol://test/test.csv")
assert "nosuchprotocol" in str(e.value)
err_msg = "test error message"
monkeypatch.setitem(
known_implementations,
"couldexist",
{"class": "unimportable.CouldExist", "err": err_mgs},
)
with pytest.raises(ImportError) as e:
read_csv("couldexist://test/test.csv")
assert err_msg in str(e.value)


@td.skip_if_no("fsspec")
def test_to_csv(cleared_fs):
df1.to_csv("memory://test/test.csv", index=True)
df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0)

tm.assert_frame_equal(df1, df2)


@td.skip_if_no("fastparquet")
@td.skip_if_no("fsspec")
def test_to_parquet_new_file(monkeypatch):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
df1.to_parquet(
"memory://test/test.csv", index=True, engine="fastparquet", compression=None
)


@td.skip_if_no("s3fs")
def test_from_s3_csv(s3_resource, tips_file):
tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file))
# the following are decompressed by pandas, not fsspec
tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz"), read_csv(tips_file))
tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2"), read_csv(tips_file))


@td.skip_if_no("s3fs")
@td.skip_if_no("fastparquet")
def test_s3_parquet(s3_resource):
fn = "s3://pandas-test/test.parquet"
df1.to_parquet(fn, index=False, engine="fastparquet", compression=None)
df2 = read_parquet(fn, engine="fastparquet")
tm.assert_equal(df1, df2)


@td.skip_if_installed("fsspec")
def test_not_present_exception():
with pytest.raises(ImportError) as e:
read_csv("memory://test/test.csv")
assert "fsspec library is required" in str(e.value)