ENH: add fsspec support #34266

Merged

Changes from 2 commits (39 commits in total)

Commits:
94e717f
Add remote file io using fsspec.
Apr 14, 2020
fd7e072
Attempt refactor and clean
May 19, 2020
302ba13
Merge branch 'master' into feature/add-fsspec-support
May 20, 2020
9e6d3b2
readd and adapt s3/gcs tests
May 21, 2020
4564c8d
remove gc from test
May 21, 2020
0654537
Simpler is_fsspec
May 21, 2020
8d45cbb
add test
May 21, 2020
006e736
Answered most points
May 28, 2020
724ebd8
Implemented suggestions
May 28, 2020
9da1689
lint
May 28, 2020
a595411
Add versions info
May 29, 2020
6dd1e92
Update some deps
May 29, 2020
6e13df7
issue link syntax
May 29, 2020
3262063
More specific test versions
Jun 2, 2020
4bc2411
Account for alternate S3 protocols, and ignore type error
Jun 2, 2020
68644ab
Add comment to mypy ignore instruction
Jun 2, 2020
32bc586
more mypy
Jun 2, 2020
037ef2c
more black
Jun 2, 2020
c3c3075
Make storage_options a dict rather than swallowing kwargs
Jun 3, 2020
85d6452
More requested changes
Jun 5, 2020
263dd3b
Remove fsspec from locale tests
Jun 10, 2020
d0afbc3
tweak
Jun 10, 2020
6a587a5
Merge branch 'master' into feature/add-fsspec-support
Jun 10, 2020
b2992c1
Merge branch 'master' into feature/add-fsspec-support
Jun 11, 2020
9c03745
requested changes
Jun 11, 2020
7982e7b
add gcsfs to environment.yml
Jun 12, 2020
946297b
rerun deps script
Jun 12, 2020
145306e
Merge branch 'master' into feature/add-fsspec-support
Jun 12, 2020
06e5a3a
account for passed filesystem again
Jun 12, 2020
8f3854c
specify should_close
Jun 12, 2020
50c08c8
lint
Jun 12, 2020
9b20dc6
Except http passed to fsspec in parquet
Jun 12, 2020
eb90fe8
lint
Jun 12, 2020
b3e2cd2
Merge branch 'master' into feature/add-fsspec-support
Jun 16, 2020
4977a00
redo whatsnew
Jun 16, 2020
29a9785
simplify parquet write
Jun 18, 2020
565031b
Retry S3 file probe with timeout, in test_to_s3
Jun 18, 2020
606ce11
expand user in non-fsspec paths for parquet; add test for this
Jun 19, 2020
60b80a6
reorder imports!
Jun 19, 2020
2 changes: 1 addition & 1 deletion environment.yml
@@ -98,7 +98,7 @@ dependencies:

- pyqt>=5.9.2 # pandas.read_clipboard
- pytables>=3.4.2 # pandas.read_hdf, DataFrame.to_hdf
- s3fs # pandas.read_csv... when using 's3://...' path
- s3fs # pandas.read_csv... when using 's3://...' path (also brings in fsspec)
Contributor:

I think we should explicitly list fsspec here, if we're importing it.

What should we set the minimum supported version to? We'll add this to a few places in the library (compat._optional, docs, ...).

Contributor:

Can update this, and the minimum versions?
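
For reference, a minimal sketch of how the minimum version could be wired into pandas' optional-dependency machinery; the VERSIONS table and import_optional_dependency helper already exist in pandas.compat._optional, but the "0.7.4" pin below is only a placeholder, not the version settled on in this thread.

# pandas/compat/_optional.py (sketch): add fsspec to the minimum-version table.
VERSIONS = {
    # ... existing entries (bs4, fastparquet, gcsfs, ...) ...
    "fsspec": "0.7.4",  # placeholder minimum version
}

# Call-site sketch: import and version-check in one step.
from pandas.compat._optional import import_optional_dependency

fsspec = import_optional_dependency("fsspec")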

- sqlalchemy # pandas.read_sql, DataFrame.to_sql
- xarray # DataFrame.to_xarray
- cftime # Needed for downstream xarray.CFTimeIndex test
49 changes: 23 additions & 26 deletions pandas/io/common.py
@@ -134,20 +134,6 @@ def stringify_path(
return _expand_user(filepath_or_buffer)


def is_s3_url(url) -> bool:
"""Check for an s3, s3n, or s3a url"""
if not isinstance(url, str):
return False
return parse_url(url).scheme in ["s3", "s3n", "s3a"]


def is_gcs_url(url) -> bool:
"""Check for a gcs url"""
if not isinstance(url, str):
return False
return parse_url(url).scheme in ["gcs", "gs"]


def urlopen(*args, **kwargs):
"""
Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of
@@ -158,11 +144,25 @@ def urlopen(*args, **kwargs):
return urllib.request.urlopen(*args, **kwargs)


def is_fsspec_url(url: FilePathOrBuffer) -> bool:
"""
Returns true if fsspec is installed and the given URL looks like
something fsspec can handle
"""
try:
import fsspec # noqa: F401

return isinstance(url, str) and ("::" in url or "://" in url)
Contributor:

Should this also check if there's an fsspec-compatible implementation for that protocol? What's the behavior / error message if you have fsspec installed but not s3fs and do read_csv("s3://...")?

Contributor Author:

fsspec gives specific error messages for known protocols that are not available due to a missing dependency (s3fs would be one such case), and a different message if the protocol is completely unknown.

Contributor:

Does that error surface to the user? Or does pandas swallow it somewhere along the way?

except ImportError:
return False
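
To make the question above concrete, here is a minimal way to observe what fsspec raises when it is installed but the protocol's backend is not; the exact wording comes from fsspec and may vary between versions, and the bucket name is made up.

import fsspec

try:
    # With fsspec installed but s3fs missing, this raises ImportError with a
    # protocol-specific hint (roughly "Install s3fs to access S3").
    fsspec.open("s3://example-bucket/data.csv").open()
except ImportError as err:
    print(err)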


def get_filepath_or_buffer(
filepath_or_buffer: FilePathOrBuffer,
encoding: Optional[str] = None,
compression: Optional[str] = None,
mode: Optional[str] = None,
**storage_options,
Contributor:

Add a type? Dict[str, Any] I think?

Contributor:

Does Dict[str, Any] work?

):
"""
If the filepath_or_buffer is a url, translate and return the buffer.
Expand All @@ -175,6 +175,7 @@ def get_filepath_or_buffer(
compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
encoding : the encoding to use to decode bytes, default is 'utf-8'
mode : str, optional
storage_options: passed on to fsspec, if using it
Contributor:

Suggested change:
- storage_options: passed on to fsspec, if using it
+ **storage_options : dict, optional
+     passed on to fsspec.open, if using it.

Contributor:

add a versionadded tag (1.1)

Contributor:

Just FYI, this isn't in the public API.

At some point we'll add this keyword to all of our IO routines, which would benefit from the versionadded. But that can be a separate PR.
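
For illustration only (as noted above, this keyword is not public API yet), a sketch of how extra keyword arguments are swept into storage_options and forwarded to fsspec.open with the signature as it stands in this diff; the bucket and the anon option are invented for the example.

from pandas.io.common import get_filepath_or_buffer

# Hypothetical internal call: anon=True lands in **storage_options and is passed
# straight through to fsspec.open (here requesting anonymous S3 access via s3fs).
buf, encoding, compression, should_close = get_filepath_or_buffer(
    "s3://example-bucket/data.csv", mode="rb", anon=True
)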


Returns
-------
@@ -185,6 +186,7 @@
filepath_or_buffer = stringify_path(filepath_or_buffer)

if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer):
# TODO: fsspec can also handle HTTP via requests, but leaving this unchanged
req = urlopen(filepath_or_buffer)
content_encoding = req.headers.get("Content-Encoding", None)
if content_encoding == "gzip":
@@ -194,19 +196,14 @@
req.close()
return reader, encoding, compression, True

if is_s3_url(filepath_or_buffer):
from pandas.io import s3

return s3.get_filepath_or_buffer(
filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
)
if is_fsspec_url(filepath_or_buffer):
import fsspec

if is_gcs_url(filepath_or_buffer):
from pandas.io import gcs

return gcs.get_filepath_or_buffer(
filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
)
file_obj = fsspec.open(
filepath_or_buffer, mode=mode or "rb", **storage_options
).open()
# TODO: both fsspec and pandas handle compression and encoding
Contributor:

What would resolve the TODO here? To not handle compression or encoding in pandas? Can you update the comment to indicate that?

Contributor Author:

Given that pandas must still handle compression and encoding for local and http, that code will not be deprecated. Therefore, I think it's fine that we don't advertise the fact that fsspec can do that part too, and open everything on the backend as "rb"/"wb", uncompressed. The TODO would be resolved if at some point we decided that fsspec should handle all file ops, which is not likely in the near term.

Contributor:

Agreed, so I think the TODO can be removed.

return file_obj, encoding, compression, True

if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
return _expand_user(filepath_or_buffer), None, compression, False
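
A note on the double open() a few lines up: fsspec.open returns an OpenFile container rather than a ready file handle, and calling .open() on it yields the concrete file-like object that the pandas readers expect. A standalone sketch against fsspec's built-in in-memory filesystem:

import fsspec

of = fsspec.open("memory://demo/out.bin", mode="wb")  # OpenFile, not yet a buffer
f = of.open()  # concrete binary file object backed by the memory filesystem
f.write(b"hello")
f.close()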
18 changes: 0 additions & 18 deletions pandas/io/gcs.py

This file was deleted.

26 changes: 7 additions & 19 deletions pandas/io/parquet.py
@@ -8,7 +8,7 @@

from pandas import DataFrame, get_option

from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url
from pandas.io.common import get_filepath_or_buffer, is_fsspec_url


def get_engine(engine: str) -> "BaseImpl":
@@ -157,13 +157,13 @@ def write(
if partition_cols is not None:
kwargs["file_scheme"] = "hive"

if is_s3_url(path) or is_gcs_url(path):
if is_fsspec_url(path):
import fsspec

# if path is s3:// or gs:// we need to open the file in 'wb' mode.
# TODO: Support 'ab'

path, _, _, _ = get_filepath_or_buffer(path, mode="wb")
# And pass the opened file to the fastparquet internal impl.
kwargs["open_with"] = lambda path, _: path
kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open()
else:
path, _, _, _ = get_filepath_or_buffer(path)

@@ -178,20 +178,8 @@
)

def read(self, path, columns=None, **kwargs):
if is_s3_url(path):
from pandas.io.s3 import get_file_and_filesystem

# When path is s3:// an S3File is returned.
# We need to retain the original path(str) while also
# pass the S3File().open function to fsatparquet impl.
s3, filesystem = get_file_and_filesystem(path)
try:
parquet_file = self.api.ParquetFile(path, open_with=filesystem.open)
finally:
s3.close()
else:
path, _, _, _ = get_filepath_or_buffer(path)
parquet_file = self.api.ParquetFile(path)
path, _, _, _ = get_filepath_or_buffer(path)
parquet_file = self.api.ParquetFile(path)

return parquet_file.to_pandas(columns=columns, **kwargs)

49 changes: 0 additions & 49 deletions pandas/io/s3.py

This file was deleted.

70 changes: 70 additions & 0 deletions pandas/tests/io/test_fsspec.py
@@ -0,0 +1,70 @@
import gc

import numpy as np
import pytest

from pandas import DataFrame, date_range, read_csv
import pandas._testing as tm
from pandas.util import _test_decorators as td

from pandas.io.common import is_fsspec_url

Contributor:

everything in this module is safe if fsspec is not available?

Contributor:

ok to simply skip the module

Contributor Author:

you mean fsspec = pytest.importorskip("fsspec")?
test_is_fsspec_url should be moved to test_common in that case.

df1 = DataFrame(
{
"int": [1, 3],
"float": [2.0, np.nan],
"str": ["t", "s"],
"dt": date_range("2018-06-18", periods=2),
}
)
text = df1.to_csv(index=False).encode()


@pytest.fixture
@td.skip_if_installed("fsspec")
Contributor:

Why is this skipped if fsspec is installed?

Contributor Author:

Ah, should be the opposite - but why did the tests run for me??

def cleared_fs():
import fsspec

memfs = fsspec.filesystem("memory")
try:
yield memfs
finally:
memfs.store.clear()


def test_is_fsspec_url():
assert is_fsspec_url("gcs://pandas/somethingelse.com")
assert is_fsspec_url("gs://pandas/somethingelse.com")
assert not is_fsspec_url("random:pandas/somethingelse.com")


def test_read_csv(cleared_fs):
from fsspec.implementations.memory import MemoryFile

cleared_fs.store["test/test.csv"] = MemoryFile(data=text)
df2 = read_csv("memory://test/test.csv", parse_dates=["dt"])

tm.assert_frame_equal(df1, df2)


def test_to_csv(cleared_fs):
df1.to_csv("memory://test/test.csv", index=True)
gc.collect() # pandas does not explicitly close file buffers
Contributor:

(why) is this necessary?

Contributor Author:

I'll double check - maybe this was only needed by the previous version of fsspec.

df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0)

tm.assert_frame_equal(df1, df2)


@td.skip_if_no("fastparquet")
def test_to_parquet_new_file(monkeypatch):
"""Regression test for writing to a not-yet-existent GCS Parquet file."""
df1.to_parquet(
"memory://test/test.csv", index=True, engine="fastparquet", compression=None
)


@td.skip_if_installed("fsspec")
def test_not_present_exception():
with pytest.raises(ImportError) as e:
read_csv("memory://test/test.csv")
assert "fsspec library is required" in str(e.value)