From 94e717f185f3e583245baae88da7efdf3af8ff1b Mon Sep 17 00:00:00 2001
From: Julian de Ruiter
Date: Tue, 14 Apr 2020 22:37:58 +0200
Subject: [PATCH] Add remote file io using fsspec.

---
 pandas/io/common.py         | 41 ++++++++++++++++++++++++++++-------------
 pandas/tests/io/test_gcs.py |  4 ++--
 2 files changed, 30 insertions(+), 15 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index ff527de79c387..98f584c60a964 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -158,6 +158,26 @@ def urlopen(*args, **kwargs):
     return urllib.request.urlopen(*args, **kwargs)
 
 
+def is_fsspec_url(url) -> bool:
+    """
+    Check whether a URL can be opened through fsspec.
+
+    Returns True only if ``url`` is a string, fsspec is installed, and the
+    URL scheme is a known fsspec filesystem other than ``file``.
+    """
+    if not isinstance(url, str):
+        return False
+
+    try:
+        from fsspec.registry import known_implementations
+    except ImportError:
+        # fsspec is not installed, so no URL can be an fsspec URL.
+        return False
+
+    scheme = parse_url(url).scheme
+    return scheme != "file" and scheme in known_implementations
+
+
 def get_filepath_or_buffer(
     filepath_or_buffer: FilePathOrBuffer,
     encoding: Optional[str] = None,
@@ -194,19 +214,14 @@ def get_filepath_or_buffer(
         req.close()
         return reader, encoding, compression, True
 
-    if is_s3_url(filepath_or_buffer):
-        from pandas.io import s3
-
-        return s3.get_filepath_or_buffer(
-            filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
-        )
-
-    if is_gcs_url(filepath_or_buffer):
-        from pandas.io import gcs
-
-        return gcs.get_filepath_or_buffer(
-            filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
-        )
+    if is_fsspec_url(filepath_or_buffer):
+        import fsspec
+
+        scheme = parse_url(filepath_or_buffer).scheme
+        filesystem = fsspec.filesystem(scheme)
+        # Fall back to binary read when ``mode`` is None or empty.
+        file_obj = filesystem.open(filepath_or_buffer, mode=mode or "rb")
+        return file_obj, encoding, compression, True
 
     if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)):
         return _expand_user(filepath_or_buffer), None, compression, False
diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py
index 557a9d5c13987..f9282487e559a 100644
--- a/pandas/tests/io/test_gcs.py
+++ b/pandas/tests/io/test_gcs.py
@@ -29,7 +29,7 @@ def test_read_csv_gcs(monkeypatch):
     )
 
     class MockGCSFileSystem:
-        def open(*args):
+        def open(self, path, mode, *args):
             return StringIO(df1.to_csv(index=False))
 
     monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)
@@ -51,7 +51,7 @@ def test_to_csv_gcs(monkeypatch):
     s = StringIO()
 
     class MockGCSFileSystem:
-        def open(*args):
+        def open(self, path, mode, *args):
             return s
 
     monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem)