From 94e717f185f3e583245baae88da7efdf3af8ff1b Mon Sep 17 00:00:00 2001 From: Julian de Ruiter Date: Tue, 14 Apr 2020 22:37:58 +0200 Subject: [PATCH 01/34] Add remote file io using fsspec. --- pandas/io/common.py | 44 ++++++++++++++++++++++++++++--------- pandas/tests/io/test_gcs.py | 4 ++-- 2 files changed, 36 insertions(+), 12 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index ff527de79c387..98f584c60a964 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -158,6 +158,23 @@ def urlopen(*args, **kwargs): return urllib.request.urlopen(*args, **kwargs) +def is_fsspec_url(url) -> bool: + """ + Returns true if fsspec is installed and the URL references a known + fsspec filesystem. + """ + + if not isinstance(url, str): + return False + + try: + from fsspec.registry import known_implementations + scheme = parse_url(url).scheme + return scheme != "file" and scheme in known_implementations + except ImportError: + return False + + def get_filepath_or_buffer( filepath_or_buffer: FilePathOrBuffer, encoding: Optional[str] = None, @@ -194,19 +211,26 @@ def get_filepath_or_buffer( req.close() return reader, encoding, compression, True - if is_s3_url(filepath_or_buffer): - from pandas.io import s3 + if is_fsspec_url(filepath_or_buffer): + import fsspec + scheme = parse_url(filepath_or_buffer).scheme + filesystem = fsspec.filesystem(scheme) + file_obj = filesystem.open(filepath_or_buffer, mode=mode or "rb") + return file_obj, encoding, compression, True - return s3.get_filepath_or_buffer( - filepath_or_buffer, encoding=encoding, compression=compression, mode=mode - ) + # if is_s3_url(filepath_or_buffer): + # from pandas.io import s3 - if is_gcs_url(filepath_or_buffer): - from pandas.io import gcs + # return s3.get_filepath_or_buffer( + # filepath_or_buffer, encoding=encoding, compression=compression, mode=mode + # ) - return gcs.get_filepath_or_buffer( - filepath_or_buffer, encoding=encoding, compression=compression, mode=mode - ) + # if is_gcs_url(filepath_or_buffer): + # from pandas.io import gcs + + # return gcs.get_filepath_or_buffer( + # filepath_or_buffer, encoding=encoding, compression=compression, mode=mode + # ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 557a9d5c13987..f9282487e559a 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -29,7 +29,7 @@ def test_read_csv_gcs(monkeypatch): ) class MockGCSFileSystem: - def open(*args): + def open(self, path, mode, *args): return StringIO(df1.to_csv(index=False)) monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) @@ -51,7 +51,7 @@ def test_to_csv_gcs(monkeypatch): s = StringIO() class MockGCSFileSystem: - def open(*args): + def open(self, path, mode, *args): return s monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) From fd7e072faa58cf2d1de4379bdd8d1a61516f53a9 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 19 May 2020 15:56:09 -0400 Subject: [PATCH 02/34] Attempt refactor and clean --- environment.yml | 2 +- pandas/io/common.py | 55 ++++------------ pandas/io/gcs.py | 18 ------ pandas/io/parquet.py | 26 ++------ pandas/io/s3.py | 49 -------------- pandas/tests/io/test_fsspec.py | 70 ++++++++++++++++++++ pandas/tests/io/test_gcs.py | 115 --------------------------------- pandas/tests/io/test_pickle.py | 40 ++---------- pandas/tests/io/test_s3.py | 25 ------- 9 files changed, 96 insertions(+), 304 
deletions(-) delete mode 100644 pandas/io/gcs.py delete mode 100644 pandas/io/s3.py create mode 100644 pandas/tests/io/test_fsspec.py delete mode 100644 pandas/tests/io/test_gcs.py delete mode 100644 pandas/tests/io/test_s3.py diff --git a/environment.yml b/environment.yml index 67b2df4dc5a0e..ddaeb462f7009 100644 --- a/environment.yml +++ b/environment.yml @@ -98,7 +98,7 @@ dependencies: - pyqt>=5.9.2 # pandas.read_clipboard - pytables>=3.4.2 # pandas.read_hdf, DataFrame.to_hdf - - s3fs # pandas.read_csv... when using 's3://...' path + - s3fs # pandas.read_csv... when using 's3://...' path (also brings in fsspec) - sqlalchemy # pandas.read_sql, DataFrame.to_sql - xarray # DataFrame.to_xarray - cftime # Needed for downstream xarray.CFTimeIndex test diff --git a/pandas/io/common.py b/pandas/io/common.py index 98f584c60a964..788396fc7eeb0 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -134,20 +134,6 @@ def stringify_path( return _expand_user(filepath_or_buffer) -def is_s3_url(url) -> bool: - """Check for an s3, s3n, or s3a url""" - if not isinstance(url, str): - return False - return parse_url(url).scheme in ["s3", "s3n", "s3a"] - - -def is_gcs_url(url) -> bool: - """Check for a gcs url""" - if not isinstance(url, str): - return False - return parse_url(url).scheme in ["gcs", "gs"] - - def urlopen(*args, **kwargs): """ Lazy-import wrapper for stdlib urlopen, as that imports a big chunk of @@ -158,19 +144,15 @@ def urlopen(*args, **kwargs): return urllib.request.urlopen(*args, **kwargs) -def is_fsspec_url(url) -> bool: +def is_fsspec_url(url: FilePathOrBuffer) -> bool: """ - Returns true if fsspec is installed and the URL references a known - fsspec filesystem. + Returns true if fsspec is installed and the given URL looks like + something fsspec can handle """ - - if not isinstance(url, str): - return False - try: - from fsspec.registry import known_implementations - scheme = parse_url(url).scheme - return scheme != "file" and scheme in known_implementations + import fsspec # noqa: F401 + + return isinstance(url, str) and ("::" in url or "://" in url) except ImportError: return False @@ -180,6 +162,7 @@ def get_filepath_or_buffer( encoding: Optional[str] = None, compression: Optional[str] = None, mode: Optional[str] = None, + **storage_options, ): """ If the filepath_or_buffer is a url, translate and return the buffer. 
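# ---------------------------------------------------------------------
# A minimal sketch (not part of the patch) of how the heuristic above
# behaves, assuming fsspec is installed; the helper name is illustrative.
# "memory://" is fsspec's built-in in-memory filesystem, so this runs
# without credentials or network access.
import fsspec


def looks_like_fsspec_url(url) -> bool:
    # mirrors the string check above: anything with a chained ("::") or
    # explicit ("://") protocol marker is handed off to fsspec
    return isinstance(url, str) and ("::" in url or "://" in url)


assert looks_like_fsspec_url("s3://bucket/data.csv")
assert looks_like_fsspec_url("simplecache::s3://bucket/data.csv")
assert not looks_like_fsspec_url("/local/data.csv")

# fsspec.open returns an OpenFile; entering it yields a file-like object,
# which is what get_filepath_or_buffer hands back to the pandas readers
with fsspec.open("memory://example.txt", mode="wb") as f:
    f.write(b"hello")
with fsspec.open("memory://example.txt", mode="rb") as f:
    assert f.read() == b"hello"
# ---------------------------------------------------------------------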
@@ -192,6 +175,7 @@ def get_filepath_or_buffer( compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional + storage_options: passed on to fsspec, if using it Returns ------- @@ -202,6 +186,7 @@ def get_filepath_or_buffer( filepath_or_buffer = stringify_path(filepath_or_buffer) if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): + # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": @@ -213,24 +198,12 @@ def get_filepath_or_buffer( if is_fsspec_url(filepath_or_buffer): import fsspec - scheme = parse_url(filepath_or_buffer).scheme - filesystem = fsspec.filesystem(scheme) - file_obj = filesystem.open(filepath_or_buffer, mode=mode or "rb") - return file_obj, encoding, compression, True - # if is_s3_url(filepath_or_buffer): - # from pandas.io import s3 - - # return s3.get_filepath_or_buffer( - # filepath_or_buffer, encoding=encoding, compression=compression, mode=mode - # ) - - # if is_gcs_url(filepath_or_buffer): - # from pandas.io import gcs - - # return gcs.get_filepath_or_buffer( - # filepath_or_buffer, encoding=encoding, compression=compression, mode=mode - # ) + file_obj = fsspec.open( + filepath_or_buffer, mode=mode or "rb", **storage_options + ).open() + # TODO: both fsspec and pandas handle compression and encoding + return file_obj, encoding, compression, True if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False diff --git a/pandas/io/gcs.py b/pandas/io/gcs.py deleted file mode 100644 index 1f5e0faedc6d2..0000000000000 --- a/pandas/io/gcs.py +++ /dev/null @@ -1,18 +0,0 @@ -""" GCS support for remote file interactivity """ -from pandas.compat._optional import import_optional_dependency - -gcsfs = import_optional_dependency( - "gcsfs", extra="The gcsfs library is required to handle GCS files" -) - - -def get_filepath_or_buffer( - filepath_or_buffer, encoding=None, compression=None, mode=None -): - - if mode is None: - mode = "rb" - - fs = gcsfs.GCSFileSystem() - filepath_or_buffer = fs.open(filepath_or_buffer, mode) - return filepath_or_buffer, None, compression, True diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 33747d2a6dd83..4b0d91773d6d2 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -8,7 +8,7 @@ from pandas import DataFrame, get_option -from pandas.io.common import get_filepath_or_buffer, is_gcs_url, is_s3_url +from pandas.io.common import get_filepath_or_buffer, is_fsspec_url def get_engine(engine: str) -> "BaseImpl": @@ -157,13 +157,13 @@ def write( if partition_cols is not None: kwargs["file_scheme"] = "hive" - if is_s3_url(path) or is_gcs_url(path): + if is_fsspec_url(path): + import fsspec + # if path is s3:// or gs:// we need to open the file in 'wb' mode. # TODO: Support 'ab' - path, _, _, _ = get_filepath_or_buffer(path, mode="wb") - # And pass the opened file to the fastparquet internal impl. - kwargs["open_with"] = lambda path, _: path + kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open() else: path, _, _, _ = get_filepath_or_buffer(path) @@ -178,20 +178,8 @@ def write( ) def read(self, path, columns=None, **kwargs): - if is_s3_url(path): - from pandas.io.s3 import get_file_and_filesystem - - # When path is s3:// an S3File is returned. 
- # We need to retain the original path(str) while also - # pass the S3File().open function to fsatparquet impl. - s3, filesystem = get_file_and_filesystem(path) - try: - parquet_file = self.api.ParquetFile(path, open_with=filesystem.open) - finally: - s3.close() - else: - path, _, _, _ = get_filepath_or_buffer(path) - parquet_file = self.api.ParquetFile(path) + path, _, _, _ = get_filepath_or_buffer(path) + parquet_file = self.api.ParquetFile(path) return parquet_file.to_pandas(columns=columns, **kwargs) diff --git a/pandas/io/s3.py b/pandas/io/s3.py deleted file mode 100644 index 976c319f89d47..0000000000000 --- a/pandas/io/s3.py +++ /dev/null @@ -1,49 +0,0 @@ -""" s3 support for remote file interactivity """ -from typing import IO, Any, Optional, Tuple -from urllib.parse import urlparse as parse_url - -from pandas._typing import FilePathOrBuffer -from pandas.compat._optional import import_optional_dependency - -s3fs = import_optional_dependency( - "s3fs", extra="The s3fs package is required to handle s3 files." -) - - -def _strip_schema(url): - """Returns the url without the s3:// part""" - result = parse_url(url, allow_fragments=False) - return result.netloc + result.path - - -def get_file_and_filesystem( - filepath_or_buffer: FilePathOrBuffer, mode: Optional[str] = None -) -> Tuple[IO, Any]: - from botocore.exceptions import NoCredentialsError - - if mode is None: - mode = "rb" - - fs = s3fs.S3FileSystem(anon=False) - try: - file = fs.open(_strip_schema(filepath_or_buffer), mode) - except (FileNotFoundError, NoCredentialsError): - # boto3 has troubles when trying to access a public file - # when credentialed... - # An OSError is raised if you have credentials, but they - # aren't valid for that bucket. - # A NoCredentialsError is raised if you don't have creds - # for that bucket. 
- fs = s3fs.S3FileSystem(anon=True) - file = fs.open(_strip_schema(filepath_or_buffer), mode) - return file, fs - - -def get_filepath_or_buffer( - filepath_or_buffer: FilePathOrBuffer, - encoding: Optional[str] = None, - compression: Optional[str] = None, - mode: Optional[str] = None, -) -> Tuple[IO, Optional[str], Optional[str], bool]: - file, _fs = get_file_and_filesystem(filepath_or_buffer, mode=mode) - return file, None, compression, True diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py new file mode 100644 index 0000000000000..8bba306672c38 --- /dev/null +++ b/pandas/tests/io/test_fsspec.py @@ -0,0 +1,70 @@ +import gc + +import numpy as np +import pytest + +from pandas import DataFrame, date_range, read_csv +import pandas._testing as tm +from pandas.util import _test_decorators as td + +from pandas.io.common import is_fsspec_url + +df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } +) +text = df1.to_csv(index=False).encode() + + +@pytest.fixture +@td.skip_if_installed("fsspec") +def cleared_fs(): + import fsspec + + memfs = fsspec.filesystem("memory") + try: + yield memfs + finally: + memfs.store.clear() + + +def test_is_fsspec_url(): + assert is_fsspec_url("gcs://pandas/somethingelse.com") + assert is_fsspec_url("gs://pandas/somethingelse.com") + assert not is_fsspec_url("random:pandas/somethingelse.com") + + +def test_read_csv(cleared_fs): + from fsspec.implementations.memory import MemoryFile + + cleared_fs.store["test/test.csv"] = MemoryFile(data=text) + df2 = read_csv("memory://test/test.csv", parse_dates=["dt"]) + + tm.assert_frame_equal(df1, df2) + + +def test_to_csv(cleared_fs): + df1.to_csv("memory://test/test.csv", index=True) + gc.collect() # pandas does not explicitly close file buffers + df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) + + tm.assert_frame_equal(df1, df2) + + +@td.skip_if_no("fastparquet") +def test_to_parquet_new_file(monkeypatch): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" + df1.to_parquet( + "memory://test/test.csv", index=True, engine="fastparquet", compression=None + ) + + +@td.skip_if_installed("fsspec") +def test_not_present_exception(): + with pytest.raises(ImportError) as e: + read_csv("memory://test/test.csv") + assert "fsspec library is required" in str(e.value) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py deleted file mode 100644 index f9282487e559a..0000000000000 --- a/pandas/tests/io/test_gcs.py +++ /dev/null @@ -1,115 +0,0 @@ -from io import StringIO -import os - -import numpy as np -import pytest - -from pandas import DataFrame, date_range, read_csv -import pandas._testing as tm -from pandas.util import _test_decorators as td - -from pandas.io.common import is_gcs_url - - -def test_is_gcs_url(): - assert is_gcs_url("gcs://pandas/somethingelse.com") - assert is_gcs_url("gs://pandas/somethingelse.com") - assert not is_gcs_url("s3://pandas/somethingelse.com") - - -@td.skip_if_no("gcsfs") -def test_read_csv_gcs(monkeypatch): - df1 = DataFrame( - { - "int": [1, 3], - "float": [2.0, np.nan], - "str": ["t", "s"], - "dt": date_range("2018-06-18", periods=2), - } - ) - - class MockGCSFileSystem: - def open(self, path, mode, *args): - return StringIO(df1.to_csv(index=False)) - - monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) - df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) - - tm.assert_frame_equal(df1, df2) - - 
-@td.skip_if_no("gcsfs") -def test_to_csv_gcs(monkeypatch): - df1 = DataFrame( - { - "int": [1, 3], - "float": [2.0, np.nan], - "str": ["t", "s"], - "dt": date_range("2018-06-18", periods=2), - } - ) - s = StringIO() - - class MockGCSFileSystem: - def open(self, path, mode, *args): - return s - - monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) - df1.to_csv("gs://test/test.csv", index=True) - df2 = read_csv(StringIO(s.getvalue()), parse_dates=["dt"], index_col=0) - - tm.assert_frame_equal(df1, df2) - - -@td.skip_if_no("fastparquet") -@td.skip_if_no("gcsfs") -def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): - """Regression test for writing to a not-yet-existent GCS Parquet file.""" - df1 = DataFrame( - { - "int": [1, 3], - "float": [2.0, np.nan], - "str": ["t", "s"], - "dt": date_range("2018-06-18", periods=2), - } - ) - - class MockGCSFileSystem: - def open(self, path, mode="r", *args): - if "w" not in mode: - raise FileNotFoundError - return open(os.path.join(tmpdir, "test.parquet"), mode) - - monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) - df1.to_parquet( - "gs://test/test.csv", index=True, engine="fastparquet", compression=None - ) - - -@td.skip_if_no("gcsfs") -def test_gcs_get_filepath_or_buffer(monkeypatch): - df1 = DataFrame( - { - "int": [1, 3], - "float": [2.0, np.nan], - "str": ["t", "s"], - "dt": date_range("2018-06-18", periods=2), - } - ) - - def mock_get_filepath_or_buffer(*args, **kwargs): - return (StringIO(df1.to_csv(index=False)), None, None, False) - - monkeypatch.setattr( - "pandas.io.gcs.get_filepath_or_buffer", mock_get_filepath_or_buffer - ) - df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) - - tm.assert_frame_equal(df1, df2) - - -@td.skip_if_installed("gcsfs") -def test_gcs_not_present_exception(): - with pytest.raises(ImportError) as e: - read_csv("gs://test/test.csv") - assert "gcsfs library is required" in str(e.value) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 584a545769c4c..3fbb6133a5f5b 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -456,42 +456,10 @@ def mock_urlopen_read(*args, **kwargs): tm.assert_frame_equal(df, result) -@td.skip_if_no("gcsfs") -@pytest.mark.parametrize("mockurl", ["gs://gcs.com", "gcs://gcs.com"]) -def test_pickle_gcsurl_roundtrip(monkeypatch, mockurl): - with tm.ensure_clean() as path: - - class MockGCSFileSystem: - def __init__(self, *args, **kwargs): - pass - - def open(self, *args): - mode = args[1] or None - f = open(path, mode) - return f - - monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) - df = tm.makeDataFrame() - df.to_pickle(mockurl) - result = pd.read_pickle(mockurl) - tm.assert_frame_equal(df, result) - - -@td.skip_if_no("s3fs") -@pytest.mark.parametrize("mockurl", ["s3://s3.com", "s3n://s3.com", "s3a://s3.com"]) -def test_pickle_s3url_roundtrip(monkeypatch, mockurl): - with tm.ensure_clean() as path: - - class MockS3FileSystem: - def __init__(self, *args, **kwargs): - pass - - def open(self, *args): - mode = args[1] or None - f = open(path, mode) - return f - - monkeypatch.setattr("s3fs.S3FileSystem", MockS3FileSystem) +@td.skip_if_no("fsspec") +def test_pickle_fsspec_roundtrip(): + with tm.ensure_clean(): + mockurl = "memory://afile" df = tm.makeDataFrame() df.to_pickle(mockurl) result = pd.read_pickle(mockurl) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py deleted file mode 100644 index 04c6979596eca..0000000000000 --- a/pandas/tests/io/test_s3.py +++ /dev/null @@ -1,25 
+0,0 @@ -from io import BytesIO - -import pytest - -from pandas import read_csv - -from pandas.io.common import is_s3_url - - -class TestS3URL: - def test_is_s3_url(self): - assert is_s3_url("s3://pandas/somethingelse.com") - assert not is_s3_url("s4://pandas/somethingelse.com") - - -def test_streaming_s3_objects(): - # GH17135 - # botocore gained iteration support in 1.10.47, can now be used in read_* - pytest.importorskip("botocore", minversion="1.10.47") - from botocore.response import StreamingBody - - data = [b"foo,bar,baz\n1,2,3\n4,5,6\n", b"just,the,header\n"] - for el in data: - body = StreamingBody(BytesIO(el), content_length=len(el)) - read_csv(body) From 9e6d3b205c27401e6737b5c99a486687e5ebacd8 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 21 May 2020 14:37:33 -0400 Subject: [PATCH 03/34] readd and adapt s3/gcs tests --- pandas/io/parquet.py | 16 +++-- pandas/tests/io/test_fsspec.py | 23 ++++++- pandas/tests/io/test_gcs.py | 108 ++++++++++++++++++++++++++++++++ pandas/tests/io/test_parquet.py | 5 +- pandas/tests/io/test_s3.py | 17 +++++ 5 files changed, 154 insertions(+), 15 deletions(-) create mode 100644 pandas/tests/io/test_gcs.py create mode 100644 pandas/tests/io/test_s3.py diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index a6b447987373f..481f77cdbccdd 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -102,10 +102,11 @@ def write( # write_to_dataset does not support a file-like object when # a directory path is used, so just pass the path string. if partition_cols is not None: - if is_fsspec_url(path) and 'filesystem' not in kwargs: + if is_fsspec_url(path) and "filesystem" not in kwargs: import fsspec.core + fs, path = fsspec.core.url_to_fs(path) - kwargs['filesystem'] = fs + kwargs["filesystem"] = fs self.api.parquet.write_to_dataset( table, path, @@ -121,16 +122,13 @@ def write( file_obj_or_path.close() def read(self, path, columns=None, **kwargs): - if is_fsspec_url(path) and 'filesystem' not in kwargs: + if is_fsspec_url(path) and "filesystem" not in kwargs: import fsspec.core + fs, path = fsspec.core.url_to_fs(path) - parquet_ds = self.api.parquet.ParquetDataset( - path, filesystem=fs, **kwargs - ) + parquet_ds = self.api.parquet.ParquetDataset(path, filesystem=fs, **kwargs) else: - parquet_ds = self.api.parquet.ParquetDataset( - path, **kwargs - ) + parquet_ds = self.api.parquet.ParquetDataset(path, **kwargs) kwargs["columns"] = columns result = parquet_ds.read_pandas(**kwargs).to_pandas() diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 8bba306672c38..cdb0ce4d6dddf 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from pandas import DataFrame, date_range, read_csv +from pandas import DataFrame, date_range, read_csv, read_parquet import pandas._testing as tm from pandas.util import _test_decorators as td @@ -21,7 +21,6 @@ @pytest.fixture -@td.skip_if_installed("fsspec") def cleared_fs(): import fsspec @@ -38,6 +37,7 @@ def test_is_fsspec_url(): assert not is_fsspec_url("random:pandas/somethingelse.com") +@td.skip_if_no("fsspec") def test_read_csv(cleared_fs): from fsspec.implementations.memory import MemoryFile @@ -47,6 +47,7 @@ def test_read_csv(cleared_fs): tm.assert_frame_equal(df1, df2) +@td.skip_if_no("fsspec") def test_to_csv(cleared_fs): df1.to_csv("memory://test/test.csv", index=True) gc.collect() # pandas does not explicitly close file buffers @@ -56,6 +57,7 @@ def test_to_csv(cleared_fs): 
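# ---------------------------------------------------------------------
# A self-contained sketch (not part of the patch) of the round-trip
# pattern these tests rely on: fsspec's "memory" filesystem stands in for
# a remote store, so to_csv/read_csv exercise the full URL-dispatch path
# without any S3/GCS credentials. Assumes a pandas build with this series
# applied and fsspec>=0.7.4; the "demo" paths are placeholders.
import fsspec
import pandas as pd

df = pd.DataFrame({"a": [1, 2], "b": [3.0, 4.0]})
df.to_csv("memory://demo/demo.csv", index=False)  # write through fsspec
back = pd.read_csv("memory://demo/demo.csv")  # read back through fsspec
pd.testing.assert_frame_equal(df, back)

# the memory filesystem's store is process-global, which is why the
# cleared_fs fixture above empties it between tests; do the same here
fsspec.filesystem("memory").store.clear()
# ---------------------------------------------------------------------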
@td.skip_if_no("fastparquet") +@td.skip_if_no("fsspec") def test_to_parquet_new_file(monkeypatch): """Regression test for writing to a not-yet-existent GCS Parquet file.""" df1.to_parquet( @@ -63,6 +65,23 @@ def test_to_parquet_new_file(monkeypatch): ) +@td.skip_if_no("s3fs") +def test_from_s3_csv(s3_resource, tips_file): + tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file)) + # the following are decompressed by pandas, not fsspec + tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz"), read_csv(tips_file)) + tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2"), read_csv(tips_file)) + + +@td.skip_if_no("s3fs") +@td.skip_if_no("fastparquet") +def test_s3_parquet(s3_resource): + fn = "s3://pandas-test/test.parquet" + df1.to_parquet(fn, index=False, engine="fastparquet", compression=None) + df2 = read_parquet(fn, engine="fastparquet") + tm.assert_equal(df1, df2) + + @td.skip_if_installed("fsspec") def test_not_present_exception(): with pytest.raises(ImportError) as e: diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py new file mode 100644 index 0000000000000..5b207cbc66036 --- /dev/null +++ b/pandas/tests/io/test_gcs.py @@ -0,0 +1,108 @@ +from io import BytesIO +import os + +import numpy as np +import pytest + +from pandas import DataFrame, date_range, read_csv +import pandas._testing as tm +from pandas.util import _test_decorators as td + + +@td.skip_if_no("gcsfs") +def test_read_csv_gcs(monkeypatch): + from fsspec import AbstractFileSystem + from fsspec.registry import _registry + + _registry.clear() # noqa # remove state + + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) + + class MockGCSFileSystem(AbstractFileSystem): + def open(*args, **kwargs): + return BytesIO(df1.to_csv(index=False).encode()) + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df2 = read_csv("gs://test/test.csv", parse_dates=["dt"]) + + tm.assert_frame_equal(df1, df2) + + +@td.skip_if_no("gcsfs") +def test_to_csv_gcs(monkeypatch): + from fsspec import AbstractFileSystem + from fsspec.registry import _registry + + _registry.clear() # noqa # remove state + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) + s = BytesIO() + s.close = lambda: True + + class MockGCSFileSystem(AbstractFileSystem): + def open(*args, **kwargs): + s.seek(0) + return s + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df1.to_csv("gs://test/test.csv", index=True) + + def mock_get_filepath_or_buffer(*args, **kwargs): + return BytesIO(df1.to_csv(index=True).encode()), None, None, False + + monkeypatch.setattr( + "pandas.io.common.get_filepath_or_buffer", mock_get_filepath_or_buffer + ) + + df2 = read_csv("gs://test/test.csv", parse_dates=["dt"], index_col=0) + + tm.assert_frame_equal(df1, df2) + + +@td.skip_if_no("fastparquet") +@td.skip_if_no("gcsfs") +def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" + from fsspec import AbstractFileSystem + from fsspec.registry import _registry + + _registry.clear() # noqa # remove state + df1 = DataFrame( + { + "int": [1, 3], + "float": [2.0, np.nan], + "str": ["t", "s"], + "dt": date_range("2018-06-18", periods=2), + } + ) + + class MockGCSFileSystem(AbstractFileSystem): + def open(self, path, mode="r", *args): + if "w" not in mode: + raise FileNotFoundError + return 
open(os.path.join(tmpdir, "test.parquet"), mode) + + monkeypatch.setattr("gcsfs.GCSFileSystem", MockGCSFileSystem) + df1.to_parquet( + "gs://test/test.csv", index=True, engine="fastparquet", compression=None + ) + + +@td.skip_if_installed("gcsfs") +def test_gcs_not_present_exception(): + with pytest.raises(ImportError) as e: + read_csv("gs://test/test.csv") + assert "gcsfs library is required" in str(e.value) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 02fcb48637749..95a777dedb01e 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -556,10 +556,7 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): pa, expected=expected_df, path="s3://pandas-test/parquet_dir", - write_kwargs={ - "partition_cols": partition_col, - "compression": None, - }, + write_kwargs={"partition_cols": partition_col, "compression": None}, check_like=True, repeat=1, ) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py new file mode 100644 index 0000000000000..a76be9465f62a --- /dev/null +++ b/pandas/tests/io/test_s3.py @@ -0,0 +1,17 @@ +from io import BytesIO + +import pytest + +from pandas import read_csv + + +def test_streaming_s3_objects(): + # GH17135 + # botocore gained iteration support in 1.10.47, can now be used in read_* + pytest.importorskip("botocore", minversion="1.10.47") + from botocore.response import StreamingBody + + data = [b"foo,bar,baz\n1,2,3\n4,5,6\n", b"just,the,header\n"] + for el in data: + body = StreamingBody(BytesIO(el), content_length=len(el)) + read_csv(body) From 4564c8d303bb298b936d9d3fe31231d760f06331 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 21 May 2020 14:43:38 -0400 Subject: [PATCH 04/34] remove gc from test --- pandas/tests/io/test_fsspec.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index cdb0ce4d6dddf..cf80166caf8a6 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -1,5 +1,3 @@ -import gc - import numpy as np import pytest @@ -50,7 +48,6 @@ def test_read_csv(cleared_fs): @td.skip_if_no("fsspec") def test_to_csv(cleared_fs): df1.to_csv("memory://test/test.csv", index=True) - gc.collect() # pandas does not explicitly close file buffers df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) tm.assert_frame_equal(df1, df2) From 0654537ba1106948bc9a81384bfdb4b01d3db58a Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 21 May 2020 15:18:05 -0400 Subject: [PATCH 05/34] Simpler is_fsspec --- pandas/io/common.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index a0dbd500affea..89a691ebdef0b 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -141,12 +141,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool: Returns true if fsspec is installed and the given URL looks like something fsspec can handle """ - try: - import fsspec # noqa: F401 - - return isinstance(url, str) and ("::" in url or "://" in url) - except ImportError: - return False + return isinstance(url, str) and ("::" in url or "://" in url) def get_filepath_or_buffer( From 8d45cbbf959807e7744b351eaa5fef0c83c375dc Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 21 May 2020 15:34:05 -0400 Subject: [PATCH 06/34] add test --- pandas/tests/io/test_fsspec.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/pandas/tests/io/test_fsspec.py 
b/pandas/tests/io/test_fsspec.py index cf80166caf8a6..57acca4840f32 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -45,6 +45,25 @@ def test_read_csv(cleared_fs): tm.assert_frame_equal(df1, df2) +@td.skip_if_no("fsspec") +def test_reasonable_error(monkeypatch): + from fsspec.registry import _registry, known_implementations + + _registry.clear() + with pytest.raises(ValueError) as e: + read_csv("nosuchprotocol://test/test.csv") + assert "nosuchprotocol" in str(e.value) + err_mgs = "test error messgae" + monkeypatch.setitem( + known_implementations, + "couldexist", + {"class": "unimportable.CouldExist", "err": err_mgs}, + ) + with pytest.raises(ImportError) as e: + read_csv("couldexist://test/test.csv") + assert err_mgs in str(e.value) + + @td.skip_if_no("fsspec") def test_to_csv(cleared_fs): df1.to_csv("memory://test/test.csv", index=True) From 006e73651b1b8f58d46e16a052cedec21cde20ef Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 28 May 2020 09:24:51 -0400 Subject: [PATCH 07/34] Answered most points --- environment.yml | 2 +- pandas/io/common.py | 5 +++-- pandas/io/parquet.py | 5 +---- pandas/tests/io/test_common.py | 6 ++++++ pandas/tests/io/test_fsspec.py | 13 +++---------- pandas/tests/io/test_gcs.py | 12 ++++++------ 6 files changed, 20 insertions(+), 23 deletions(-) diff --git a/environment.yml b/environment.yml index 4c3e10c31efd4..65d3b83538312 100644 --- a/environment.yml +++ b/environment.yml @@ -98,7 +98,7 @@ dependencies: - pyqt>=5.9.2 # pandas.read_clipboard - pytables>=3.4.3 # pandas.read_hdf, DataFrame.to_hdf - - s3fs # pandas.read_csv... when using 's3://...' path (also brings in fsspec) + - s3fs>=0.4.0 # pandas.read_csv... when using 's3://...' path (also brings in fsspec) - sqlalchemy # pandas.read_sql, DataFrame.to_sql - xarray # DataFrame.to_xarray - cftime # Needed for downstream xarray.CFTimeIndex test diff --git a/pandas/io/common.py b/pandas/io/common.py index 89a691ebdef0b..815558b662884 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -149,7 +149,7 @@ def get_filepath_or_buffer( encoding: Optional[str] = None, compression: Optional[str] = None, mode: Optional[str] = None, - **storage_options, + **storage_options: Dict[str, Any], ): """ If the filepath_or_buffer is a url, translate and return the buffer. @@ -162,7 +162,8 @@ def get_filepath_or_buffer( compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional - storage_options: passed on to fsspec, if using it + storage_options: dict + passed on to fsspec, if using it; this is not yet accessed by the public API Returns ------- diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 481f77cdbccdd..ebb9f8f654f1c 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -172,7 +172,7 @@ def write( if is_fsspec_url(path): import fsspec - # if path is s3:// or gs:// we need to open the file in 'wb' mode. + # if filesystem is provided by fsspec, file must be opened in 'wb' mode. kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open() else: path, _, _, _ = get_filepath_or_buffer(path) @@ -191,9 +191,6 @@ def read(self, path, columns=None, **kwargs): if is_fsspec_url(path): import fsspec - # if path is s3:// or gs:// we need to open the file in 'wb' mode. 
- # TODO: Support 'ab' - open_with = lambda path, _: fsspec.open(path, "rb").open() parquet_file = self.api.ParquetFile(path, open_with=open_with) else: diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py index b27b028694d20..66293776325c1 100644 --- a/pandas/tests/io/test_common.py +++ b/pandas/tests/io/test_common.py @@ -367,3 +367,9 @@ def test_unknown_engine(self): df.to_csv(path) with pytest.raises(ValueError, match="Unknown engine"): pd.read_csv(path, engine="pyt") + + +def test_is_fsspec_url(): + assert icom.is_fsspec_url("gcs://pandas/somethingelse.com") + assert icom.is_fsspec_url("gs://pandas/somethingelse.com") + assert not icom.is_fsspec_url("random:pandas/somethingelse.com") diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 57acca4840f32..47ddf9223c4d8 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -5,8 +5,6 @@ import pandas._testing as tm from pandas.util import _test_decorators as td -from pandas.io.common import is_fsspec_url - df1 = DataFrame( { "int": [1, 3], @@ -29,12 +27,6 @@ def cleared_fs(): memfs.store.clear() -def test_is_fsspec_url(): - assert is_fsspec_url("gcs://pandas/somethingelse.com") - assert is_fsspec_url("gs://pandas/somethingelse.com") - assert not is_fsspec_url("random:pandas/somethingelse.com") - - @td.skip_if_no("fsspec") def test_read_csv(cleared_fs): from fsspec.implementations.memory import MemoryFile @@ -47,9 +39,10 @@ def test_read_csv(cleared_fs): @td.skip_if_no("fsspec") def test_reasonable_error(monkeypatch): - from fsspec.registry import _registry, known_implementations + from fsspec.registry import known_implementations + from fsspec import registry - _registry.clear() + registry.target.clear() with pytest.raises(ValueError) as e: read_csv("nosuchprotocol://test/test.csv") assert "nosuchprotocol" in str(e.value) diff --git a/pandas/tests/io/test_gcs.py b/pandas/tests/io/test_gcs.py index 5b207cbc66036..4d93119ffa3f5 100644 --- a/pandas/tests/io/test_gcs.py +++ b/pandas/tests/io/test_gcs.py @@ -12,9 +12,9 @@ @td.skip_if_no("gcsfs") def test_read_csv_gcs(monkeypatch): from fsspec import AbstractFileSystem - from fsspec.registry import _registry + from fsspec import registry - _registry.clear() # noqa # remove state + registry.target.clear() # noqa # remove state df1 = DataFrame( { @@ -38,9 +38,9 @@ def open(*args, **kwargs): @td.skip_if_no("gcsfs") def test_to_csv_gcs(monkeypatch): from fsspec import AbstractFileSystem - from fsspec.registry import _registry + from fsspec import registry - _registry.clear() # noqa # remove state + registry.target.clear() # noqa # remove state df1 = DataFrame( { "int": [1, 3], @@ -77,9 +77,9 @@ def mock_get_filepath_or_buffer(*args, **kwargs): def test_to_parquet_gcs_new_file(monkeypatch, tmpdir): """Regression test for writing to a not-yet-existent GCS Parquet file.""" from fsspec import AbstractFileSystem - from fsspec.registry import _registry + from fsspec import registry - _registry.clear() # noqa # remove state + registry.target.clear() # noqa # remove state df1 = DataFrame( { "int": [1, 3], From 724ebd86616b2e0cc173157acd8d85011802ae6c Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 28 May 2020 14:47:51 -0400 Subject: [PATCH 08/34] Implemented suggestions Added test with filesystem= Added whatsnew Changed imports, updated comments --- doc/source/getting_started/install.rst | 5 +++-- doc/source/whatsnew/v1.1.0.rst | 14 ++++++++++++++ environment.yml | 3 ++- pandas/compat/_optional.py | 5 +++-- 
pandas/io/common.py | 4 ++-- pandas/io/parquet.py | 10 ++++++++-- pandas/tests/io/test_fsspec.py | 16 +++++----------- pandas/tests/io/test_parquet.py | 7 +++++++ 8 files changed, 44 insertions(+), 20 deletions(-) diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst index ba99aaa9f430c..612bfc3cb30cf 100644 --- a/doc/source/getting_started/install.rst +++ b/doc/source/getting_started/install.rst @@ -267,8 +267,9 @@ SQLAlchemy 1.1.4 SQL support for databases other tha SciPy 0.19.0 Miscellaneous statistical functions XLsxWriter 0.9.8 Excel writing blosc Compression for HDF5 +fsspec 0.7.4 File operations handling fastparquet 0.3.2 Parquet reading / writing -gcsfs 0.2.2 Google Cloud Storage access +gcsfs 0.6.0 Google Cloud Storage access html5lib HTML parser for read_html (see :ref:`note `) lxml 3.8.0 HTML parser for read_html (see :ref:`note `) matplotlib 2.2.2 Visualization @@ -282,7 +283,7 @@ pyreadstat SPSS files (.sav) reading pytables 3.4.3 HDF5 reading / writing pyxlsb 1.0.6 Reading for xlsb files qtpy Clipboard I/O -s3fs 0.3.0 Amazon S3 access +s3fs 0.4.0 Amazon S3 access tabulate 0.8.3 Printing in Markdown-friendly format (see `tabulate`_) xarray 0.8.2 pandas-like API for N-dimensional data xclip Clipboard I/O on linux diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 4605c14643fa2..57e15de089ad5 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -197,6 +197,20 @@ For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. .. _whatsnew_110.enhancements.other: +fsspec now used for filesystem handling +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +For reading and writing to filesystems other than local and reading from HTTP(S), +the optional dependency ``fsspec`` will be used to dispatch operations. This will give unchanged +functionality for S3 and GCS storage, which were already supported, but also add +support for several other storage implementations such as Azure Datalake and Blob, +SSH, FTP, dropbox and github. For docs and capabilities, see the `fsspec docs`_. + +In the future, we will implement a way to pass parameters to the invoked +filesystem instances. + +.. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/ + Other enhancements ^^^^^^^^^^^^^^^^^^ diff --git a/environment.yml b/environment.yml index 65d3b83538312..a3267097521bf 100644 --- a/environment.yml +++ b/environment.yml @@ -98,7 +98,8 @@ dependencies: - pyqt>=5.9.2 # pandas.read_clipboard - pytables>=3.4.3 # pandas.read_hdf, DataFrame.to_hdf - - s3fs>=0.4.0 # pandas.read_csv... when using 's3://...' path (also brings in fsspec) + - s3fs>=0.4.0 # pandas.read_csv... when using 's3://...' 
path + - fsspec>=0.7.4 # for generic remote file operations - sqlalchemy # pandas.read_sql, DataFrame.to_sql - xarray # DataFrame.to_xarray - cftime # Needed for downstream xarray.CFTimeIndex test diff --git a/pandas/compat/_optional.py b/pandas/compat/_optional.py index c5fd294699c45..6bd74860c77dc 100644 --- a/pandas/compat/_optional.py +++ b/pandas/compat/_optional.py @@ -8,8 +8,9 @@ VERSIONS = { "bs4": "4.6.0", "bottleneck": "1.2.1", + "fsspec": "0.7.4", "fastparquet": "0.3.2", - "gcsfs": "0.2.2", + "gcsfs": "0.6.0", "lxml.etree": "3.8.0", "matplotlib": "2.2.2", "numexpr": "2.6.2", @@ -20,7 +21,7 @@ "pytables": "3.4.3", "pytest": "5.0.1", "pyxlsb": "1.0.6", - "s3fs": "0.3.0", + "s3fs": "0.4.0", "scipy": "1.2.0", "sqlalchemy": "1.1.4", "tables": "3.4.3", diff --git a/pandas/io/common.py b/pandas/io/common.py index 815558b662884..9fd68ab463b45 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -31,6 +31,7 @@ from pandas._typing import FilePathOrBuffer from pandas.compat import _get_lzma_file, _import_lzma +from pandas.compat._optional import import_optional_dependency from pandas.core.dtypes.common import is_file_like @@ -185,12 +186,11 @@ def get_filepath_or_buffer( return reader, encoding, compression, True if is_fsspec_url(filepath_or_buffer): - import fsspec + fsspec = import_optional_dependency('fsspec') file_obj = fsspec.open( filepath_or_buffer, mode=mode or "rb", **storage_options ).open() - # TODO: both fsspec and pandas handle compression and encoding return file_obj, encoding, compression, True if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index ebb9f8f654f1c..fb7f6a6b6d87d 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -102,7 +102,10 @@ def write( # write_to_dataset does not support a file-like object when # a directory path is used, so just pass the path string. if partition_cols is not None: + # user may provide filesystem= with an instance, in which case it takes priority + # and fsspec need not analyse the path if is_fsspec_url(path) and "filesystem" not in kwargs: + import_optional_dependency('fsspec') import fsspec.core fs, path = fsspec.core.url_to_fs(path) @@ -123,12 +126,15 @@ def write( def read(self, path, columns=None, **kwargs): if is_fsspec_url(path) and "filesystem" not in kwargs: + import_optional_dependency('fsspec') import fsspec.core fs, path = fsspec.core.url_to_fs(path) parquet_ds = self.api.parquet.ParquetDataset(path, filesystem=fs, **kwargs) else: parquet_ds = self.api.parquet.ParquetDataset(path, **kwargs) + # this key valid for ParquetDataset but not read_pandas + kwargs.pop('filesystem', None) kwargs["columns"] = columns result = parquet_ds.read_pandas(**kwargs).to_pandas() @@ -170,7 +176,7 @@ def write( kwargs["file_scheme"] = "hive" if is_fsspec_url(path): - import fsspec + fsspec = import_optional_dependency('fsspec') # if filesystem is provided by fsspec, file must be opened in 'wb' mode. 
kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open() @@ -189,7 +195,7 @@ def write( def read(self, path, columns=None, **kwargs): if is_fsspec_url(path): - import fsspec + fsspec = import_optional_dependency('fsspec') open_with = lambda path, _: fsspec.open(path, "rb").open() parquet_file = self.api.ParquetFile(path, open_with=open_with) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 47ddf9223c4d8..0ac6133598f90 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -18,16 +18,13 @@ @pytest.fixture def cleared_fs(): - import fsspec + fsspec = pytest.importorskip('fsspec') memfs = fsspec.filesystem("memory") - try: - yield memfs - finally: - memfs.store.clear() + yield memfs + memfs.store.clear() -@td.skip_if_no("fsspec") def test_read_csv(cleared_fs): from fsspec.implementations.memory import MemoryFile @@ -37,8 +34,7 @@ def test_read_csv(cleared_fs): tm.assert_frame_equal(df1, df2) -@td.skip_if_no("fsspec") -def test_reasonable_error(monkeypatch): +def test_reasonable_error(monkeypatch, cleared_fs): from fsspec.registry import known_implementations from fsspec import registry @@ -57,7 +53,6 @@ def test_reasonable_error(monkeypatch): assert err_mgs in str(e.value) -@td.skip_if_no("fsspec") def test_to_csv(cleared_fs): df1.to_csv("memory://test/test.csv", index=True) df2 = read_csv("memory://test/test.csv", parse_dates=["dt"], index_col=0) @@ -66,8 +61,7 @@ def test_to_csv(cleared_fs): @td.skip_if_no("fastparquet") -@td.skip_if_no("fsspec") -def test_to_parquet_new_file(monkeypatch): +def test_to_parquet_new_file(monkeypatch, cleared_fs): """Regression test for writing to a not-yet-existent GCS Parquet file.""" df1.to_parquet( "memory://test/test.csv", index=True, engine="fastparquet", compression=None diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 95a777dedb01e..85de5f0aade25 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -537,6 +537,13 @@ def test_categorical(self, pa): check_round_trip(df, pa, expected=expected) def test_s3_roundtrip(self, df_compat, s3_resource, pa): + s3fs = pytest.importorskip("s3fs") + s3 = s3fs.S3FileSystem() + kw = dict(filesystem=s3) + check_round_trip(df_compat, pa, path="pandas-test/pyarrow.parquet", + read_kwargs=kw, write_kwargs=kw) + + def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa): # GH #19134 check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet") From 9da16890da6ca54693fc437a4fbc76df45755ef9 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 28 May 2020 15:21:18 -0400 Subject: [PATCH 09/34] lint --- pandas/io/common.py | 2 +- pandas/io/parquet.py | 14 +++++++------- pandas/tests/io/test_fsspec.py | 2 +- pandas/tests/io/test_parquet.py | 9 +++++++-- 4 files changed, 16 insertions(+), 11 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 9fd68ab463b45..1d4f21d348f28 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -186,7 +186,7 @@ def get_filepath_or_buffer( return reader, encoding, compression, True if is_fsspec_url(filepath_or_buffer): - fsspec = import_optional_dependency('fsspec') + fsspec = import_optional_dependency("fsspec") file_obj = fsspec.open( filepath_or_buffer, mode=mode or "rb", **storage_options diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index fb7f6a6b6d87d..c1172061e39f5 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -102,10 +102,10 @@ def write( # write_to_dataset does not 
support a file-like object when # a directory path is used, so just pass the path string. if partition_cols is not None: - # user may provide filesystem= with an instance, in which case it takes priority - # and fsspec need not analyse the path + # user may provide filesystem= with an instance, in which case it takes + # priority and fsspec need not analyse the path if is_fsspec_url(path) and "filesystem" not in kwargs: - import_optional_dependency('fsspec') + import_optional_dependency("fsspec") import fsspec.core fs, path = fsspec.core.url_to_fs(path) @@ -126,7 +126,7 @@ def write( def read(self, path, columns=None, **kwargs): if is_fsspec_url(path) and "filesystem" not in kwargs: - import_optional_dependency('fsspec') + import_optional_dependency("fsspec") import fsspec.core fs, path = fsspec.core.url_to_fs(path) @@ -134,7 +134,7 @@ def read(self, path, columns=None, **kwargs): else: parquet_ds = self.api.parquet.ParquetDataset(path, **kwargs) # this key valid for ParquetDataset but not read_pandas - kwargs.pop('filesystem', None) + kwargs.pop("filesystem", None) kwargs["columns"] = columns result = parquet_ds.read_pandas(**kwargs).to_pandas() @@ -176,7 +176,7 @@ def write( kwargs["file_scheme"] = "hive" if is_fsspec_url(path): - fsspec = import_optional_dependency('fsspec') + fsspec = import_optional_dependency("fsspec") # if filesystem is provided by fsspec, file must be opened in 'wb' mode. kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open() @@ -195,7 +195,7 @@ def write( def read(self, path, columns=None, **kwargs): if is_fsspec_url(path): - fsspec = import_optional_dependency('fsspec') + fsspec = import_optional_dependency("fsspec") open_with = lambda path, _: fsspec.open(path, "rb").open() parquet_file = self.api.ParquetFile(path, open_with=open_with) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 0ac6133598f90..ea3c94ca40beb 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -18,7 +18,7 @@ @pytest.fixture def cleared_fs(): - fsspec = pytest.importorskip('fsspec') + fsspec = pytest.importorskip("fsspec") memfs = fsspec.filesystem("memory") yield memfs diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 85de5f0aade25..ddd70c158c492 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -540,8 +540,13 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa): s3fs = pytest.importorskip("s3fs") s3 = s3fs.S3FileSystem() kw = dict(filesystem=s3) - check_round_trip(df_compat, pa, path="pandas-test/pyarrow.parquet", - read_kwargs=kw, write_kwargs=kw) + check_round_trip( + df_compat, + pa, + path="pandas-test/pyarrow.parquet", + read_kwargs=kw, + write_kwargs=kw, + ) def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa): # GH #19134 From a59541184b30fd73c3cf2cb53add37b40bfabc79 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 29 May 2020 13:26:09 -0400 Subject: [PATCH 10/34] Add versions info --- doc/source/whatsnew/v1.1.0.rst | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 57e15de089ad5..c6cc8d51d6a99 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -195,22 +195,22 @@ If needed you can adjust the bins with the argument ``offset`` (a Timedelta) tha For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`. -.. 
_whatsnew_110.enhancements.other: - fsspec now used for filesystem handling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ For reading and writing to filesystems other than local and reading from HTTP(S), -the optional dependency ``fsspec`` will be used to dispatch operations. This will give unchanged +the optional dependency ``fsspec`` will be used to dispatch operations (`GH #33452`_). +This will give unchanged functionality for S3 and GCS storage, which were already supported, but also add support for several other storage implementations such as Azure Datalake and Blob, SSH, FTP, dropbox and github. For docs and capabilities, see the `fsspec docs`_. -In the future, we will implement a way to pass parameters to the invoked -filesystem instances. +.. _GH #33452: https://github.com/pandas-dev/pandas/issues/33452 .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/ +.. _whatsnew_110.enhancements.other: + Other enhancements ^^^^^^^^^^^^^^^^^^ @@ -286,7 +286,9 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | fastparquet | 0.3.2 | | +-----------------+-----------------+---------+ -| gcsfs | 0.2.2 | | +| fsspec | 0.7.4 | | ++-----------------+-----------------+---------+ +| gcsfs | 0.6.0 | | +-----------------+-----------------+---------+ | lxml | 3.8.0 | | +-----------------+-----------------+---------+ @@ -302,7 +304,7 @@ Optional libraries below the lowest tested version may still work, but are not c +-----------------+-----------------+---------+ | pytables | 3.4.3 | X | +-----------------+-----------------+---------+ -| s3fs | 0.3.0 | | +| s3fs | 0.4.0 | | +-----------------+-----------------+---------+ | scipy | 1.2.0 | X | +-----------------+-----------------+---------+ From 6dd1e92baa3db24543efc6e6441ab230ed33c85c Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 29 May 2020 14:28:23 -0400 Subject: [PATCH 11/34] Update some deps --- ci/deps/azure-36-locale.yaml | 1 + ci/deps/azure-37-locale.yaml | 1 + ci/deps/azure-windows-37.yaml | 1 + ci/deps/travis-36-cov.yaml | 1 + ci/deps/travis-36-locale.yaml | 1 + ci/deps/travis-37.yaml | 1 + 6 files changed, 6 insertions(+) diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 56da56b45b702..920cb6a17dcfe 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -17,6 +17,7 @@ dependencies: - beautifulsoup4 - gcsfs - html5lib + - fsspec>=0.7.4 - ipython - jinja2 - lxml diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 31155ac93931a..69125b74d5088 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -14,6 +14,7 @@ dependencies: # pandas dependencies - beautifulsoup4 + - fsspec>=0.7.4 - html5lib - ipython - jinja2 diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index e491fd57b240b..72453f954d1e3 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -15,6 +15,7 @@ dependencies: # pandas dependencies - beautifulsoup4 - bottleneck + - fsspec>=0.7.4 - gcsfs - html5lib - jinja2 diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index 2968c8f188d49..a5a3399d95428 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -18,6 +18,7 @@ dependencies: - cython>=0.29.16 - dask - fastparquet>=0.3.2 + - fsspec>=0.7.4 - gcsfs - geopandas - html5lib diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 3fc19f1bca084..a04f9e674fd5f 100644 
--- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -16,6 +16,7 @@ dependencies: - blosc=1.14.3 - python-blosc - fastparquet=0.3.2 + - fsspec>=0.7.4 - gcsfs=0.2.2 - html5lib - ipython diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index 986728d0a4a40..ef650c729fb29 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -13,6 +13,7 @@ dependencies: # pandas dependencies - botocore>=1.11 + - fsspec>=0.7.4 - numpy - python-dateutil - nomkl From 6e13df712e70fed5bcc8b1c396e1efc8fe429ca3 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 29 May 2020 15:03:11 -0400 Subject: [PATCH 12/34] issue link syntax --- doc/source/whatsnew/v1.1.0.rst | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index c6cc8d51d6a99..919ff048777bc 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -199,14 +199,12 @@ fsspec now used for filesystem handling ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ For reading and writing to filesystems other than local and reading from HTTP(S), -the optional dependency ``fsspec`` will be used to dispatch operations (`GH #33452`_). +the optional dependency ``fsspec`` will be used to dispatch operations (:issue:`33452`). This will give unchanged functionality for S3 and GCS storage, which were already supported, but also add support for several other storage implementations such as Azure Datalake and Blob, SSH, FTP, dropbox and github. For docs and capabilities, see the `fsspec docs`_. -.. _GH #33452: https://github.com/pandas-dev/pandas/issues/33452 - .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/ .. _whatsnew_110.enhancements.other: From 3262063c3e6b24300d227c1339a282f822ad4619 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 2 Jun 2020 09:41:23 -0400 Subject: [PATCH 13/34] More specific test versions --- ci/deps/azure-36-locale.yaml | 4 ++-- ci/deps/azure-37-locale.yaml | 2 +- ci/deps/azure-windows-37.yaml | 4 ++-- ci/deps/travis-36-cov.yaml | 4 ++-- ci/deps/travis-36-locale.yaml | 4 ++-- ci/deps/travis-36-slow.yaml | 3 ++- ci/deps/travis-37.yaml | 2 +- 7 files changed, 12 insertions(+), 11 deletions(-) diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 920cb6a17dcfe..fc22530041ba6 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - beautifulsoup4 - - gcsfs + - gcsfs>=0.6.0 - html5lib - fsspec>=0.7.4 - ipython @@ -32,7 +32,7 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs + - s3fs>=0.4.0 - scipy - xarray - xlrd diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 69125b74d5088..1f1c913076e3b 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -28,7 +28,7 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs + - s3fs>=0.4.0 - scipy - xarray - xlrd diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 72453f954d1e3..cdf20347d93af 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -16,7 +16,7 @@ dependencies: - beautifulsoup4 - bottleneck - fsspec>=0.7.4 - - gcsfs + - gcsfs>=0.6.0 - html5lib - jinja2 - lxml @@ -29,7 +29,7 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs + - s3fs>=0.4.0 - scipy - sqlalchemy - xlrd diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index a5a3399d95428..177e0d3f4c0af 100644 --- 
a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -19,7 +19,7 @@ dependencies: - dask - fastparquet>=0.3.2 - fsspec>=0.7.4 - - gcsfs + - gcsfs>=0.6.0 - geopandas - html5lib - matplotlib @@ -36,7 +36,7 @@ dependencies: - pytables - python-snappy - pytz - - s3fs + - s3fs>=0.4.0 - scikit-learn - scipy - sqlalchemy diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index a04f9e674fd5f..9902c5b660262 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -17,7 +17,7 @@ dependencies: - python-blosc - fastparquet=0.3.2 - fsspec>=0.7.4 - - gcsfs=0.2.2 + - gcsfs>=0.6.0 - html5lib - ipython - jinja2 @@ -34,7 +34,7 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs=0.3.0 + - s3fs>=0.4.0 - scipy - sqlalchemy=1.1.4 - xarray=0.10 diff --git a/ci/deps/travis-36-slow.yaml b/ci/deps/travis-36-slow.yaml index df693f0e22c71..87bad59fa4873 100644 --- a/ci/deps/travis-36-slow.yaml +++ b/ci/deps/travis-36-slow.yaml @@ -13,6 +13,7 @@ dependencies: # pandas dependencies - beautifulsoup4 + - fsspec>=0.7.4 - html5lib - lxml - matplotlib @@ -25,7 +26,7 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs + - s3fs>=0.4.0 - scipy - sqlalchemy - xlrd diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index ef650c729fb29..e896233aac63c 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -19,7 +19,7 @@ dependencies: - nomkl - pyarrow - pytz - - s3fs + - s3fs>=0.4.0 - tabulate - pyreadstat - pip From 4bc2411c8c62e5b44b65ae1daf3092afe00a8702 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 2 Jun 2020 12:33:58 -0400 Subject: [PATCH 14/34] Account for alternate S3 protocols, and ignore type error Note that the S3 protocols (s3a and s3n) could have been implemented in s3fs and/or fsspec, but they don't actually have any meaning outside of hadoop implementations. 
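A hedged sketch of the normalization this message describes; the helper name is illustrative and not part of the patch. s3a:// and s3n:// are Hadoop-era spellings with no separate meaning to fsspec, so the change below rewrites them to plain s3:// before dispatch:

    def normalize_s3_scheme(url: str) -> str:
        # rewrite only the scheme prefix; the bucket/key part is untouched
        for alias in ("s3a://", "s3n://"):
            if url.startswith(alias):
                return url.replace(alias, "s3://", 1)
        return url

    assert normalize_s3_scheme("s3a://pandas-test/tips.csv") == "s3://pandas-test/tips.csv"
    assert normalize_s3_scheme("gs://pandas-test/tips.csv") == "gs://pandas-test/tips.csv"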
---
 pandas/io/common.py            |  7 +++++++
 pandas/tests/io/test_fsspec.py | 10 +++++++++-
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 1d4f21d348f28..3b55e34bb966d 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -186,6 +186,13 @@ def get_filepath_or_buffer(
         return reader, encoding, compression, True
 
     if is_fsspec_url(filepath_or_buffer):
+        # two special-case s3-like protocols; these have special meaning in Hadoop,
+        # but are equivalent to just "s3" from fsspec's point of view
+        # cc #11071
+        if filepath_or_buffer.startswith("s3a://"):
+            filepath_or_buffer = filepath_or_buffer.replace("s3a://", "s3://")
+        if filepath_or_buffer.startswith("s3n://"):
+            filepath_or_buffer = filepath_or_buffer.replace("s3n://", "s3://")
         fsspec = import_optional_dependency("fsspec")
 
         file_obj = fsspec.open(
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index ea3c94ca40beb..4244cf6c9a929 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -13,7 +13,7 @@
         "dt": date_range("2018-06-18", periods=2),
     }
 )
-text = df1.to_csv(index=False).encode()
+text = df1.to_csv(index=False).encode()  # type: ignore
 
 
 @pytest.fixture
@@ -76,6 +76,14 @@ def test_from_s3_csv(s3_resource, tips_file):
     tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2"), read_csv(tips_file))
 
 
+@pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"])
+@td.skip_if_no("s3fs")
+def test_s3_protocols(s3_resource, tips_file, protocol):
+    tm.assert_equal(
+        read_csv("%s://pandas-test/tips.csv" % protocol), read_csv(tips_file)
+    )
+
+
 @td.skip_if_no("s3fs")
 @td.skip_if_no("fastparquet")
 def test_s3_parquet(s3_resource):

From 68644ab5c4f6a562a0713deec8d0fd985b5ded93 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Tue, 2 Jun 2020 12:36:20 -0400
Subject: [PATCH 15/34] Add comment to mypy ignore instruction

---
 pandas/tests/io/test_fsspec.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index 4244cf6c9a929..c397a61616c1c 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -13,6 +13,8 @@
         "dt": date_range("2018-06-18", periods=2),
     }
 )
+# the ignore on the following line accounts for to_csv returning Optional[str]
+# in general, but always str in the case where we give no filename
 text = df1.to_csv(index=False).encode()  # type: ignore

From 32bc58662ce639094b7e0e91f3767420c6102ee9 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Tue, 2 Jun 2020 13:33:06 -0400
Subject: [PATCH 16/34] more mypy

---
 pandas/io/common.py  | 1 +
 requirements-dev.txt | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 3b55e34bb966d..035d64c4e0de4 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -186,6 +186,7 @@ def get_filepath_or_buffer(
         return reader, encoding, compression, True
 
     if is_fsspec_url(filepath_or_buffer):
+        assert isinstance(filepath_or_buffer, str)  # just to appease mypy for this branch
         # two special-case s3-like protocols; these have special meaning in Hadoop,
         # but are equivalent to just "s3" from fsspec's point of view
         # cc #11071
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 754ec7ae28748..96b37f82896da 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -65,7 +65,8 @@ pyarrow>=0.13.1
 python-snappy
 pyqt5>=5.9.2
 tables>=3.4.3
-s3fs
+s3fs>=0.4.0
+fsspec>=0.7.4
 sqlalchemy
 xarray
 cftime

From 037ef2cac655f5a528ac4ac588488ccd9a048a9a Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Tue, 2 Jun 2020 13:33:19 -0400
Subject: [PATCH 17/34] more black

---
 pandas/io/common.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 035d64c4e0de4..844fb22d27eaf 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -186,7 +186,9 @@ def get_filepath_or_buffer(
         return reader, encoding, compression, True
 
     if is_fsspec_url(filepath_or_buffer):
-        assert isinstance(filepath_or_buffer, str)  # just to appease mypy for this branch
+        assert isinstance(
+            filepath_or_buffer, str
+        )  # just to appease mypy for this branch
         # two special-case s3-like protocols; these have special meaning in Hadoop,
         # but are equivalent to just "s3" from fsspec's point of view
         # cc #11071

From c3c3075f9774d3c53eeeac0331f2f4f176402298 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Wed, 3 Jun 2020 12:46:11 -0400
Subject: [PATCH 18/34] Make storage_options a dict rather than swallowing
 kwargs

---
 pandas/io/common.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 844fb22d27eaf..b1aee79d56506 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -150,7 +150,7 @@ def get_filepath_or_buffer(
     encoding: Optional[str] = None,
     compression: Optional[str] = None,
     mode: Optional[str] = None,
-    **storage_options: Dict[str, Any],
+    storage_options: Optional[Dict[str, Any]] = None,
 ):
     """
     If the filepath_or_buffer is a url, translate and return the buffer.
@@ -163,7 +163,7 @@ def get_filepath_or_buffer(
     compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional
     encoding : the encoding to use to decode bytes, default is 'utf-8'
     mode : str, optional
-    storage_options: dict
+    storage_options: dict, optional
         passed on to fsspec, if using it; this is not yet accessed by the public API
 
     Returns
@@ -199,7 +199,7 @@ def get_filepath_or_buffer(
 
         file_obj = fsspec.open(
-            filepath_or_buffer, mode=mode or "rb", **storage_options
+            filepath_or_buffer, mode=mode or "rb", **(storage_options or {})
         ).open()
         return file_obj, encoding, compression, True

From 85d6452ff13dc91db8ea8ad91778704a4d46f857 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 5 Jun 2020 13:59:52 -0400
Subject: [PATCH 19/34] More requested changes

---
 doc/source/getting_started/install.rst |  2 +-
 doc/source/whatsnew/v1.1.0.rst         |  6 +++---
 pandas/io/parquet.py                   | 24 ++++++++++++++----------
 pandas/tests/io/test_common.py         |  2 ++
 pandas/tests/io/test_parquet.py        |  4 ++--
 5 files changed, 22 insertions(+), 16 deletions(-)

diff --git a/doc/source/getting_started/install.rst b/doc/source/getting_started/install.rst
index 612bfc3cb30cf..01204353b5d12 100644
--- a/doc/source/getting_started/install.rst
+++ b/doc/source/getting_started/install.rst
@@ -267,7 +267,7 @@ SQLAlchemy     1.1.4   SQL support for databases other tha
 SciPy          0.19.0  Miscellaneous statistical functions
 XLsxWriter     0.9.8   Excel writing
 blosc                  Compression for HDF5
-fsspec         0.7.4   File operations handling
+fsspec         0.7.4   Handling files aside from local and HTTP
 fastparquet    0.3.2   Parquet reading / writing
 gcsfs          0.6.0   Google Cloud Storage access
 html5lib               HTML parser for read_html (see :ref:`note `)
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 919ff048777bc..220f4f04925cc 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -284,9 +284,9 @@ Optional libraries below the lowest tested version may still work, but are not c
 +-----------------+-----------------+---------+
 | fastparquet     | 0.3.2           |         |
 +-----------------+-----------------+---------+
-| fsspec          | 0.7.4           |         |
+| fsspec          | 0.7.4           | X       |
 +-----------------+-----------------+---------+
-| gcsfs           | 0.6.0           |         |
+| gcsfs           | 0.6.0           | X       |
 +-----------------+-----------------+---------+
 | lxml            | 3.8.0           |         |
 +-----------------+-----------------+---------+
@@ -302,7 +302,7 @@ Optional libraries below the lowest tested version may still work, but are not c
 +-----------------+-----------------+---------+
 | pytables        | 3.4.3           | X       |
 +-----------------+-----------------+---------+
-| s3fs            | 0.4.0           |         |
+| s3fs            | 0.4.0           | X       |
 +-----------------+-----------------+---------+
 | scipy           | 1.2.0           | X       |
 +-----------------+-----------------+---------+
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index c1172061e39f5..e5f813db5b8d5 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -92,7 +92,6 @@ def write(
         **kwargs,
     ):
         self.validate_dataframe(df)
-        file_obj_or_path, _, _, should_close = get_filepath_or_buffer(path, mode="wb")
 
         from_pandas_kwargs: Dict[str, Any] = {"schema": kwargs.pop("schema", None)}
         if index is not None:
@@ -101,15 +100,22 @@ def write(
         table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
         # write_to_dataset does not support a file-like object when
         # a directory path is used, so just pass the path string.
+        if is_fsspec_url(path) and "filesystem" not in kwargs:
+            # TODO: use_legacy_dataset=False will require work, when it is available
+            assert kwargs.get("use_legacy_dataset", False) is False
+            import_optional_dependency("fsspec")
+            import fsspec.core
+
+            fs, path = fsspec.core.url_to_fs(path)
+            kwargs["filesystem"] = fs
+            should_close = False
+        else:
+            file_obj_or_path, _, _, should_close = get_filepath_or_buffer(
+                path, mode="wb"
+            )
         if partition_cols is not None:
             # user may provide filesystem= with an instance, in which case it takes
             # priority and fsspec need not analyse the path
-            if is_fsspec_url(path) and "filesystem" not in kwargs:
-                import_optional_dependency("fsspec")
-                import fsspec.core
-
-                fs, path = fsspec.core.url_to_fs(path)
-                kwargs["filesystem"] = fs
             self.api.parquet.write_to_dataset(
                 table,
                 path,
@@ -118,9 +124,7 @@ def write(
             **kwargs,
         )
         else:
-            self.api.parquet.write_table(
-                table, file_obj_or_path, compression=compression, **kwargs
-            )
+            self.api.parquet.write_table(table, path, compression=compression, **kwargs)
         if should_close:
             file_obj_or_path.close()
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index 66293776325c1..145855b20bc6c 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -373,3 +373,5 @@ def test_is_fsspec_url():
     assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")
     assert icom.is_fsspec_url("gs://pandas/somethingelse.com")
     assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
+    assert not icom.is_fsspec_url("/local/path")
+    assert not icom.is_fsspec_url("relative/local/path")
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index ddd70c158c492..63709fff4430f 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -536,7 +536,7 @@ def test_categorical(self, pa):
         expected = df.astype(object)
         check_round_trip(df, pa, expected=expected)
 
-    def test_s3_roundtrip(self, df_compat, s3_resource, pa):
+    def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa):
         s3fs = pytest.importorskip("s3fs")
         s3 = s3fs.S3FileSystem()
         kw = dict(filesystem=s3)
@@ -548,7 +548,7 @@ def test_s3_roundtrip(self, df_compat, s3_resource, pa):
             write_kwargs=kw,
         )
 
-    def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa):
+    def test_s3_roundtrip(self, df_compat, s3_resource, pa):
         # GH #19134
         check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet")

From 263dd3be7a4ba9ef65247043100adf03602b4392 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Wed, 10 Jun 2020 09:13:23 -0400
Subject: [PATCH 20/34] Remove fsspec from locale tests

And move "legacy_datasets" clause

---
 ci/deps/azure-36-locale.yaml  | 3 ---
 ci/deps/azure-37-locale.yaml  | 2 --
 ci/deps/travis-36-locale.yaml | 3 ---
 pandas/io/parquet.py          | 2 +-
 4 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml
index fc22530041ba6..a9b9a5a47ccf5 100644
--- a/ci/deps/azure-36-locale.yaml
+++ b/ci/deps/azure-36-locale.yaml
@@ -15,9 +15,7 @@ dependencies:
 
   # pandas dependencies
   - beautifulsoup4
-  - gcsfs>=0.6.0
   - html5lib
-  - fsspec>=0.7.4
   - ipython
   - jinja2
   - lxml
@@ -32,7 +30,6 @@ dependencies:
   - pytables
   - python-dateutil
   - pytz
-  - s3fs>=0.4.0
   - scipy
   - xarray
   - xlrd
diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml
index 1f1c913076e3b..81e336cf1ed7f 100644
--- a/ci/deps/azure-37-locale.yaml
+++ b/ci/deps/azure-37-locale.yaml
@@ -14,7 +14,6 @@ dependencies:
 
   # pandas dependencies
   - beautifulsoup4
-  - fsspec>=0.7.4
   - html5lib
   - ipython
   - jinja2
@@ -28,7 +27,6 @@ dependencies:
   - pytables
   - python-dateutil
   - pytz
-  - s3fs>=0.4.0
   - scipy
   - xarray
   - xlrd
diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml
index 9902c5b660262..298d1910c1565 100644
--- a/ci/deps/travis-36-locale.yaml
+++ b/ci/deps/travis-36-locale.yaml
@@ -16,8 +16,6 @@ dependencies:
   - blosc=1.14.3
   - python-blosc
   - fastparquet=0.3.2
-  - fsspec>=0.7.4
-  - gcsfs>=0.6.0
   - html5lib
   - ipython
   - jinja2
@@ -34,7 +32,6 @@ dependencies:
   - pytables
   - python-dateutil
   - pytz
-  - s3fs>=0.4.0
   - scipy
   - sqlalchemy=1.1.4
   - xarray=0.10
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index e5f813db5b8d5..36d4cd6063fb1 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -102,7 +102,6 @@ def write(
         # write_to_dataset does not support a file-like object when
         # a directory path is used, so just pass the path string.
         if is_fsspec_url(path) and "filesystem" not in kwargs:
-            # TODO: use_legacy_dataset=False will require work, when it is available
             import_optional_dependency("fsspec")
             import fsspec.core
@@ -131,6 +130,7 @@ def write(
     def read(self, path, columns=None, **kwargs):
         if is_fsspec_url(path) and "filesystem" not in kwargs:
             import_optional_dependency("fsspec")
+            assert kwargs.get("use_legacy_dataset", True) is True
             import fsspec.core
 
             fs, path = fsspec.core.url_to_fs(path)

From d0afbc303c27d51cf8cb206fc9e7d4b031e8d46c Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Wed, 10 Jun 2020 11:06:31 -0400
Subject: [PATCH 21/34] tweak

---
 pandas/io/common.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index b1aee79d56506..99047a8de2127 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -139,10 +139,10 @@ def urlopen(*args, **kwargs):
 
 def is_fsspec_url(url: FilePathOrBuffer) -> bool:
     """
-    Returns true if fsspec is installed and the given URL looks like
+    Returns true if the given URL looks like
     something fsspec can handle
     """
-    return isinstance(url, str) and ("::" in url or "://" in url)
+    return isinstance(url, str) and "://" in url

From 9c037456db4336fae1ea26d8b95cac9a5ed6cab7 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Thu, 11 Jun 2020 17:01:16 -0400
Subject: [PATCH 22/34] requested changes

---
 doc/source/whatsnew/v1.1.0.rst | 7 ++++++-
 pandas/io/parquet.py           | 1 -
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index d19eb26246b1b..72f164fce0ae2 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -253,9 +253,14 @@ For reading and writing to filesystems other than local and reading from HTTP(S)
 the optional dependency ``fsspec`` will be used to dispatch operations (:issue:`33452`).
 This will give unchanged functionality for S3 and GCS storage, which were already
 supported, but also add
-support for several other storage implementations such as Azure Datalake and Blob,
+support for several other storage implementations such as `Azure Datalake and Blob`_,
 SSH, FTP, dropbox and github. For docs and capabilities, see the `fsspec docs`_.
 
+The existing capability to interface with S3 and GCS will be unaffected by this
+change, as ``fsspec`` will still bring in the same packages as before.
+
+.. _Azure Datalake and Blob: https://github.com/dask/adlfs
+
 .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/
 
 .. _whatsnew_110.enhancements.other:
diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 36d4cd6063fb1..d8241716e880d 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -101,7 +101,6 @@ def write(
         # write_to_dataset does not support a file-like object when
         # a directory path is used, so just pass the path string.
         if is_fsspec_url(path) and "filesystem" not in kwargs:
-            # TODO: use_legacy_dataset=False will require work, when it is available
             import_optional_dependency("fsspec")
             import fsspec.core

From 7982e7b96edbaa5fb9a45cbb717e1e438a068e01 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 12 Jun 2020 10:01:17 -0400
Subject: [PATCH 23/34] add gcsfs to environment.yml

---
 environment.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/environment.yml b/environment.yml
index 5b1bc418b8f14..558078655fbde 100644
--- a/environment.yml
+++ b/environment.yml
@@ -98,8 +98,9 @@ dependencies:
   - pyqt>=5.9.2  # pandas.read_clipboard
   - pytables>=3.4.3  # pandas.read_hdf, DataFrame.to_hdf
-  - s3fs>=0.4.0  # pandas.read_csv... when using 's3://...' path
+  - s3fs>=0.4.0  # file IO when using 's3://...' path
   - fsspec>=0.7.4  # for generic remote file operations
+  - gcsfs>=0.6.0  # file IO when using 'gcs://...' path
   - sqlalchemy  # pandas.read_sql, DataFrame.to_sql
   - xarray  # DataFrame.to_xarray
   - cftime  # Needed for downstream xarray.CFTimeIndex test

From 946297b12a774fc9614e21ab394a8794bc685c70 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 12 Jun 2020 10:31:11 -0400
Subject: [PATCH 24/34] rerun deps script

---
 requirements-dev.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements-dev.txt b/requirements-dev.txt
index d7c0620116ca6..884ecd6bca3d0 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -67,6 +67,7 @@ pyqt5>=5.9.2
 tables>=3.4.3
 s3fs>=0.4.0
 fsspec>=0.7.4
+gcsfs>=0.6.0
 sqlalchemy
 xarray
 cftime

From 06e5a3a6de3cc228fd2f9657222fe49d7aae1be2 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 12 Jun 2020 15:13:42 -0400
Subject: [PATCH 25/34] account for passed filesystem again

---
 pandas/io/parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index c1fcccd38a317..3acc853c61dc4 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -134,7 +134,7 @@ def read(self, path, columns=None, **kwargs):
             fs, path = fsspec.core.url_to_fs(path)
             should_close = False
         else:
-            fs = None
+            fs = kwargs.pop('filesystem', None)
 
         if not fs:
             path, _, _, should_close = get_filepath_or_buffer(path)

From 8f3854c0d66fb6bed8a72c5cbfe09f10d4a8d80a Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 12 Jun 2020 15:54:56 -0400
Subject: [PATCH 26/34] specify should_close

---
 pandas/io/parquet.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 3acc853c61dc4..259fa162c076d 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -135,6 +135,7 @@ def read(self, path, columns=None, **kwargs):
             should_close = False
         else:
             fs = kwargs.pop('filesystem', None)
+            should_close = False
 
         if not fs:
             path, _, _, should_close = get_filepath_or_buffer(path)

From 50c08c8dcc918c82f43cafe309430ba5bc2c279e Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 12 Jun 2020 16:23:46 -0400
Subject: [PATCH 27/34] lint

---
 pandas/io/parquet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 259fa162c076d..45f0de3bb116f 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -134,7 +134,7 @@ def read(self, path, columns=None, **kwargs):
             fs, path = fsspec.core.url_to_fs(path)
             should_close = False
         else:
-            fs = kwargs.pop('filesystem', None)
+            fs = kwargs.pop("filesystem", None)
             should_close = False
 
         if not fs:
             path, _, _, should_close = get_filepath_or_buffer(path)

From 9b20dc62b230e214ae16ac66977520d1f3516b1d Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 12 Jun 2020 17:08:08 -0400
Subject: [PATCH 28/34] Except http passed to fsspec in parquet (was not
 previously possible)

---
 pandas/io/common.py            | 2 +-
 pandas/tests/io/test_common.py | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 93d30c29a6ddd..07209c5fb1adb 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -142,7 +142,7 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool:
     Returns true if the given URL looks like
     something fsspec can handle
     """
-    return isinstance(url, str) and "://" in url
+    return isinstance(url, str) and "://" in url and not url.startswith(('http://', 'https://'))
 
 
 def get_filepath_or_buffer(
diff --git a/pandas/tests/io/test_common.py b/pandas/tests/io/test_common.py
index 727c325ba9b12..e2f4ae04c1f9f 100644
--- a/pandas/tests/io/test_common.py
+++ b/pandas/tests/io/test_common.py
@@ -372,6 +372,8 @@ def test_unknown_engine(self):
 def test_is_fsspec_url():
     assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")
     assert icom.is_fsspec_url("gs://pandas/somethingelse.com")
+    # the following is the only remote URL that is handled without fsspec
+    assert not icom.is_fsspec_url("http://pandas/somethingelse.com")
     assert not icom.is_fsspec_url("random:pandas/somethingelse.com")
     assert not icom.is_fsspec_url("/local/path")
     assert not icom.is_fsspec_url("relative/local/path")

From eb90fe84998d7ff995800ae3ac209448fcfabe79 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 12 Jun 2020 17:08:49 -0400
Subject: [PATCH 29/34] lint

---
 pandas/io/common.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 07209c5fb1adb..51323c5ff3ef5 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -142,7 +142,11 @@ def is_fsspec_url(url: FilePathOrBuffer) -> bool:
     Returns true if the given URL looks like
     something fsspec can handle
     """
-    return isinstance(url, str) and "://" in url and not url.startswith(('http://', 'https://'))
+    return (
+        isinstance(url, str)
+        and "://" in url
+        and not url.startswith(("http://", "https://"))
+    )

From 4977a001ecd842077c7a08d85e0e8f77e9254ae2 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Tue, 16 Jun 2020 12:21:34 -0400
Subject: [PATCH 30/34] redo whatsnew

---
 doc/source/whatsnew/v1.1.0.rst | 116 ++-------------------------------
 1 file changed, 5 insertions(+), 111 deletions(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 133917eeffab6..ce82fa61a67f3 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -245,7 +245,6 @@ If needed you can adjust the bins with the argument ``offset`` (a Timedelta) tha
 
 For a full example, see: :ref:`timeseries.adjust-the-start-of-the-bins`.
 
-
 fsspec now used for filesystem handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -314,117 +313,10 @@ Other enhancements
 
 .. ---------------------------------------------------------------------------
 
-Increased minimum versions for dependencies
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Some minimum supported versions of dependencies were updated (:issue:`33718`, :issue:`29766`, :issue:`29723`, pytables >= 3.4.3).
-If installed, we now require:
-
-+-----------------+-----------------+----------+---------+
-| Package         | Minimum Version | Required | Changed |
-+=================+=================+==========+=========+
-| numpy           | 1.15.4          | X        | X       |
-+-----------------+-----------------+----------+---------+
-| pytz            | 2015.4          | X        |         |
-+-----------------+-----------------+----------+---------+
-| python-dateutil | 2.7.3           | X        | X       |
-+-----------------+-----------------+----------+---------+
-| bottleneck      | 1.2.1           |          |         |
-+-----------------+-----------------+----------+---------+
-| numexpr         | 2.6.2           |          |         |
-+-----------------+-----------------+----------+---------+
-| pytest (dev)    | 4.0.2           |          |         |
-+-----------------+-----------------+----------+---------+
-
-For `optional libraries `_ the general recommendation is to use the latest version.
-The following table lists the lowest version per library that is currently being tested throughout the development of pandas.
-Optional libraries below the lowest tested version may still work, but are not considered supported.
-
-+-----------------+-----------------+---------+
-| Package         | Minimum Version | Changed |
-+=================+=================+=========+
-| beautifulsoup4  | 4.6.0           |         |
-+-----------------+-----------------+---------+
-| fastparquet     | 0.3.2           |         |
-+-----------------+-----------------+---------+
-| fsspec          | 0.7.4           | X       |
-+-----------------+-----------------+---------+
-| gcsfs           | 0.6.0           | X       |
-+-----------------+-----------------+---------+
-| lxml            | 3.8.0           |         |
-+-----------------+-----------------+---------+
-| matplotlib      | 2.2.2           |         |
-+-----------------+-----------------+---------+
-| numba           | 0.46.0          |         |
-+-----------------+-----------------+---------+
-| openpyxl        | 2.5.7           |         |
-+-----------------+-----------------+---------+
-| pyarrow         | 0.13.0          |         |
-+-----------------+-----------------+---------+
-| pymysql         | 0.7.1           |         |
-+-----------------+-----------------+---------+
-| pytables        | 3.4.3           | X       |
-+-----------------+-----------------+---------+
-| s3fs            | 0.4.0           | X       |
-+-----------------+-----------------+---------+
-| scipy           | 1.2.0           | X       |
-+-----------------+-----------------+---------+
-| sqlalchemy      | 1.1.4           |         |
-+-----------------+-----------------+---------+
-| xarray          | 0.8.2           |         |
-+-----------------+-----------------+---------+
-| xlrd            | 1.1.0           |         |
-+-----------------+-----------------+---------+
-| xlsxwriter      | 0.9.8           |         |
-+-----------------+-----------------+---------+
-| xlwt            | 1.2.0           |         |
-+-----------------+-----------------+---------+
-| pandas-gbq      | 1.2.0           | X       |
-+-----------------+-----------------+---------+
-
-See :ref:`install.dependencies` and :ref:`install.optional_dependencies` for more.
-
-Development Changes
-^^^^^^^^^^^^^^^^^^^
-
-- The minimum version of Cython is now the most recent bug-fix version (0.29.16) (:issue:`33334`).
-
-.. _whatsnew_110.api.other:
-
-Other API changes
-^^^^^^^^^^^^^^^^^
-
-- :meth:`Series.describe` will now show distribution percentiles for ``datetime`` dtypes, statistics ``first`` and ``last``
-  will now be ``min`` and ``max`` to match with numeric dtypes in :meth:`DataFrame.describe` (:issue:`30164`)
-- Added :meth:`DataFrame.value_counts` (:issue:`5377`)
-- :meth:`Groupby.groups` now returns an abbreviated representation when called on large dataframes (:issue:`1135`)
-- ``loc`` lookups with an object-dtype :class:`Index` and an integer key will now raise ``KeyError`` instead of ``TypeError`` when key is missing (:issue:`31905`)
-- Using a :func:`pandas.api.indexers.BaseIndexer` with ``count``, ``min``, ``max``, ``median``, ``skew``, ``cov``, ``corr`` will now return correct results for any monotonic :func:`pandas.api.indexers.BaseIndexer` descendant (:issue:`32865`)
-- Added a :func:`pandas.api.indexers.FixedForwardWindowIndexer` class to support forward-looking windows during ``rolling`` operations.
--
+.. _whatsnew_110.api:
 
 Backwards incompatible API changes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-- :meth:`DataFrame.swaplevels` now raises a ``TypeError`` if the axis is not a :class:`MultiIndex`.
-  Previously an ``AttributeError`` was raised (:issue:`31126`)
-- :meth:`DataFrame.xs` now raises a ``TypeError`` if a ``level`` keyword is supplied and the axis is not a :class:`MultiIndex`.
-  Previously an ``AttributeError`` was raised (:issue:`33610`)
-- :meth:`DataFrameGroupby.mean` and :meth:`SeriesGroupby.mean` (and similarly for :meth:`~DataFrameGroupby.median`, :meth:`~DataFrameGroupby.std` and :meth:`~DataFrameGroupby.var`)
-  now raise a ``TypeError`` if a not-accepted keyword argument is passed into it.
-  Previously a ``UnsupportedFunctionCall`` was raised (``AssertionError`` if ``min_count`` passed into :meth:`~DataFrameGroupby.median`) (:issue:`31485`)
-- :meth:`DataFrame.at` and :meth:`Series.at` will raise a ``TypeError`` instead of a ``ValueError`` if an incompatible key is passed, and ``KeyError`` if a missing key is passed, matching the behavior of ``.loc[]`` (:issue:`31722`)
-- Passing an integer dtype other than ``int64`` to ``np.array(period_index, dtype=...)`` will now raise ``TypeError`` instead of incorrectly using ``int64`` (:issue:`32255`)
-- Passing an invalid ``fill_value`` to :meth:`Categorical.take` raises a ``ValueError`` instead of ``TypeError`` (:issue:`33660`)
-- Combining a ``Categorical`` with integer categories and which contains missing values
-  with a float dtype column in operations such as :func:`concat` or :meth:`~DataFrame.append`
-  will now result in a float column instead of an object dtyped column (:issue:`33607`)
-- :meth:`Series.to_timestamp` now raises a ``TypeError`` if the axis is not a :class:`PeriodIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`)
-- :meth:`Series.to_period` now raises a ``TypeError`` if the axis is not a :class:`DatetimeIndex`. Previously an ``AttributeError`` was raised (:issue:`33327`)
-- :func:`pandas.api.dtypes.is_string_dtype` no longer incorrectly identifies categorical series as string.
-- :func:`read_excel` no longer takes ``**kwds`` arguments. This means that passing in keyword ``chunksize`` now raises a ``TypeError``
-  (previously raised a ``NotImplementedError``), while passing in keyword ``encoding`` now raises a ``TypeError`` (:issue:`34464`)
-- :func:`merge` now checks ``suffixes`` parameter type to be ``tuple`` and raises ``TypeError``, whereas before a ``list`` or ``set`` were accepted and that the ``set`` could produce unexpected results (:issue:`33740`)
-- :class:`Period` no longer accepts tuples for the ``freq`` argument (:issue:`34658`)
 
 ``MultiIndex.get_indexer`` interprets `method` argument differently
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -820,7 +712,9 @@ Optional libraries below the lowest tested version may still work, but are not c
 +-----------------+-----------------+---------+
 | fastparquet     | 0.3.2           |         |
 +-----------------+-----------------+---------+
-| gcsfs           | 0.2.2           |         |
+| fsspec          | 0.7.4           |         |
++-----------------+-----------------+---------+
+| gcsfs           | 0.6.0           | X       |
 +-----------------+-----------------+---------+
 | lxml            | 3.8.0           |         |
 +-----------------+-----------------+---------+
@@ -836,7 +730,7 @@ Optional libraries below the lowest tested version may still work, but are not c
 +-----------------+-----------------+---------+
 | pytables        | 3.4.3           | X       |
 +-----------------+-----------------+---------+
-| s3fs            | 0.3.0           |         |
+| s3fs            | 0.4.0           | X       |
 +-----------------+-----------------+---------+
 | scipy           | 1.2.0           | X       |
 +-----------------+-----------------+---------+

From 29a9785a136f280657a9bdcd847eb250a3d0bd5e Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Thu, 18 Jun 2020 09:44:54 -0400
Subject: [PATCH 31/34] simplify parquet write

---
 pandas/io/parquet.py | 15 ++++-----------
 1 file changed, 4 insertions(+), 11 deletions(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 45f0de3bb116f..0ae57964d4ec9 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -98,22 +98,16 @@ def write(
             from_pandas_kwargs["preserve_index"] = index
 
         table = self.api.Table.from_pandas(df, **from_pandas_kwargs)
-        # write_to_dataset does not support a file-like object when
-        # a directory path is used, so just pass the path string.
+
         if is_fsspec_url(path) and "filesystem" not in kwargs:
+            # make fsspec instance, which pyarrow will use to open paths
             import_optional_dependency("fsspec")
             import fsspec.core
 
             fs, path = fsspec.core.url_to_fs(path)
             kwargs["filesystem"] = fs
-            should_close = False
-        else:
-            file_obj_or_path, _, _, should_close = get_filepath_or_buffer(
-                path, mode="wb"
-            )
 
         if partition_cols is not None:
-            # user may provide filesystem= with an instance, in which case it takes
-            # priority and fsspec need not analyse the path
+            # writes to multiple files under the given path
             self.api.parquet.write_to_dataset(
                 table,
                 path,
@@ -122,9 +116,8 @@ def write(
             **kwargs,
         )
         else:
+            # write to single output file
             self.api.parquet.write_table(table, path, compression=compression, **kwargs)

From 565031b983c41980605a83427319caf1b0d9532e Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Thu, 18 Jun 2020 11:31:44 -0400
Subject: [PATCH 32/34] Retry S3 file probe with timeout, in test_to_s3

---
 pandas/tests/io/json/test_pandas.py | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 137e4c991d080..fa92b7cfb8026 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1665,13 +1665,21 @@ def test_json_multiindex(self, dataframe, expected):
         assert result == expected
 
     def test_to_s3(self, s3_resource):
+        import time
+
         # GH 28375
         mock_bucket_name, target_file = "pandas-test", "test.json"
         df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
         df.to_json(f"s3://{mock_bucket_name}/{target_file}")
-        assert target_file in (
-            obj.key for obj in s3_resource.Bucket("pandas-test").objects.all()
-        )
+        timeout = 5
+        while True:
+            if target_file in (
+                obj.key for obj in s3_resource.Bucket("pandas-test").objects.all()
+            ):
+                break
+            time.sleep(0.1)
+            timeout -= 0.1
+            assert timeout > 0, "Timed out waiting for file to appear on moto"
 
     def test_json_pandas_na(self):
         # GH 31615

From 606ce11608d5f417691752aa9f18c560f21bdaa9 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 19 Jun 2020 13:08:55 -0400
Subject: [PATCH 33/34] expand user in non-fsspec paths for parquet; add test
 for this

---
 pandas/io/parquet.py            | 5 ++++-
 pandas/tests/io/test_parquet.py | 8 ++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 0ae57964d4ec9..0472166566d3d 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -8,7 +8,7 @@
 
 from pandas import DataFrame, get_option
 
-from pandas.io.common import get_filepath_or_buffer, is_fsspec_url
+from pandas.io.common import get_filepath_or_buffer, is_fsspec_url, _expand_user
 
 
 def get_engine(engine: str) -> "BaseImpl":
@@ -106,6 +106,8 @@ def write(
 
             fs, path = fsspec.core.url_to_fs(path)
             kwargs["filesystem"] = fs
+        else:
+            path = _expand_user(path)
         if partition_cols is not None:
             # writes to multiple files under the given path
             self.api.parquet.write_to_dataset(
@@ -129,6 +131,7 @@ def read(self, path, columns=None, **kwargs):
         else:
             fs = kwargs.pop("filesystem", None)
             should_close = False
+            path = _expand_user(path)
 
         if not fs:
             path, _, _, should_close = get_filepath_or_buffer(path)
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 43e32218ba8c9..7ff69b92bcb19 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -591,6 +591,14 @@ def test_read_file_like_obj_support(self, df_compat):
         df_from_buf = pd.read_parquet(buffer)
         tm.assert_frame_equal(df_compat, df_from_buf)
 
+    def test_expand_user(self, df_compat, monkeypatch):
+        monkeypatch.setenv("HOME", "TestingUser")
+        monkeypatch.setenv("USERPROFILE", "TestingUser")
+        with pytest.raises(OSError, match=r".*TestingUser.*"):
+            pd.read_parquet("~/file.parquet")
+        with pytest.raises(OSError, match=r".*TestingUser.*"):
+            df_compat.to_parquet("~/file.parquet")
+
     def test_partition_cols_supported(self, pa, df_full):
         # GH #23283
         partition_cols = ["bool", "int"]

From 60b80a622035b3eaba3bdb790444b7d699822a28 Mon Sep 17 00:00:00 2001
From: Martin Durant
Date: Fri, 19 Jun 2020 14:15:30 -0400
Subject: [PATCH 34/34] reorder imports!

---
 pandas/io/parquet.py            | 2 +-
 pandas/tests/io/test_parquet.py | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py
index 0472166566d3d..a0c9242684f0f 100644
--- a/pandas/io/parquet.py
+++ b/pandas/io/parquet.py
@@ -8,7 +8,7 @@
 
 from pandas import DataFrame, get_option
 
-from pandas.io.common import get_filepath_or_buffer, is_fsspec_url, _expand_user
+from pandas.io.common import _expand_user, get_filepath_or_buffer, is_fsspec_url
 
 
 def get_engine(engine: str) -> "BaseImpl":
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 7ff69b92bcb19..82157f3d722a9 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -591,6 +591,7 @@ def test_read_file_like_obj_support(self, df_compat):
         df_from_buf = pd.read_parquet(buffer)
         tm.assert_frame_equal(df_compat, df_from_buf)
 
+    @td.skip_if_no("pyarrow")
     def test_expand_user(self, df_compat, monkeypatch):
         monkeypatch.setenv("HOME", "TestingUser")
         monkeypatch.setenv("USERPROFILE", "TestingUser")
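Taken together, the series makes any fsspec-registered protocol usable
directly from the pandas IO functions. A hypothetical usage sketch (bucket
names are illustrative only, and credentials are assumed to be configured
for the relevant fsspec backend):

    import pandas as pd

    # CSV via any fsspec protocol, including the Hadoop-style s3 aliases,
    # which are rewritten to plain "s3://" before being handed to fsspec
    df = pd.read_csv("s3://some-bucket/data.csv")
    df2 = pd.read_csv("s3a://some-bucket/data.csv")

    # parquet: the URL is resolved to an fsspec filesystem with
    # fsspec.core.url_to_fs, which pyarrow then uses to open paths
    df.to_parquet("gcs://some-bucket/out.parquet")
    df3 = pd.read_parquet("gcs://some-bucket/out.parquet")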