diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml
index a6552aa096a22..cc996f4077cd9 100644
--- a/ci/deps/azure-37-locale.yaml
+++ b/ci/deps/azure-37-locale.yaml
@@ -21,6 +21,7 @@ dependencies:
   - lxml
   - matplotlib>=3.3.0
   - moto
+  - flask
   - nomkl
   - numexpr
   - numpy=1.16.*
diff --git a/ci/deps/azure-37-slow.yaml b/ci/deps/azure-37-slow.yaml
index e8ffd3d74ca5e..d17a8a2b0ed9b 100644
--- a/ci/deps/azure-37-slow.yaml
+++ b/ci/deps/azure-37-slow.yaml
@@ -27,9 +27,10 @@ dependencies:
   - python-dateutil
   - pytz
   - s3fs>=0.4.0
+  - moto>=1.3.14
   - scipy
   - sqlalchemy
   - xlrd
   - xlsxwriter
   - xlwt
-  - moto
+  - flask
diff --git a/ci/deps/azure-38-locale.yaml b/ci/deps/azure-38-locale.yaml
index c7090d3a46a77..bb40127b672d3 100644
--- a/ci/deps/azure-38-locale.yaml
+++ b/ci/deps/azure-38-locale.yaml
@@ -14,6 +14,7 @@ dependencies:
 
   # pandas dependencies
   - beautifulsoup4
+  - flask
   - html5lib
   - ipython
   - jinja2
@@ -32,6 +33,7 @@ dependencies:
   - xlrd
   - xlsxwriter
   - xlwt
+  - moto
   - pyarrow>=0.15
   - pip
   - pip:
diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml
index f4c238ab8b173..4894129915722 100644
--- a/ci/deps/azure-windows-37.yaml
+++ b/ci/deps/azure-windows-37.yaml
@@ -15,13 +15,14 @@ dependencies:
   # pandas dependencies
   - beautifulsoup4
   - bottleneck
-  - fsspec>=0.7.4
+  - fsspec>=0.8.0
   - gcsfs>=0.6.0
   - html5lib
   - jinja2
   - lxml
   - matplotlib=2.2.*
-  - moto
+  - moto>=1.3.14
+  - flask
   - numexpr
   - numpy=1.16.*
   - openpyxl
@@ -29,7 +30,7 @@ dependencies:
   - pytables
   - python-dateutil
   - pytz
-  - s3fs>=0.4.0
+  - s3fs>=0.4.2
   - scipy
   - sqlalchemy
   - xlrd
diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml
index 1f383164b5328..2853e12b28e35 100644
--- a/ci/deps/azure-windows-38.yaml
+++ b/ci/deps/azure-windows-38.yaml
@@ -16,7 +16,10 @@ dependencies:
   - blosc
   - bottleneck
   - fastparquet>=0.3.2
+  - flask
+  - fsspec>=0.8.0
   - matplotlib=3.1.3
+  - moto>=1.3.14
   - numba
   - numexpr
   - numpy=1.18.*
@@ -26,6 +29,7 @@ dependencies:
   - pytables
   - python-dateutil
   - pytz
+  - s3fs>=0.4.0
   - scipy
   - xlrd
   - xlsxwriter
diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml
index 5cb53489be225..ea29cbef1272b 100644
--- a/ci/deps/travis-37-arm64.yaml
+++ b/ci/deps/travis-37-arm64.yaml
@@ -17,5 +17,6 @@ dependencies:
   - python-dateutil
   - pytz
   - pip
+  - flask
   - pip:
     - moto
diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/travis-37-cov.yaml
index edc11bdf4ab35..33ee6dfffb1a3 100644
--- a/ci/deps/travis-37-cov.yaml
+++ b/ci/deps/travis-37-cov.yaml
@@ -23,7 +23,8 @@ dependencies:
   - geopandas
   - html5lib
   - matplotlib
-  - moto
+  - moto>=1.3.14
+  - flask
   - nomkl
   - numexpr
   - numpy=1.16.*
diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml
index 4427c1d940bf2..2cf0e12027401 100644
--- a/ci/deps/travis-37-locale.yaml
+++ b/ci/deps/travis-37-locale.yaml
@@ -21,12 +21,12 @@ dependencies:
   - jinja2
   - lxml=4.3.0
   - matplotlib=3.0.*
-  - moto
   - nomkl
   - numexpr
   - numpy
   - openpyxl
   - pandas-gbq=0.12.0
+  - pyarrow>=0.17
   - psycopg2=2.7
   - pymysql=0.7.11
   - pytables
diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml
index e896233aac63c..26d6c2910a7cc 100644
--- a/ci/deps/travis-37.yaml
+++ b/ci/deps/travis-37.yaml
@@ -20,8 +20,8 @@ dependencies:
   - pyarrow
   - pytz
   - s3fs>=0.4.0
+  - moto>=1.3.14
+  - flask
   - tabulate
   - pyreadstat
   - pip
-  - pip:
-    - moto
diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst
index 8ec75b4846ae2..3e6ed1cdf8f7e 100644
--- a/doc/source/whatsnew/v1.2.0.rst
+++ b/doc/source/whatsnew/v1.2.0.rst
@@ -24,6 +24,9 @@ of
 the individual storage backends (detailed from the fsspec docs for
 `builtin implementations`_ and linked to `external ones`_). See Section
 :ref:`io.remote`.
 
+:issue:`35655` added fsspec support (including ``storage_options``)
+for reading Excel files.
+
 .. _builtin implementations: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations
 .. _external ones: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations
diff --git a/environment.yml b/environment.yml
index 806119631d5ee..6afc19c227512 100644
--- a/environment.yml
+++ b/environment.yml
@@ -51,6 +51,7 @@ dependencies:
   - botocore>=1.11
   - hypothesis>=3.82
   - moto  # mock S3
+  - flask
   - pytest>=5.0.1
   - pytest-cov
   - pytest-xdist>=1.21
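In user-facing terms, the whatsnew entry above amounts to the following (a minimal sketch: the bucket, key, and endpoint are hypothetical, and any fsspec-handled URL such as ``gcs://`` behaves the same way):

```python
import pandas as pd

# Hypothetical bucket/key; any URL handled by fsspec ("s3://", "gcs://", ...)
# works the same way. storage_options is handed to the backend filesystem.
df = pd.read_excel(
    "s3://my-bucket/report.xlsx",
    storage_options={"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555/"}},
)
```

Note that, per the new docstring below, combining ``storage_options`` with a local path or a file-like buffer raises an error.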
diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py
index b1bbda4a4b7e0..aaef71910c9ab 100644
--- a/pandas/io/excel/_base.py
+++ b/pandas/io/excel/_base.py
@@ -3,11 +3,12 @@
 from io import BufferedIOBase, BytesIO, RawIOBase
 import os
 from textwrap import fill
-from typing import Union
+from typing import Any, Mapping, Union
 
 from pandas._config import config
 
 from pandas._libs.parsers import STR_NA_VALUES
+from pandas._typing import StorageOptions
 from pandas.errors import EmptyDataError
 from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments
 
@@ -199,6 +200,15 @@
     Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than
     'X'...'X'. Passing in False will cause data to be overwritten if there
     are duplicate names in the columns.
+storage_options : StorageOptions
+    Extra options that make sense for a particular storage connection, e.g.
+    host, port, username, password, etc., if using a URL that will
+    be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error
+    will be raised if providing this argument with a local path or
+    a file-like buffer. See the fsspec and backend storage implementation
+    docs for the set of allowed keys and values.
+
+    .. versionadded:: 1.2.0
 
 Returns
 -------
@@ -298,10 +308,11 @@ def read_excel(
     skipfooter=0,
     convert_float=True,
     mangle_dupe_cols=True,
+    storage_options: StorageOptions = None,
 ):
 
     if not isinstance(io, ExcelFile):
-        io = ExcelFile(io, engine=engine)
+        io = ExcelFile(io, storage_options=storage_options, engine=engine)
     elif engine and engine != io.engine:
         raise ValueError(
             "Engine should not be specified when passing "
@@ -336,12 +347,14 @@
 
 
 class _BaseExcelReader(metaclass=abc.ABCMeta):
-    def __init__(self, filepath_or_buffer):
+    def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None):
         # If filepath_or_buffer is a url, load the data into a BytesIO
         if is_url(filepath_or_buffer):
             filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read())
         elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)):
-            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer)
+            filepath_or_buffer, _, _, _ = get_filepath_or_buffer(
+                filepath_or_buffer, storage_options=storage_options
+            )
 
         if isinstance(filepath_or_buffer, self._workbook_class):
             self.book = filepath_or_buffer
@@ -837,14 +850,16 @@ class ExcelFile:
     from pandas.io.excel._pyxlsb import _PyxlsbReader
     from pandas.io.excel._xlrd import _XlrdReader
 
-    _engines = {
+    _engines: Mapping[str, Any] = {
        "xlrd": _XlrdReader,
        "openpyxl": _OpenpyxlReader,
        "odf": _ODFReader,
        "pyxlsb": _PyxlsbReader,
     }
 
-    def __init__(self, path_or_buffer, engine=None):
+    def __init__(
+        self, path_or_buffer, engine=None, storage_options: StorageOptions = None
+    ):
         if engine is None:
             engine = "xlrd"
         if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)):
@@ -858,13 +873,14 @@ def __init__(self, path_or_buffer, engine=None):
             raise ValueError(f"Unknown engine: {engine}")
 
         self.engine = engine
+        self.storage_options = storage_options
 
         # Could be a str, ExcelFile, Book, etc.
         self.io = path_or_buffer
         # Always a string
         self._io = stringify_path(path_or_buffer)
 
-        self._reader = self._engines[engine](self._io)
+        self._reader = self._engines[engine](self._io, storage_options=storage_options)
 
     def __fspath__(self):
         return self._io
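The same option threads through ``ExcelFile``, which is what ``read_excel`` constructs internally; a sketch under the same hypothetical endpoint and the test suite's ``pandas-test`` bucket:

```python
import pandas as pd

opts = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555/"}}

# ExcelFile stores storage_options and hands them to the engine reader,
# which forwards them to get_filepath_or_buffer (and so to fsspec).
with pd.ExcelFile("s3://pandas-test/test1.xlsx", storage_options=opts) as xls:
    df = pd.read_excel(xls, sheet_name=0)
```

Passing an ``engine`` that disagrees with an already-constructed ``ExcelFile`` still raises ``ValueError``, per the branch above.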
diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py
index 44abaf5d3b3c9..a6cd8f524503b 100644
--- a/pandas/io/excel/_odfreader.py
+++ b/pandas/io/excel/_odfreader.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from pandas._typing import FilePathOrBuffer, Scalar
+from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
 from pandas.compat._optional import import_optional_dependency
 
 import pandas as pd
@@ -16,13 +16,19 @@ class _ODFReader(_BaseExcelReader):
 
     Parameters
     ----------
-    filepath_or_buffer: string, path to be parsed or
+    filepath_or_buffer : string, path to be parsed or
         an open readable stream.
+    storage_options : StorageOptions
+        Passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``).
     """
 
-    def __init__(self, filepath_or_buffer: FilePathOrBuffer):
+    def __init__(
+        self,
+        filepath_or_buffer: FilePathOrBuffer,
+        storage_options: StorageOptions = None,
+    ):
         import_optional_dependency("odf")
-        super().__init__(filepath_or_buffer)
+        super().__init__(filepath_or_buffer, storage_options=storage_options)
 
     @property
     def _workbook_class(self):
diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py
index 03a30cbd62f9a..73239190604db 100644
--- a/pandas/io/excel/_openpyxl.py
+++ b/pandas/io/excel/_openpyxl.py
@@ -2,7 +2,7 @@
 
 import numpy as np
 
-from pandas._typing import FilePathOrBuffer, Scalar
+from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.io.excel._base import ExcelWriter, _BaseExcelReader
@@ -467,7 +467,11 @@ def write_cells(
 
 class _OpenpyxlReader(_BaseExcelReader):
-    def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None:
+    def __init__(
+        self,
+        filepath_or_buffer: FilePathOrBuffer,
+        storage_options: StorageOptions = None,
+    ) -> None:
         """
         Reader using openpyxl engine.
 
         Parameters
@@ -475,9 +479,11 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None:
         ----------
         filepath_or_buffer : string, path object or Workbook
             Object to be parsed.
+        storage_options : StorageOptions
+            Passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``).
         """
         import_optional_dependency("openpyxl")
-        super().__init__(filepath_or_buffer)
+        super().__init__(filepath_or_buffer, storage_options=storage_options)
 
     @property
     def _workbook_class(self):
diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py
index 0d96c8c4acdb8..c0e281ff6c2da 100644
--- a/pandas/io/excel/_pyxlsb.py
+++ b/pandas/io/excel/_pyxlsb.py
@@ -1,25 +1,31 @@
 from typing import List
 
-from pandas._typing import FilePathOrBuffer, Scalar
+from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.io.excel._base import _BaseExcelReader
 
 
 class _PyxlsbReader(_BaseExcelReader):
-    def __init__(self, filepath_or_buffer: FilePathOrBuffer):
+    def __init__(
+        self,
+        filepath_or_buffer: FilePathOrBuffer,
+        storage_options: StorageOptions = None,
+    ):
         """
         Reader using pyxlsb engine.
 
         Parameters
         ----------
-        filepath_or_buffer: str, path object, or Workbook
+        filepath_or_buffer : str, path object, or Workbook
             Object to be parsed.
+        storage_options : StorageOptions
+            Passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``).
         """
         import_optional_dependency("pyxlsb")
         # This will call load_workbook on the filepath or buffer
         # And set the result to the book-attribute
-        super().__init__(filepath_or_buffer)
+        super().__init__(filepath_or_buffer, storage_options=storage_options)
 
     @property
     def _workbook_class(self):
diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py
index af82c15fd6b66..ff1b3c8bdb964 100644
--- a/pandas/io/excel/_xlrd.py
+++ b/pandas/io/excel/_xlrd.py
@@ -2,13 +2,14 @@
 
 import numpy as np
 
+from pandas._typing import StorageOptions
 from pandas.compat._optional import import_optional_dependency
 
 from pandas.io.excel._base import _BaseExcelReader
 
 
 class _XlrdReader(_BaseExcelReader):
-    def __init__(self, filepath_or_buffer):
+    def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None):
         """
         Reader using xlrd engine.
 
@@ -16,10 +17,12 @@ def __init__(self, filepath_or_buffer):
     ----------
         filepath_or_buffer : string, path object or Workbook
             Object to be parsed.
+        storage_options : StorageOptions
+            Passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``).
         """
         err_msg = "Install xlrd >= 1.0.0 for Excel support"
         import_optional_dependency("xlrd", extra=err_msg)
-        super().__init__(filepath_or_buffer)
+        super().__init__(filepath_or_buffer, storage_options=storage_options)
 
     @property
     def _workbook_class(self):
diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py
index 2c664e73b9463..2d86fa44f22a4 100644
--- a/pandas/io/feather_format.py
+++ b/pandas/io/feather_format.py
@@ -1,5 +1,6 @@
 """ feather-format compat """
 
+from pandas._typing import StorageOptions
 from pandas.compat._optional import import_optional_dependency
 
 from pandas import DataFrame, Int64Index, RangeIndex
@@ -7,7 +8,7 @@
 from pandas.io.common import get_filepath_or_buffer
 
 
-def to_feather(df: DataFrame, path, storage_options=None, **kwargs):
+def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kwargs):
     """
     Write a DataFrame to the binary Feather format.
 
@@ -77,7 +78,9 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs):
     feather.write_feather(df, path, **kwargs)
 
 
-def read_feather(path, columns=None, use_threads: bool = True, storage_options=None):
+def read_feather(
+    path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None
+):
     """
     Load a feather-format object from the file path.
 
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 5d49757ce7d58..983aa56324083 100644
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -20,7 +20,7 @@
 import pandas._libs.parsers as parsers
 from pandas._libs.parsers import STR_NA_VALUES
 from pandas._libs.tslibs import parsing
-from pandas._typing import FilePathOrBuffer, Union
+from pandas._typing import FilePathOrBuffer, StorageOptions, Union
 from pandas.errors import (
     AbstractMethodError,
     EmptyDataError,
@@ -596,7 +596,7 @@ def read_csv(
     low_memory=_c_parser_defaults["low_memory"],
     memory_map=False,
     float_precision=None,
-    storage_options=None,
+    storage_options: StorageOptions = None,
 ):
     # gh-23761
     #
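With ``read_csv``, ``read_feather``, and the Excel readers all accepting the same ``StorageOptions`` mapping, one dict can serve every reader; a sketch using the moto endpoint and fixture-created bucket/keys from the test suite below (nothing here points at a real service):

```python
import pandas as pd

# One options mapping serves every fsspec-backed reader, since each one
# forwards it to get_filepath_or_buffer.
s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555/"}}

csv_df = pd.read_csv("s3://pandas-test/tips.csv", storage_options=s3so)
feather_df = pd.read_feather(
    "s3://pandas-test/simple_dataset.feather", storage_options=s3so
)
```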
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py
index fcee25c258efa..518f31d73efa9 100644
--- a/pandas/tests/io/conftest.py
+++ b/pandas/tests/io/conftest.py
@@ -1,4 +1,7 @@
 import os
+import shlex
+import subprocess
+import time
 
 import pytest
 
@@ -31,10 +34,62 @@ def feather_file(datapath):
 
 
 @pytest.fixture
-def s3_resource(tips_file, jsonl_file, feather_file):
+def s3so():
+    return dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"})
+
+
+@pytest.fixture(scope="module")
+def s3_base():
     """
     Fixture for mocking S3 interaction.
 
+    Sets up moto server in a separate process.
+    """
+    pytest.importorskip("s3fs")
+    pytest.importorskip("boto3")
+    requests = pytest.importorskip("requests")
+
+    with tm.ensure_safe_environment_variables():
+        # temporary workaround as moto fails for botocore >= 1.11 otherwise,
+        # see https://github.com/spulec/moto/issues/1924 & 1952
+        os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
+        os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")
+
+        pytest.importorskip("moto", minversion="1.3.14")
+        pytest.importorskip("flask")  # server mode needs flask too
+
+        # Launching moto in server mode, i.e., as a separate process
+        # with an S3 endpoint on localhost
+
+        endpoint_uri = "http://127.0.0.1:5555/"
+
+        # pipe to null to avoid logging in terminal
+        proc = subprocess.Popen(
+            shlex.split("moto_server s3 -p 5555"), stdout=subprocess.DEVNULL
+        )
+
+        timeout = 5
+        while timeout > 0:
+            try:
+                # OK to go once server is accepting connections
+                r = requests.get(endpoint_uri)
+                if r.ok:
+                    break
+            except Exception:
+                pass
+            timeout -= 0.1
+            time.sleep(0.1)
+        yield
+
+        proc.terminate()
+        proc.wait()
+
+
+@pytest.fixture()
+def s3_resource(s3_base, tips_file, jsonl_file, feather_file):
+    """
+    Sets up S3 bucket with contents.
+
     The primary bucket name is "pandas-test". The following datasets
     are loaded.
 
@@ -46,45 +101,59 @@ def s3_resource(tips_file, jsonl_file, feather_file):
     A private bucket "cant_get_it" is also created. The boto3 s3 resource
     is yielded by the fixture.
     """
-    s3fs = pytest.importorskip("s3fs")
-    boto3 = pytest.importorskip("boto3")
-
-    with tm.ensure_safe_environment_variables():
-        # temporary workaround as moto fails for botocore >= 1.11 otherwise,
-        # see https://github.com/spulec/moto/issues/1924 & 1952
-        os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
-        os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")
-
-        moto = pytest.importorskip("moto")
-
-        test_s3_files = [
-            ("tips#1.csv", tips_file),
-            ("tips.csv", tips_file),
-            ("tips.csv.gz", tips_file + ".gz"),
-            ("tips.csv.bz2", tips_file + ".bz2"),
-            ("items.jsonl", jsonl_file),
-            ("simple_dataset.feather", feather_file),
-        ]
-
-        def add_tips_files(bucket_name):
-            for s3_key, file_name in test_s3_files:
-                with open(file_name, "rb") as f:
-                    conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f)
-
-        try:
-            s3 = moto.mock_s3()
-            s3.start()
-
-            # see gh-16135
-            bucket = "pandas-test"
-            conn = boto3.resource("s3", region_name="us-east-1")
-
-            conn.create_bucket(Bucket=bucket)
-            add_tips_files(bucket)
-
-            conn.create_bucket(Bucket="cant_get_it", ACL="private")
-            add_tips_files("cant_get_it")
-            s3fs.S3FileSystem.clear_instance_cache()
-            yield conn
-        finally:
-            s3.stop()
+    import boto3
+    import s3fs
+
+    test_s3_files = [
+        ("tips#1.csv", tips_file),
+        ("tips.csv", tips_file),
+        ("tips.csv.gz", tips_file + ".gz"),
+        ("tips.csv.bz2", tips_file + ".bz2"),
+        ("items.jsonl", jsonl_file),
+        ("simple_dataset.feather", feather_file),
+    ]
+
+    def add_tips_files(bucket_name):
+        for s3_key, file_name in test_s3_files:
+            with open(file_name, "rb") as f:
+                cli.put_object(Bucket=bucket_name, Key=s3_key, Body=f)
+
+    bucket = "pandas-test"
+    endpoint_uri = "http://127.0.0.1:5555/"
+    conn = boto3.resource("s3", endpoint_url=endpoint_uri)
+    cli = boto3.client("s3", endpoint_url=endpoint_uri)
+
+    try:
+        cli.create_bucket(Bucket=bucket)
+    except Exception:
+        # OK if bucket already exists
+        pass
+    try:
+        cli.create_bucket(Bucket="cant_get_it", ACL="private")
+    except Exception:
+        # OK if bucket already exists
+        pass
+    timeout = 2
+    while not cli.list_buckets()["Buckets"] and timeout > 0:
+        time.sleep(0.1)
+        timeout -= 0.1
+
+    add_tips_files(bucket)
+    add_tips_files("cant_get_it")
+    s3fs.S3FileSystem.clear_instance_cache()
+    yield conn
+
+    s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"})
+
+    try:
+        s3.rm(bucket, recursive=True)
+    except Exception:
+        pass
+    try:
+        s3.rm("cant_get_it", recursive=True)
+    except Exception:
+        pass
+    timeout = 2
+    while cli.list_buckets()["Buckets"] and timeout > 0:
+        time.sleep(0.1)
+        timeout -= 0.1
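Outside pytest, the ``s3_base``/``s3_resource`` pair above collapses to roughly this sequence (a sketch: the bucket, key, and CSV payload are made up, and ``moto``, ``flask``, ``boto3``, ``s3fs``, and ``requests`` must be installed):

```python
import os
import shlex
import subprocess
import time

import boto3
import pandas as pd
import requests

ENDPOINT = "http://127.0.0.1:5555/"

# Dummy credentials, as in the s3_base fixture; moto accepts anything.
os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")

# Start moto in server mode and poll until it accepts connections.
proc = subprocess.Popen(
    shlex.split("moto_server s3 -p 5555"), stdout=subprocess.DEVNULL
)
try:
    for _ in range(50):
        try:
            if requests.get(ENDPOINT).ok:
                break
        except requests.ConnectionError:
            pass
        time.sleep(0.1)

    # Set up state with boto3, then exercise pandas with storage_options:
    # the same split of responsibilities the fixtures use.
    client = boto3.client("s3", endpoint_url=ENDPOINT)
    client.create_bucket(Bucket="pandas-test")
    client.put_object(Bucket="pandas-test", Key="demo.csv", Body=b"a,b\n1,2\n")

    df = pd.read_csv(
        "s3://pandas-test/demo.csv",
        storage_options={"client_kwargs": {"endpoint_url": ENDPOINT}},
    )
    print(df)
finally:
    proc.terminate()
    proc.wait()
```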
diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py
index 51fbbf836a03f..431a50477fccc 100644
--- a/pandas/tests/io/excel/test_readers.py
+++ b/pandas/tests/io/excel/test_readers.py
@@ -606,13 +606,14 @@ def test_read_from_http_url(self, read_ext):
         tm.assert_frame_equal(url_table, local_table)
 
     @td.skip_if_not_us_locale
-    def test_read_from_s3_url(self, read_ext, s3_resource):
+    def test_read_from_s3_url(self, read_ext, s3_resource, s3so):
         # Bucket "pandas-test" created in tests/io/conftest.py
         with open("test1" + read_ext, "rb") as f:
             s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f)
 
         url = "s3://pandas-test/test1" + read_ext
-        url_table = pd.read_excel(url)
+
+        url_table = pd.read_excel(url, storage_options=s3so)
         local_table = pd.read_excel("test1" + read_ext)
         tm.assert_frame_equal(url_table, local_table)
 
diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py
index 182c21ed1d416..5bb205842269e 100644
--- a/pandas/tests/io/json/test_compression.py
+++ b/pandas/tests/io/json/test_compression.py
@@ -44,7 +44,11 @@ def test_with_s3_url(compression, s3_resource):
     with open(path, "rb") as f:
         s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f)
 
-    roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression)
+    roundtripped_df = pd.read_json(
+        "s3://pandas-test/test-1",
+        compression=compression,
+        storage_options=dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}),
+    )
     tm.assert_frame_equal(df, roundtripped_df)
 
 
diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py
index 1280d0fd434d5..64a666079876f 100644
--- a/pandas/tests/io/json/test_pandas.py
+++ b/pandas/tests/io/json/test_pandas.py
@@ -1213,10 +1213,12 @@ def test_read_inline_jsonl(self):
         tm.assert_frame_equal(result, expected)
 
     @td.skip_if_not_us_locale
-    def test_read_s3_jsonl(self, s3_resource):
+    def test_read_s3_jsonl(self, s3_resource, s3so):
         # GH17200
 
-        result = read_json("s3n://pandas-test/items.jsonl", lines=True)
+        result = read_json(
+            "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so
+        )
         expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"])
         tm.assert_frame_equal(result, expected)
 
@@ -1706,7 +1708,12 @@ def test_to_s3(self, s3_resource):
         # GH 28375
         mock_bucket_name, target_file = "pandas-test", "test.json"
         df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
-        df.to_json(f"s3://{mock_bucket_name}/{target_file}")
+        df.to_json(
+            f"s3://{mock_bucket_name}/{target_file}",
+            storage_options=dict(
+                client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}
+            ),
+        )
         timeout = 5
         while True:
             if target_file in (
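The JSON tests above restate the endpoint dict inline rather than taking the ``s3so`` fixture the other suites use; a hypothetical refactor of ``test_with_s3_url`` shows how they could share it (the frame literal is illustrative, not the actual test data):

```python
import pandas as pd
import pandas._testing as tm


def test_with_s3_url(compression, s3_resource, s3so):
    # Same flow as the test above, but taking the s3so fixture instead of
    # restating the endpoint dict inline.
    df = pd.read_json('{"a": [1, 2, 3], "b": [4, 5, 6]}')

    with tm.ensure_clean() as path:
        df.to_json(path, compression=compression)
        with open(path, "rb") as f:
            s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f)

    roundtripped_df = pd.read_json(
        "s3://pandas-test/test-1", compression=compression, storage_options=s3so
    )
    tm.assert_frame_equal(df, roundtripped_df)
```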
diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py
index b30a7b1ef34de..b8b03cbd14a1d 100644
--- a/pandas/tests/io/parser/test_network.py
+++ b/pandas/tests/io/parser/test_network.py
@@ -71,50 +71,62 @@ def tips_df(datapath):
 
 @td.skip_if_not_us_locale()
 class TestS3:
     @td.skip_if_no("s3fs")
-    def test_parse_public_s3_bucket(self, tips_df):
+    def test_parse_public_s3_bucket(self, tips_df, s3so):
         # more of an integration test due to the not-public contents portion
         # can probably mock this though.
         for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
-            df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp)
+            df = read_csv(
+                "s3://pandas-test/tips.csv" + ext,
+                compression=comp,
+                storage_options=s3so,
+            )
             assert isinstance(df, DataFrame)
             assert not df.empty
             tm.assert_frame_equal(df, tips_df)
 
         # Read public file from bucket with not-public contents
-        df = read_csv("s3://cant_get_it/tips.csv")
+        df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3so)
         assert isinstance(df, DataFrame)
         assert not df.empty
         tm.assert_frame_equal(df, tips_df)
 
-    def test_parse_public_s3n_bucket(self, tips_df):
+    def test_parse_public_s3n_bucket(self, tips_df, s3so):
         # Read from AWS s3 as "s3n" URL
-        df = read_csv("s3n://pandas-test/tips.csv", nrows=10)
+        df = read_csv("s3n://pandas-test/tips.csv", nrows=10, storage_options=s3so)
         assert isinstance(df, DataFrame)
         assert not df.empty
         tm.assert_frame_equal(tips_df.iloc[:10], df)
 
-    def test_parse_public_s3a_bucket(self, tips_df):
+    def test_parse_public_s3a_bucket(self, tips_df, s3so):
         # Read from AWS s3 as "s3a" URL
-        df = read_csv("s3a://pandas-test/tips.csv", nrows=10)
+        df = read_csv("s3a://pandas-test/tips.csv", nrows=10, storage_options=s3so)
         assert isinstance(df, DataFrame)
         assert not df.empty
         tm.assert_frame_equal(tips_df.iloc[:10], df)
 
-    def test_parse_public_s3_bucket_nrows(self, tips_df):
+    def test_parse_public_s3_bucket_nrows(self, tips_df, s3so):
         for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
-            df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp)
+            df = read_csv(
+                "s3://pandas-test/tips.csv" + ext,
+                nrows=10,
+                compression=comp,
+                storage_options=s3so,
+            )
             assert isinstance(df, DataFrame)
             assert not df.empty
             tm.assert_frame_equal(tips_df.iloc[:10], df)
 
-    def test_parse_public_s3_bucket_chunked(self, tips_df):
+    def test_parse_public_s3_bucket_chunked(self, tips_df, s3so):
         # Read with a chunksize
         chunksize = 5
         for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
             df_reader = read_csv(
-                "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp
+                "s3://pandas-test/tips.csv" + ext,
+                chunksize=chunksize,
+                compression=comp,
+                storage_options=s3so,
             )
             assert df_reader.chunksize == chunksize
             for i_chunk in [0, 1, 2]:
@@ -126,7 +138,7 @@ def test_parse_public_s3_bucket_chunked(self, tips_df):
             true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)]
             tm.assert_frame_equal(true_df, df)
 
-    def test_parse_public_s3_bucket_chunked_python(self, tips_df):
+    def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so):
         # Read with a chunksize using the Python parser
         chunksize = 5
         for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]:
             df_reader = read_csv(
@@ -135,6 +147,7 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df):
                 chunksize=chunksize,
                 compression=comp,
                 engine="python",
+                storage_options=s3so,
             )
             assert df_reader.chunksize == chunksize
             for i_chunk in [0, 1, 2]:
None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp + "s3://pandas-test/tips.csv" + ext, + engine="python", + compression=comp, + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_infer_s3_compression(self, tips_df): + def test_infer_s3_compression(self, tips_df, s3so): for ext in ["", ".gz", ".bz2"]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer" + "s3://pandas-test/tips.csv" + ext, + engine="python", + compression="infer", + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3_bucket_nrows_python(self, tips_df): + def test_parse_public_s3_bucket_nrows_python(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( "s3://pandas-test/tips.csv" + ext, engine="python", nrows=10, compression=comp, + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_read_s3_fails(self): + def test_read_s3_fails(self, s3so): with pytest.raises(IOError): - read_csv("s3://nyqpug/asdf.csv") + read_csv("s3://nyqpug/asdf.csv", storage_options=s3so) # Receive a permission error when trying to read a private bucket. # It's irrelevant here that this isn't actually a table. with pytest.raises(IOError): read_csv("s3://cant_get_it/file.csv") - def test_write_s3_csv_fails(self, tips_df): + def test_write_s3_csv_fails(self, tips_df, s3so): # GH 32486 # Attempting to write to an invalid S3 path should raise import botocore @@ -195,10 +215,12 @@ def test_write_s3_csv_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_csv("s3://an_s3_bucket_data_doesnt_exit/not_real.csv") + tips_df.to_csv( + "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so + ) @td.skip_if_no("pyarrow") - def test_write_s3_parquet_fails(self, tips_df): + def test_write_s3_parquet_fails(self, tips_df, s3so): # GH 27679 # Attempting to write to an invalid S3 path should raise import botocore @@ -209,7 +231,10 @@ def test_write_s3_parquet_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_parquet("s3://an_s3_bucket_data_doesnt_exit/not_real.parquet") + tips_df.to_parquet( + "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet", + storage_options=s3so, + ) def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): # see gh-16135 @@ -225,7 +250,7 @@ def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): expected = read_csv(tips_file) tm.assert_frame_equal(result, expected) - def test_read_csv_chunked_download(self, s3_resource, caplog): + def test_read_csv_chunked_download(self, s3_resource, caplog, s3so): # 8 MB, S3FS usees 5MB chunks import s3fs @@ -245,18 +270,20 @@ def test_read_csv_chunked_download(self, s3_resource, caplog): s3fs.S3FileSystem.clear_instance_cache() with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv("s3://pandas-test/large-file.csv", nrows=5) + read_csv("s3://pandas-test/large-file.csv", nrows=5, storage_options=s3so) # log of fetch_range (start, stop) assert (0, 5505024) in (x.args[-2:] for x in caplog.records) - def test_read_s3_with_hash_in_key(self, 
diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py
index 3e89f6ca4ae16..666da677d702e 100644
--- a/pandas/tests/io/test_fsspec.py
+++ b/pandas/tests/io/test_fsspec.py
@@ -131,27 +131,38 @@ def test_fastparquet_options(fsspectest):
 
 
 @td.skip_if_no("s3fs")
-def test_from_s3_csv(s3_resource, tips_file):
-    tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file))
+def test_from_s3_csv(s3_resource, tips_file, s3so):
+    tm.assert_equal(
+        read_csv("s3://pandas-test/tips.csv", storage_options=s3so), read_csv(tips_file)
+    )
     # the following are decompressed by pandas, not fsspec
-    tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz"), read_csv(tips_file))
-    tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2"), read_csv(tips_file))
+    tm.assert_equal(
+        read_csv("s3://pandas-test/tips.csv.gz", storage_options=s3so),
+        read_csv(tips_file),
+    )
+    tm.assert_equal(
+        read_csv("s3://pandas-test/tips.csv.bz2", storage_options=s3so),
+        read_csv(tips_file),
+    )
 
 
 @pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"])
 @td.skip_if_no("s3fs")
-def test_s3_protocols(s3_resource, tips_file, protocol):
+def test_s3_protocols(s3_resource, tips_file, protocol, s3so):
     tm.assert_equal(
-        read_csv("%s://pandas-test/tips.csv" % protocol), read_csv(tips_file)
+        read_csv("%s://pandas-test/tips.csv" % protocol, storage_options=s3so),
+        read_csv(tips_file),
     )
 
 
 @td.skip_if_no("s3fs")
 @td.skip_if_no("fastparquet")
-def test_s3_parquet(s3_resource):
+def test_s3_parquet(s3_resource, s3so):
     fn = "s3://pandas-test/test.parquet"
-    df1.to_parquet(fn, index=False, engine="fastparquet", compression=None)
-    df2 = read_parquet(fn, engine="fastparquet")
+    df1.to_parquet(
+        fn, index=False, engine="fastparquet", compression=None, storage_options=s3so
+    )
+    df2 = read_parquet(fn, engine="fastparquet", storage_options=s3so)
     tm.assert_equal(df1, df2)
 
 
diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py
index 82157f3d722a9..3a3ba99484a3a 100644
--- a/pandas/tests/io/test_parquet.py
+++ b/pandas/tests/io/test_parquet.py
@@ -158,6 +158,10 @@ def check_round_trip(
     """
     write_kwargs = write_kwargs or {"compression": None}
     read_kwargs = read_kwargs or {}
+    if isinstance(path, str) and "s3://" in path:
+        s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"})
+        read_kwargs["storage_options"] = s3so
+        write_kwargs["storage_options"] = s3so
 
     if expected is None:
         expected = df
@@ -537,9 +541,11 @@ def test_categorical(self, pa):
             expected = df.astype(object)
             check_round_trip(df, pa, expected=expected)
 
-    def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa):
+    def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so):
         s3fs = pytest.importorskip("s3fs")
-        s3 = s3fs.S3FileSystem()
+        if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"):
+            pytest.skip()
+        s3 = s3fs.S3FileSystem(**s3so)
         kw = dict(filesystem=s3)
         check_round_trip(
             df_compat,
@@ -550,6 +556,8 @@ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa):
         )
 
     def test_s3_roundtrip(self, df_compat, s3_resource, pa):
+        if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"):
+            pytest.skip()
         # GH #19134
         check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet")
 
@@ -560,10 +568,13 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col):
         # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716
         # As per pyarrow partitioned columns become 'categorical' dtypes
         # and are added to back of dataframe on read
+        if partition_col and pd.compat.is_platform_windows():
+            pytest.skip("pyarrow/win incompatibility #35791")
         expected_df = df_compat.copy()
 
         if partition_col:
             expected_df[partition_col] = expected_df[partition_col].astype("category")
+
         check_round_trip(
             df_compat,
             pa,
diff --git a/requirements-dev.txt b/requirements-dev.txt
index deaed8ab9d5f1..2fbb20ddfd3bf 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -32,6 +32,7 @@ boto3
 botocore>=1.11
 hypothesis>=3.82
 moto
+flask
 pytest>=5.0.1
 pytest-cov
 pytest-xdist>=1.21
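Taken together, the round trip that ``check_round_trip`` automates looks roughly like this against the moto server (a sketch with a made-up frame; it assumes pyarrow newer than 0.17.0, matching the version gates above, and a running ``moto_server`` with the ``pandas-test`` bucket already created):

```python
from distutils.version import LooseVersion

import pandas as pd
import pyarrow

s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555/"}}

# Skip old pyarrow the same way the tests do, then write and read back
# through the fake S3 endpoint purely via storage_options.
if LooseVersion(pyarrow.__version__) > LooseVersion("0.17.0"):
    df = pd.DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]})
    df.to_parquet("s3://pandas-test/pyarrow.parquet", storage_options=s3so)
    back = pd.read_parquet("s3://pandas-test/pyarrow.parquet", storage_options=s3so)
    pd.testing.assert_frame_equal(df, back)
```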