From 3a54dde1fb4d5fce731114e437786df2981161ef Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 10 Jul 2020 11:20:55 -0400 Subject: [PATCH 01/59] Pepper storage_options --- pandas/core/frame.py | 8 ++++++-- pandas/io/common.py | 4 ++++ pandas/io/excel/_base.py | 5 +++-- pandas/io/feather_format.py | 5 +++-- pandas/io/formats/csvs.py | 6 ++++-- pandas/io/json/_json.py | 8 +++++--- pandas/io/parquet.py | 7 ++++++- pandas/io/parsers.py | 3 ++- pandas/io/pickle.py | 11 +++++++---- pandas/io/sas/sas7bdat.py | 4 +++- pandas/io/sas/sas_xport.py | 6 ++++-- pandas/io/stata.py | 6 ++++-- 12 files changed, 51 insertions(+), 22 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 87041341ac3a6..09bd24b5543a1 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2217,7 +2217,8 @@ def to_feather(self, path, **kwargs) -> None: """, ) def to_markdown( - self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, **kwargs + self, buf: Optional[IO[str]] = None, mode: Optional[str] = None, + storage_options: Optional[Dict[str, Any]] = None, **kwargs ) -> Optional[str]: kwargs.setdefault("headers", "keys") kwargs.setdefault("tablefmt", "pipe") @@ -2225,7 +2226,8 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode) + buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode, + storage_options=storage_options) assert buf is not None # Help mypy. buf.writelines(result) return None @@ -2238,6 +2240,7 @@ def to_parquet( compression="snappy", index=None, partition_cols=None, + storage_options: Optional[Dict[str, Any]] = None, **kwargs, ) -> None: """ @@ -2328,6 +2331,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, + storage_options=storage_options, **kwargs, ) diff --git a/pandas/io/common.py b/pandas/io/common.py index 51323c5ff3ef5..336de46279325 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -180,6 +180,8 @@ def get_filepath_or_buffer( if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged + if storage_options: + raise ValueError("storage_options passed with non-fsspec URL") req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": @@ -206,6 +208,8 @@ def get_filepath_or_buffer( filepath_or_buffer, mode=mode or "rb", **(storage_options or {}) ).open() return file_obj, encoding, compression, True + elif storage_options: + raise ValueError("storage_options passed with non-fsspec URL") if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 4fa4f158e9c3c..0b102af3e7ed8 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -336,12 +336,13 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer): + def __init__(self, filepath_or_buffer, storage_options=None): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) + filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer, + 
storage_options=storage_options) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index dfa43942fc8b3..e1d72e31b0bcf 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -64,7 +64,8 @@ def to_feather(df: DataFrame, path, **kwargs): feather.write_feather(df, path, **kwargs) -def read_feather(path, columns=None, use_threads: bool = True): +def read_feather(path, columns=None, use_threads: bool = True, + storage_options=None): """ Load a feather-format object from the file path. @@ -98,7 +99,7 @@ def read_feather(path, columns=None, use_threads: bool = True): import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close = get_filepath_or_buffer(path) + path, _, _, should_close = get_filepath_or_buffer(path, storage_options=storage_options) df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 5bd51dc8351f6..61f58ff9579ea 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -12,7 +12,7 @@ import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, Dict, Any from pandas.core.dtypes.generic import ( ABCDatetimeIndex, @@ -54,6 +54,7 @@ def __init__( doublequote: bool = True, escapechar: Optional[str] = None, decimal=".", + storage_options: Optional[Dict[str, Any]] = None ): self.obj = obj @@ -64,7 +65,8 @@ def __init__( compression, self.compression_args = get_compression_method(compression) self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, mode=mode + path_or_buf, encoding=encoding, compression=compression, mode=mode, + storage_options=storage_options ) self.sep = sep self.na_rep = na_rep diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ff37c36962aec..93a9fa700e701 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -3,7 +3,7 @@ from io import BytesIO, StringIO from itertools import islice import os -from typing import Any, Callable, Optional, Type +from typing import Any, Callable, Optional, Type, Dict import numpy as np @@ -44,6 +44,7 @@ def to_json( compression: Optional[str] = "infer", index: bool = True, indent: int = 0, + storage_options: Optional[Dict[str, Any]] = None ): if not index and orient not in ["split", "table"]: @@ -53,7 +54,7 @@ def to_json( if path_or_buf is not None: path_or_buf, _, _, _ = get_filepath_or_buffer( - path_or_buf, compression=compression, mode="w" + path_or_buf, compression=compression, mode="w", storage_options=storage_options ) if lines and orient != "records": @@ -364,6 +365,7 @@ def read_json( chunksize: Optional[int] = None, compression="infer", nrows: Optional[int] = None, + storage_options: Optional[Dict[str, Any]] = None ): """ Convert a JSON string to pandas object. 
@@ -591,7 +593,7 @@ def read_json( compression = infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression + path_or_buf, encoding=encoding, compression=compression, storage_options=storage_options ) json_reader = JsonReader( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index a0c9242684f0f..0a5b9ab77fd96 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -89,6 +89,7 @@ def write( compression="snappy", index: Optional[bool] = None, partition_cols=None, + storage_options=None, **kwargs, ): self.validate_dataframe(df) @@ -104,9 +105,11 @@ def write( import_optional_dependency("fsspec") import fsspec.core - fs, path = fsspec.core.url_to_fs(path) + fs, path = fsspec.core.url_to_fs(path, **(storage_options or {})) kwargs["filesystem"] = fs else: + if storage_options: + raise ValueError("storage_options passed with non-fsspec URL") path = _expand_user(path) if partition_cols is not None: # writes to multiple files under the given path @@ -218,6 +221,7 @@ def to_parquet( compression="snappy", index: Optional[bool] = None, partition_cols=None, + storage_options: Optional[Dict[str, Any]] = None, **kwargs, ): """ @@ -268,6 +272,7 @@ def to_parquet( compression=compression, index=index, partition_cols=partition_cols, + storage_options=storage_options, **kwargs, ) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index c427d3a198b10..f8f912a01d742 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -420,6 +420,7 @@ def _validate_names(names): def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" encoding = kwds.get("encoding", None) + storage_options = kwds.get('storage_options', None) if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding @@ -432,7 +433,7 @@ def _read(filepath_or_buffer: FilePathOrBuffer, kwds): # though mypy handling of conditional imports is difficult. # See https://github.com/python/mypy/issues/1297 fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, encoding, compression + filepath_or_buffer, encoding, compression, storage_options=storage_options ) kwds["compression"] = compression diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 3b35b54a6dc16..5f591d50bdd6b 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -3,7 +3,7 @@ from typing import Any, Optional import warnings -from pandas._typing import FilePathOrBuffer +from pandas._typing import FilePathOrBuffer, Dict from pandas.compat import pickle_compat as pc from pandas.io.common import get_filepath_or_buffer, get_handle @@ -14,6 +14,7 @@ def to_pickle( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, + storage_options: Optional[Dict[str, Any]] = None ): """ Pickle (serialize) object to file. 
@@ -76,7 +77,8 @@ def to_pickle( >>> os.remove("./dummy.pkl") """ fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, compression=compression, mode="wb" + filepath_or_buffer, compression=compression, mode="wb", + storage_options=storage_options ) if not isinstance(fp_or_buf, str) and compression == "infer": compression = None @@ -97,7 +99,8 @@ def to_pickle( def read_pickle( - filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer" + filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer", + storage_options: Optional[Dict[str, Any]] = None ): """ Load pickled pandas object (or any object) from file. @@ -162,7 +165,7 @@ def read_pickle( >>> os.remove("./dummy.pkl") """ fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, compression=compression + filepath_or_buffer, compression=compression, storage_options=storage_options ) if not isinstance(fp_or_buf, str) and compression == "infer": compression = None diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 3d9be7c15726b..7f788abceb5c0 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -110,6 +110,7 @@ def __init__( encoding=None, convert_text=True, convert_header_text=True, + storage_options=None ): self.index = index @@ -137,7 +138,8 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) + self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf, + storage_options=storage_options) if isinstance(self._path_or_buf, str): self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 7fc1bc6d3eb6c..56c827a944efa 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -244,7 +244,8 @@ class XportReader(ReaderBase, abc.Iterator): __doc__ = _xport_reader_doc def __init__( - self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None + self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None, + storage_options=None, ): self._encoding = encoding @@ -258,7 +259,8 @@ def __init__( encoding, compression, should_close, - ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) + ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, + storage_options=storage_options) if isinstance(filepath_or_buffer, (str, bytes)): self.filepath_or_buffer = open(filepath_or_buffer, "rb") diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7677d8a94d521..39a3755a3bc5c 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1035,6 +1035,7 @@ def __init__( columns: Optional[Sequence[str]] = None, order_categoricals: bool = True, chunksize: Optional[int] = None, + storage_options: Optional[Dict[str, Any]] = None, ): super().__init__() self.col_sizes: List[int] = [] @@ -1068,11 +1069,12 @@ def __init__( self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf) + path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf, + storage_options=storage_options) if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") - elif isinstance(path_or_buf, IOBase): + elif hasattr(path_or_buf, "read"): # Copy to BytesIO, and ensure no encoding contents = 
path_or_buf.read() self.path_or_buf = BytesIO(contents) From e549f8dde28c7c6ebd81516cc0a2fb28d3a3a1f0 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 10:09:29 -0400 Subject: [PATCH 02/59] Add feather test --- pandas/conftest.py | 18 ++++++++++++++++++ pandas/io/feather_format.py | 5 +++-- pandas/tests/io/test_feather.py | 9 +++++++++ 3 files changed, 30 insertions(+), 2 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index e0adb37e7d2f5..0979c80f3f0c9 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1224,3 +1224,21 @@ def sort_by_key(request): Tests None (no key) and the identity key. """ return request.param + + +@pytest.fixture() +def fsspectest(): + pytest.importorskip('fsspec') + from fsspec.implementations.memory import MemoryFileSystem + from fsspec import register_implementation + + class TestMemoryFS(MemoryFileSystem): + protocol = 'testmem' + test = [None] + + def __init__(self, **kwargs): + self.test[0] = kwargs.pop('test', None) + super().__init__(**kwargs) + + register_implementation('testmem', TestMemoryFS, True) + return TestMemoryFS() diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index e1d72e31b0bcf..653e3c51fab45 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -7,7 +7,7 @@ from pandas.io.common import get_filepath_or_buffer, stringify_path -def to_feather(df: DataFrame, path, **kwargs): +def to_feather(df: DataFrame, path, storage_options=None, **kwargs): """ Write a DataFrame to the binary Feather format. @@ -23,7 +23,8 @@ def to_feather(df: DataFrame, path, **kwargs): import_optional_dependency("pyarrow") from pyarrow import feather - path = stringify_path(path) + path, _, _, should_close = get_filepath_or_buffer( + path, mode='wb', storage_options=storage_options) if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index a8a5c8f00e6bf..767050a00ea7c 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -186,3 +186,12 @@ def test_http_path(self, feather_file): expected = pd.read_feather(feather_file) res = pd.read_feather(url) tm.assert_frame_equal(expected, res) + + +def test_fsspec_options(fsspectest): + df = pd.DataFrame({'a': [0]}) + df.to_feather('testmem://afile', storage_options={'test': 'feather_write'}) + assert fsspectest.test[0] == "feather_write" + out = pd.read_feather('testmem://afile', storage_options={'test': 'feather_read'}) + assert fsspectest.test[0] == "feather_read" + tm.assert_frame_equal(df, out) From 0034bff3cf8d04b9e4c969d0ea77bf10aad2b5e2 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 11:47:00 -0400 Subject: [PATCH 03/59] Add CSV and parquet options tests; lint --- pandas/conftest.py | 8 +++--- pandas/core/frame.py | 5 ++-- pandas/core/generic.py | 2 ++ pandas/io/excel/_base.py | 5 ++-- pandas/io/feather_format.py | 12 +++++---- pandas/io/formats/csvs.py | 9 ++++--- pandas/io/json/_json.py | 14 +++++++--- pandas/io/parquet.py | 30 ++++++++++++++++----- pandas/io/parsers.py | 4 ++- pandas/io/pickle.py | 13 +++++---- pandas/io/sas/sas7bdat.py | 7 ++--- pandas/io/sas/sas_xport.py | 11 +++++--- pandas/io/stata.py | 7 ++--- pandas/tests/io/test_feather.py | 6 ++--- pandas/tests/io/test_fsspec.py | 48 +++++++++++++++++++++++++++++++++ pandas/tests/io/test_s3.py | 2 +- 16 files changed, 137 insertions(+), 46 deletions(-) diff --git a/pandas/conftest.py b/pandas/conftest.py index 
0979c80f3f0c9..52a1a2678b6f6 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1228,17 +1228,17 @@ def sort_by_key(request): @pytest.fixture() def fsspectest(): - pytest.importorskip('fsspec') + pytest.importorskip("fsspec") from fsspec.implementations.memory import MemoryFileSystem from fsspec import register_implementation class TestMemoryFS(MemoryFileSystem): - protocol = 'testmem' + protocol = "testmem" test = [None] def __init__(self, **kwargs): - self.test[0] = kwargs.pop('test', None) + self.test[0] = kwargs.pop("test", None) super().__init__(**kwargs) - register_implementation('testmem', TestMemoryFS, True) + register_implementation("testmem", TestMemoryFS, True) return TestMemoryFS() diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 141a24f3dbcb7..d3f07c0ec3ff2 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2260,8 +2260,9 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, _ = get_filepath_or_buffer(buf, mode=mode, - storage_options=storage_options) + buf, _, _, _ = get_filepath_or_buffer( + buf, mode=mode, storage_options=storage_options + ) assert buf is not None # Help mypy. buf.writelines(result) return None diff --git a/pandas/core/generic.py b/pandas/core/generic.py index e46fde1f59f16..a129787ad6adf 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -3010,6 +3010,7 @@ def to_csv( escapechar: Optional[str] = None, decimal: Optional[str] = ".", errors: str = "strict", + storage_options: Optional[Dict[str, Any]] = None, ) -> Optional[str]: r""" Write object to a comma-separated values (csv) file. @@ -3163,6 +3164,7 @@ def to_csv( doublequote=doublequote, escapechar=escapechar, decimal=decimal, + storage_options=storage_options, ) formatter.save() diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index e1eb8bf5f05ea..962d974e353e1 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -341,8 +341,9 @@ def __init__(self, filepath_or_buffer, storage_options=None): if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer, - storage_options=storage_options) + filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer, storage_options=storage_options + ) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 653e3c51fab45..80a3ee1496a2e 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -4,7 +4,7 @@ from pandas import DataFrame, Int64Index, RangeIndex -from pandas.io.common import get_filepath_or_buffer, stringify_path +from pandas.io.common import get_filepath_or_buffer def to_feather(df: DataFrame, path, storage_options=None, **kwargs): @@ -24,7 +24,8 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs): from pyarrow import feather path, _, _, should_close = get_filepath_or_buffer( - path, mode='wb', storage_options=storage_options) + path, mode="wb", storage_options=storage_options + ) if not isinstance(df, DataFrame): raise ValueError("feather only support IO with DataFrames") @@ -65,8 +66,7 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs): feather.write_feather(df, path, **kwargs) -def read_feather(path, columns=None, use_threads: bool = True, - 
storage_options=None): +def read_feather(path, columns=None, use_threads: bool = True, storage_options=None): """ Load a feather-format object from the file path. @@ -100,7 +100,9 @@ def read_feather(path, columns=None, use_threads: bool = True, import_optional_dependency("pyarrow") from pyarrow import feather - path, _, _, should_close = get_filepath_or_buffer(path, storage_options=storage_options) + path, _, _, should_close = get_filepath_or_buffer( + path, storage_options=storage_options + ) df = feather.read_feather(path, columns=columns, use_threads=bool(use_threads)) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 61f58ff9579ea..dba307c0fad95 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -54,7 +54,7 @@ def __init__( doublequote: bool = True, escapechar: Optional[str] = None, decimal=".", - storage_options: Optional[Dict[str, Any]] = None + storage_options: Optional[Dict[str, Any]] = None, ): self.obj = obj @@ -65,8 +65,11 @@ def __init__( compression, self.compression_args = get_compression_method(compression) self.path_or_buf, _, _, self.should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, mode=mode, - storage_options=storage_options + path_or_buf, + encoding=encoding, + compression=compression, + mode=mode, + storage_options=storage_options, ) self.sep = sep self.na_rep = na_rep diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 93a9fa700e701..429edf323cb0e 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -44,7 +44,7 @@ def to_json( compression: Optional[str] = "infer", index: bool = True, indent: int = 0, - storage_options: Optional[Dict[str, Any]] = None + storage_options: Optional[Dict[str, Any]] = None, ): if not index and orient not in ["split", "table"]: @@ -54,7 +54,10 @@ def to_json( if path_or_buf is not None: path_or_buf, _, _, _ = get_filepath_or_buffer( - path_or_buf, compression=compression, mode="w", storage_options=storage_options + path_or_buf, + compression=compression, + mode="w", + storage_options=storage_options, ) if lines and orient != "records": @@ -365,7 +368,7 @@ def read_json( chunksize: Optional[int] = None, compression="infer", nrows: Optional[int] = None, - storage_options: Optional[Dict[str, Any]] = None + storage_options: Optional[Dict[str, Any]] = None, ): """ Convert a JSON string to pandas object. 
@@ -593,7 +596,10 @@ def read_json( compression = infer_compression(path_or_buf, compression) filepath_or_buffer, _, compression, should_close = get_filepath_or_buffer( - path_or_buf, encoding=encoding, compression=compression, storage_options=storage_options + path_or_buf, + encoding=encoding, + compression=compression, + storage_options=storage_options, ) json_reader = JsonReader( diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 31af355c55bf5..7a66ca1b19587 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -89,8 +89,7 @@ def write( path: FilePathOrBuffer[AnyStr], compression: Optional[str] = "snappy", index: Optional[bool] = None, - partition_cols=None, - storage_options=None, + storage_options: Optional[Dict[str, Any]] = None, partition_cols: Optional[List[str]] = None, **kwargs, ): @@ -126,12 +125,18 @@ def write( # write to single output file self.api.parquet.write_table(table, path, compression=compression, **kwargs) - def read(self, path, columns=None, **kwargs): + def read( + self, + path, + columns=None, + storage_options: Optional[Dict[str, Any]] = None, + **kwargs, + ): if is_fsspec_url(path) and "filesystem" not in kwargs: import_optional_dependency("fsspec") import fsspec.core - fs, path = fsspec.core.url_to_fs(path) + fs, path = fsspec.core.url_to_fs(path, **(storage_options or {})) should_close = False else: fs = kwargs.pop("filesystem", None) @@ -167,6 +172,7 @@ def write( compression="snappy", index=None, partition_cols=None, + storage_options: Optional[Dict[str, Any]] = None, **kwargs, ): self.validate_dataframe(df) @@ -189,7 +195,9 @@ def write( fsspec = import_optional_dependency("fsspec") # if filesystem is provided by fsspec, file must be opened in 'wb' mode. - kwargs["open_with"] = lambda path, _: fsspec.open(path, "wb").open() + kwargs["open_with"] = lambda path, _: fsspec.open( + path, "wb", **(storage_options or {}) + ).open() else: path, _, _, _ = get_filepath_or_buffer(path) @@ -203,11 +211,19 @@ def write( **kwargs, ) - def read(self, path, columns=None, **kwargs): + def read( + self, + path, + columns=None, + storage_options: Optional[Dict[str, Any]] = None, + **kwargs, + ): if is_fsspec_url(path): fsspec = import_optional_dependency("fsspec") - open_with = lambda path, _: fsspec.open(path, "rb").open() + open_with = lambda path, _: fsspec.open( + path, "rb", **(storage_options or {}) + ).open() parquet_file = self.api.ParquetFile(path, open_with=open_with) else: path, _, _, _ = get_filepath_or_buffer(path) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8f26870c356a2..9dc0e1f71d13b 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -420,7 +420,7 @@ def _validate_names(names): def _read(filepath_or_buffer: FilePathOrBuffer, kwds): """Generic reader of line files.""" encoding = kwds.get("encoding", None) - storage_options = kwds.get('storage_options', None) + storage_options = kwds.get("storage_options", None) if encoding is not None: encoding = re.sub("_", "-", encoding).lower() kwds["encoding"] = encoding @@ -596,6 +596,7 @@ def read_csv( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, + storage_options=None, ): # gh-23761 # @@ -682,6 +683,7 @@ def read_csv( mangle_dupe_cols=mangle_dupe_cols, infer_datetime_format=infer_datetime_format, skip_blank_lines=skip_blank_lines, + storage_options=storage_options, ) return _read(filepath_or_buffer, kwds) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 5f591d50bdd6b..c845d898dfae9 100644 --- a/pandas/io/pickle.py 
+++ b/pandas/io/pickle.py @@ -14,7 +14,7 @@ def to_pickle( filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, - storage_options: Optional[Dict[str, Any]] = None + storage_options: Optional[Dict[str, Any]] = None, ): """ Pickle (serialize) object to file. @@ -77,8 +77,10 @@ def to_pickle( >>> os.remove("./dummy.pkl") """ fp_or_buf, _, compression, should_close = get_filepath_or_buffer( - filepath_or_buffer, compression=compression, mode="wb", - storage_options=storage_options + filepath_or_buffer, + compression=compression, + mode="wb", + storage_options=storage_options, ) if not isinstance(fp_or_buf, str) and compression == "infer": compression = None @@ -99,8 +101,9 @@ def to_pickle( def read_pickle( - filepath_or_buffer: FilePathOrBuffer, compression: Optional[str] = "infer", - storage_options: Optional[Dict[str, Any]] = None + filepath_or_buffer: FilePathOrBuffer, + compression: Optional[str] = "infer", + storage_options: Optional[Dict[str, Any]] = None, ): """ Load pickled pandas object (or any object) from file. diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 7f788abceb5c0..32ef0b20f8a2a 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -110,7 +110,7 @@ def __init__( encoding=None, convert_text=True, convert_header_text=True, - storage_options=None + storage_options=None, ): self.index = index @@ -138,8 +138,9 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf, - storage_options=storage_options) + self._path_or_buf, _, _, _ = get_filepath_or_buffer( + path_or_buf, storage_options=storage_options + ) if isinstance(self._path_or_buf, str): self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index 56c827a944efa..a768589ae6f9a 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -244,7 +244,11 @@ class XportReader(ReaderBase, abc.Iterator): __doc__ = _xport_reader_doc def __init__( - self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None, + self, + filepath_or_buffer, + index=None, + encoding="ISO-8859-1", + chunksize=None, storage_options=None, ): @@ -259,8 +263,9 @@ def __init__( encoding, compression, should_close, - ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding, - storage_options=storage_options) + ) = get_filepath_or_buffer( + filepath_or_buffer, encoding=encoding, storage_options=storage_options + ) if isinstance(filepath_or_buffer, (str, bytes)): self.filepath_or_buffer = open(filepath_or_buffer, "rb") diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 39a3755a3bc5c..4f6a13ed867d0 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -11,7 +11,7 @@ """ from collections import abc import datetime -from io import BytesIO, IOBase +from io import BytesIO import os from pathlib import Path import struct @@ -1069,8 +1069,9 @@ def __init__( self._native_byteorder = _set_endianness(sys.byteorder) path_or_buf = stringify_path(path_or_buf) if isinstance(path_or_buf, str): - path_or_buf, encoding, _, should_close = get_filepath_or_buffer(path_or_buf, - storage_options=storage_options) + path_or_buf, encoding, _, should_close = get_filepath_or_buffer( + path_or_buf, storage_options=storage_options + ) if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") diff --git 
a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 767050a00ea7c..2356e9c87ea05 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -189,9 +189,9 @@ def test_http_path(self, feather_file): def test_fsspec_options(fsspectest): - df = pd.DataFrame({'a': [0]}) - df.to_feather('testmem://afile', storage_options={'test': 'feather_write'}) + df = pd.DataFrame({"a": [0]}) + df.to_feather("testmem://afile", storage_options={"test": "feather_write"}) assert fsspectest.test[0] == "feather_write" - out = pd.read_feather('testmem://afile', storage_options={'test': 'feather_read'}) + out = pd.read_feather("testmem://afile", storage_options={"test": "feather_read"}) assert fsspectest.test[0] == "feather_read" tm.assert_frame_equal(df, out) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index c397a61616c1c..98a942e7af5fc 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -62,6 +62,16 @@ def test_to_csv(cleared_fs): tm.assert_frame_equal(df1, df2) +def test_csv_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_csv( + "testmem://test/test.csv", storage_options={"test": "csv_write"}, index=False + ) + assert fsspectest.test[0] == "csv_write" + read_csv("testmem://test/test.csv", storage_options={"test": "csv_read"}) + assert fsspectest.test[0] == "csv_read" + + @td.skip_if_no("fastparquet") def test_to_parquet_new_file(monkeypatch, cleared_fs): """Regression test for writing to a not-yet-existent GCS Parquet file.""" @@ -70,6 +80,44 @@ def test_to_parquet_new_file(monkeypatch, cleared_fs): ) +@td.skip_if_no("pyarrow") +def test_arrowparquet_options(fsspectest): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" + df = DataFrame({"a": [0]}) + df.to_parquet( + "testmem://test/test.csv", + engine="pyarrow", + compression=None, + storage_options={"test": "parquet_write"}, + ) + assert fsspectest.test[0] == "parquet_write" + read_parquet( + "testmem://test/test.csv", + engine="pyarrow", + storage_options={"test": "parquet_read"}, + ) + assert fsspectest.test[0] == "parquet_read" + + +@td.skip_if_no("fastparquet") +def test_fastparquet_options(fsspectest): + """Regression test for writing to a not-yet-existent GCS Parquet file.""" + df = DataFrame({"a": [0]}) + df.to_parquet( + "testmem://test/test.csv", + engine="fastparquet", + compression=None, + storage_options={"test": "parquet_write"}, + ) + assert fsspectest.test[0] == "parquet_write" + read_parquet( + "testmem://test/test.csv", + engine="fastparquet", + storage_options={"test": "parquet_read"}, + ) + assert fsspectest.test[0] == "parquet_read" + + @td.skip_if_no("s3fs") def test_from_s3_csv(s3_resource, tips_file): tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file)) diff --git a/pandas/tests/io/test_s3.py b/pandas/tests/io/test_s3.py index 5e0f7edf4d8ae..a137e76b1696b 100644 --- a/pandas/tests/io/test_s3.py +++ b/pandas/tests/io/test_s3.py @@ -32,7 +32,7 @@ def test_read_without_creds_from_pub_bucket(): @tm.network @td.skip_if_no("s3fs") -def test_read_with_creds_from_pub_bucke(): +def test_read_with_creds_from_pub_bucket(): # Ensure we can read from a public bucket with credentials # GH 34626 # Use Amazon Open Data Registry - https://registry.opendata.aws/gdelt From 19f041dbcf02cad68a664e94d10b48d40cfd1c70 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 12:09:08 -0400 Subject: [PATCH 04/59] deeper lint --- pandas/io/formats/csvs.py | 4 ++-- 
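A minimal sketch of the user-facing parquet call exercised by the engine tests above, assuming an fsspec-backed S3 path; the bucket name and the "key"/"secret" values are illustrative s3fs options, not taken from this patch:

import pandas as pd

df = pd.DataFrame({"a": [0]})
opts = {"key": "<access-key>", "secret": "<secret-key>"}  # hypothetical credentials
# Both engines now hand storage_options to fsspec when the path is a URL.
df.to_parquet("s3://my-bucket/test.parquet", engine="pyarrow", storage_options=opts)
back = pd.read_parquet("s3://my-bucket/test.parquet", engine="pyarrow", storage_options=opts)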
pandas/io/json/_json.py | 2 +- pandas/io/pickle.py | 3 +-- pandas/io/stata.py | 1 + 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index dba307c0fad95..4d8fb9270f5c9 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -5,14 +5,14 @@ import csv as csvlib from io import StringIO import os -from typing import Hashable, List, Mapping, Optional, Sequence, Union +from typing import Any, Dict, Hashable, List, Mapping, Optional, Sequence, Union import warnings from zipfile import ZipFile import numpy as np from pandas._libs import writers as libwriters -from pandas._typing import FilePathOrBuffer, Dict, Any +from pandas._typing import FilePathOrBuffer from pandas.core.dtypes.generic import ( ABCDatetimeIndex, diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 429edf323cb0e..ea37ec4c50a7c 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -3,7 +3,7 @@ from io import BytesIO, StringIO from itertools import islice import os -from typing import Any, Callable, Optional, Type, Dict +from typing import Any, Callable, Dict, Optional, Type import numpy as np diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index c845d898dfae9..3c55dd4fe043e 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -3,9 +3,8 @@ from typing import Any, Optional import warnings -from pandas._typing import FilePathOrBuffer, Dict +from pandas._typing import Dict, FilePathOrBuffer from pandas.compat import pickle_compat as pc - from pandas.io.common import get_filepath_or_buffer, get_handle diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 4f6a13ed867d0..39c7e37ae0f6e 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1076,6 +1076,7 @@ def __init__( if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") elif hasattr(path_or_buf, "read"): + assert not isinstance(path_or_buf, str) # appease typing # Copy to BytesIO, and ensure no encoding contents = path_or_buf.read() self.path_or_buf = BytesIO(contents) From f9e1e692da85a0e4d97a2bf8835bc75525b85eed Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 15:21:50 -0400 Subject: [PATCH 05/59] more tests --- pandas/core/generic.py | 11 +++++++++- pandas/io/json/_json.py | 4 +++- pandas/io/pickle.py | 4 ++-- pandas/io/stata.py | 4 ++-- pandas/tests/io/test_feather.py | 9 -------- pandas/tests/io/test_fsspec.py | 38 ++++++++++++++++++++++++++++++++- 6 files changed, 54 insertions(+), 16 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index a129787ad6adf..315ffc4a84fe3 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2042,6 +2042,7 @@ def to_json( compression: Optional[str] = "infer", index: bool_t = True, indent: Optional[int] = None, + storage_options: Optional[Dict[str, Any]] = None, ) -> Optional[str]: """ Convert the object to a JSON string. @@ -2303,6 +2304,7 @@ def to_json( compression=compression, index=index, indent=indent, + storage_options=storage_options, ) def to_hdf( @@ -2617,6 +2619,7 @@ def to_pickle( path, compression: Optional[str] = "infer", protocol: int = pickle.HIGHEST_PROTOCOL, + storage_options: Optional[Dict[str, Any]] = None, ) -> None: """ Pickle (serialize) object to file. 
@@ -2670,7 +2673,13 @@ def to_pickle( """ from pandas.io.pickle import to_pickle - to_pickle(self, path, compression=compression, protocol=protocol) + to_pickle( + self, + path, + compression=compression, + protocol=protocol, + storage_options=storage_options, + ) def to_clipboard( self, excel: bool_t = True, sep: Optional[str] = None, **kwargs diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index ea37ec4c50a7c..3f10c4694d10f 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -53,7 +53,7 @@ def to_json( ) if path_or_buf is not None: - path_or_buf, _, _, _ = get_filepath_or_buffer( + path_or_buf, _, _, should_close = get_filepath_or_buffer( path_or_buf, compression=compression, mode="w", @@ -101,6 +101,8 @@ def to_json( return s else: path_or_buf.write(s) + if should_close: + path_or_buf.close() class Writer: diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 3c55dd4fe043e..01fd357d03763 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,9 +1,9 @@ """ pickle compat """ import pickle -from typing import Any, Optional +from typing import Any, Dict, Optional import warnings -from pandas._typing import Dict, FilePathOrBuffer +from pandas._typing import FilePathOrBuffer from pandas.compat import pickle_compat as pc from pandas.io.common import get_filepath_or_buffer, get_handle diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 39c7e37ae0f6e..d3fc5f55ce7ac 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -35,7 +35,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import FilePathOrBuffer, Label +from pandas._typing import FilePathOrBuffer, IO, Label from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -1076,8 +1076,8 @@ def __init__( if isinstance(path_or_buf, (str, bytes)): self.path_or_buf = open(path_or_buf, "rb") elif hasattr(path_or_buf, "read"): - assert not isinstance(path_or_buf, str) # appease typing # Copy to BytesIO, and ensure no encoding + assert isinstance(path_or_buf, IO) contents = path_or_buf.read() self.path_or_buf = BytesIO(contents) diff --git a/pandas/tests/io/test_feather.py b/pandas/tests/io/test_feather.py index 2356e9c87ea05..a8a5c8f00e6bf 100644 --- a/pandas/tests/io/test_feather.py +++ b/pandas/tests/io/test_feather.py @@ -186,12 +186,3 @@ def test_http_path(self, feather_file): expected = pd.read_feather(feather_file) res = pd.read_feather(url) tm.assert_frame_equal(expected, res) - - -def test_fsspec_options(fsspectest): - df = pd.DataFrame({"a": [0]}) - df.to_feather("testmem://afile", storage_options={"test": "feather_write"}) - assert fsspectest.test[0] == "feather_write" - out = pd.read_feather("testmem://afile", storage_options={"test": "feather_read"}) - assert fsspectest.test[0] == "feather_read" - tm.assert_frame_equal(df, out) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 98a942e7af5fc..4289e1bf84461 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -1,7 +1,15 @@ import numpy as np import pytest -from pandas import DataFrame, date_range, read_csv, read_parquet +from pandas import ( + DataFrame, + date_range, + read_csv, + read_parquet, + read_feather, + read_pickle, + read_json, +) import pandas._testing as tm from pandas.util import _test_decorators as td @@ -148,3 +156,31 @@ def test_not_present_exception(): with pytest.raises(ImportError) as e: read_csv("memory://test/test.csv") assert "fsspec library is 
required" in str(e.value) + + +@td.skip_if_no("pyarrow") +def test_feather_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_feather("testmem://afile", storage_options={"test": "feather_write"}) + assert fsspectest.test[0] == "feather_write" + out = read_feather("testmem://afile", storage_options={"test": "feather_read"}) + assert fsspectest.test[0] == "feather_read" + tm.assert_frame_equal(df, out) + + +def test_pickle_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_pickle("testmem://afile", storage_options={"test": "pickle_write"}) + assert fsspectest.test[0] == "pickle_write" + out = read_pickle("testmem://afile", storage_options={"test": "pickle_read"}) + assert fsspectest.test[0] == "pickle_read" + tm.assert_frame_equal(df, out) + + +def test_json_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_json("testmem://afile", storage_options={"test": "json_write"}) + assert fsspectest.test[0] == "json_write" + out = read_json("testmem://afile", storage_options={"test": "json_read"}) + assert fsspectest.test[0] == "json_read" + tm.assert_frame_equal(df, out) From 7f69afea9835f187d70c0f38913822c653d680e3 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 15:31:03 -0400 Subject: [PATCH 06/59] blank line --- pandas/io/pickle.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 01fd357d03763..44b15391da6a4 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -5,6 +5,7 @@ from pandas._typing import FilePathOrBuffer from pandas.compat import pickle_compat as pc + from pandas.io.common import get_filepath_or_buffer, get_handle From cc0e4c30625caf4a4ed120482900496588c80e81 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 15:58:21 -0400 Subject: [PATCH 07/59] attempt relint --- pandas/io/stata.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index d3fc5f55ce7ac..5508bb759c9b2 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -35,7 +35,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import FilePathOrBuffer, IO, Label +from pandas._typing import IO, FilePathOrBuffer, Label from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( @@ -1077,8 +1077,8 @@ def __init__( self.path_or_buf = open(path_or_buf, "rb") elif hasattr(path_or_buf, "read"): # Copy to BytesIO, and ensure no encoding - assert isinstance(path_or_buf, IO) - contents = path_or_buf.read() + pb: Any = path_or_buf + contents = pb.read() self.path_or_buf = BytesIO(contents) self._read_header() From e356e9351d93d637b7c6058a96147442d6a8f2ec Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 16:44:06 -0400 Subject: [PATCH 08/59] unused import --- pandas/io/stata.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 5508bb759c9b2..feb76e6c37bda 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -35,7 +35,7 @@ from pandas._libs.lib import infer_dtype from pandas._libs.writers import max_len_string_array -from pandas._typing import IO, FilePathOrBuffer, Label +from pandas._typing import FilePathOrBuffer, Label from pandas.util._decorators import Appender from pandas.core.dtypes.common import ( From c7170dd8adac757cd997e16ef4cd21cbcced88be Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 22 Jul 2020 17:18:48 -0400 Subject: [PATCH 09/59] more order --- 
pandas/tests/io/test_fsspec.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 4289e1bf84461..4b3f5b87e6583 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -5,10 +5,10 @@ DataFrame, date_range, read_csv, - read_parquet, read_feather, - read_pickle, read_json, + read_parquet, + read_pickle, ) import pandas._testing as tm from pandas.util import _test_decorators as td @@ -45,8 +45,8 @@ def test_read_csv(cleared_fs): def test_reasonable_error(monkeypatch, cleared_fs): - from fsspec.registry import known_implementations from fsspec import registry + from fsspec.registry import known_implementations registry.target.clear() with pytest.raises(ValueError) as e: From b96778dcc69871d33252fd3c9d10c34f5836f4ef Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 11:23:51 -0400 Subject: [PATCH 10/59] plumb stata and test --- pandas/core/frame.py | 2 ++ pandas/io/stata.py | 19 ++++++++++++++++--- pandas/tests/io/test_fsspec.py | 12 ++++++++++++ 3 files changed, 30 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index d3f07c0ec3ff2..344f35b01bce7 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2055,6 +2055,7 @@ def to_stata( version: Optional[int] = 114, convert_strl: Optional[Sequence[Label]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: Optional[Dict[str, Any]] = None, ) -> None: """ Export DataFrame object to Stata dta format. @@ -2187,6 +2188,7 @@ def to_stata( write_index=write_index, variable_labels=variable_labels, compression=compression, + storage_options=storage_options, **kwargs, ) writer.write_file() diff --git a/pandas/io/stata.py b/pandas/io/stata.py index feb76e6c37bda..cc59343fc0534 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1911,6 +1911,7 @@ def read_stata( order_categoricals: bool = True, chunksize: Optional[int] = None, iterator: bool = False, + storage_options: Optional[Dict[str, Any]] = None, ) -> Union[DataFrame, StataReader]: reader = StataReader( @@ -1923,6 +1924,7 @@ def read_stata( columns=columns, order_categoricals=order_categoricals, chunksize=chunksize, + storage_options=storage_options, ) if iterator or chunksize: @@ -1936,7 +1938,9 @@ def read_stata( def _open_file_binary_write( - fname: FilePathOrBuffer, compression: Union[str, Mapping[str, str], None], + fname: FilePathOrBuffer, + compression: Union[str, Mapping[str, str], None], + storage_options: Optional[Dict[str, Any]] = None, ) -> Tuple[BinaryIO, bool, Optional[Union[str, Mapping[str, str]]]]: """ Open a binary file or no-op if file-like. 
@@ -1963,7 +1967,10 @@ def _open_file_binary_write( compression_typ, compression_args = get_compression_method(compression) compression_typ = infer_compression(fname, compression_typ) path_or_buf, _, compression_typ, _ = get_filepath_or_buffer( - fname, compression=compression_typ + fname, + mode="wb", + compression=compression_typ, + storage_options=storage_options, ) if compression_typ is not None: compression = compression_args @@ -2209,6 +2216,7 @@ def __init__( data_label: Optional[str] = None, variable_labels: Optional[Dict[Label, str]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: Optional[Dict[str, Any]] = None, ): super().__init__() self._convert_dates = {} if convert_dates is None else convert_dates @@ -2221,6 +2229,7 @@ def __init__( self._output_file: Optional[BinaryIO] = None # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) + self.storage_options = storage_options if byteorder is None: byteorder = sys.byteorder @@ -2507,7 +2516,7 @@ def _encode_strings(self) -> None: def write_file(self) -> None: self._file, self._own_file, compression = _open_file_binary_write( - self._fname, self._compression + self._fname, self._compression, storage_options=self.storage_options ) if compression is not None: self._output_file = self._file @@ -3090,6 +3099,7 @@ def __init__( variable_labels: Optional[Dict[Label, str]] = None, convert_strl: Optional[Sequence[Label]] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: Optional[Dict[str, Any]] = None, ): # Copy to new list since convert_strl might be modified later self._convert_strl: List[Label] = [] @@ -3106,6 +3116,7 @@ def __init__( data_label=data_label, variable_labels=variable_labels, compression=compression, + storage_options=storage_options, ) self._map: Dict[str, int] = {} self._strl_blob = b"" @@ -3493,6 +3504,7 @@ def __init__( convert_strl: Optional[Sequence[Label]] = None, version: Optional[int] = None, compression: Union[str, Mapping[str, str], None] = "infer", + storage_options: Optional[Dict[str, Any]] = None, ): if version is None: version = 118 if data.shape[1] <= 32767 else 119 @@ -3515,6 +3527,7 @@ def __init__( variable_labels=variable_labels, convert_strl=convert_strl, compression=compression, + storage_options=storage_options, ) # Override version set in StataWriter117 init self._dta_version = version diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 4b3f5b87e6583..86288255fc566 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -9,6 +9,7 @@ read_json, read_parquet, read_pickle, + read_stata, ) import pandas._testing as tm from pandas.util import _test_decorators as td @@ -184,3 +185,14 @@ def test_json_options(fsspectest): out = read_json("testmem://afile", storage_options={"test": "json_read"}) assert fsspectest.test[0] == "json_read" tm.assert_frame_equal(df, out) + + +def test_stata_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_stata( + "testmem://afile", storage_options={"test": "stata_write"}, write_index=False + ) + assert fsspectest.test[0] == "stata_write" + out = read_stata("testmem://afile", storage_options={"test": "stata_read"}) + assert fsspectest.test[0] == "stata_read" + tm.assert_frame_equal(df, out.astype("int64")) From 1dc41b1acaa48bb56dcbd3a25d80b3cb740985c5 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 11:50:21 -0400 Subject: [PATCH 11/59] Add note about storage_options in whatsnew --- 
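With the Stata plumbing above, to_stata/read_stata accept the same keyword; a rough sketch assuming an S3 destination and hypothetical credentials:

import pandas as pd

df = pd.DataFrame({"a": [0]})
opts = {"key": "<access-key>", "secret": "<secret-key>"}  # illustrative s3fs options
# The options travel through _open_file_binary_write / get_filepath_or_buffer.
df.to_stata("s3://my-bucket/afile.dta", write_index=False, storage_options=opts)
out = pd.read_stata("s3://my-bucket/afile.dta", storage_options=opts)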
doc/source/whatsnew/v1.1.0.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 55e2a810e6fc3..72687608f64b6 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -265,6 +265,12 @@ SSH, FTP, dropbox and github. For docs and capabilities, see the `fsspec docs`_. The existing capability to interface with S3 and GCS will be unaffected by this change, as ``fsspec`` will still bring in the same packages as before. +Many read/write functions have acquired the `storage_options` optional argument, +to pass a dictionary of parameters to the storage backend. This allows, for +example, for passing credentials to S3 and GCS storage. The details of what +parameters can be passed to which backends can be found in the documentation +of the individual storage backends. + .. _Azure Datalake and Blob: https://github.com/dask/adlfs .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/ From d8829841586a4a3433d5d85aad12f84b99b6df82 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 12:00:24 -0400 Subject: [PATCH 12/59] Plumb and test markdown --- pandas/core/frame.py | 9 ++++++--- pandas/tests/io/test_fsspec.py | 7 +++++++ 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 344f35b01bce7..4354979fcfe3c 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2241,8 +2241,8 @@ def to_feather(self, path, **kwargs) -> None: ) def to_markdown( self, - buf: Optional[IO[str]] = None, - mode: Optional[str] = None, + buf: Optional[Union[IO[str], str]] = None, + mode: Optional[str] = "wt", index: bool = True, storage_options: Optional[Dict[str, Any]] = None, **kwargs, @@ -2262,11 +2262,14 @@ def to_markdown( result = tabulate.tabulate(self, **kwargs) if buf is None: return result - buf, _, _, _ = get_filepath_or_buffer( + buf, _, _, should_close = get_filepath_or_buffer( buf, mode=mode, storage_options=storage_options ) assert buf is not None # Help mypy. 
+ assert not isinstance(buf, str) buf.writelines(result) + if should_close: + buf.close() return None @deprecate_kwarg(old_arg_name="fname", new_arg_name="path") diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 86288255fc566..58b8332920da2 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -196,3 +196,10 @@ def test_stata_options(fsspectest): out = read_stata("testmem://afile", storage_options={"test": "stata_read"}) assert fsspectest.test[0] == "stata_read" tm.assert_frame_equal(df, out.astype("int64")) + + +def test_markdown_options(fsspectest): + df = DataFrame({"a": [0]}) + df.to_markdown("testmem://afile", storage_options={"test": "md_write"}) + assert fsspectest.test[0] == "md_write" + assert fsspectest.cat("afile") From f1e455dbd78d129dbf40fdd9dd155ccd1e012831 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 13:37:57 -0400 Subject: [PATCH 13/59] optional markdown --- pandas/tests/io/test_fsspec.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 58b8332920da2..09d82244ca13f 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -198,6 +198,7 @@ def test_stata_options(fsspectest): tm.assert_frame_equal(df, out.astype("int64")) +@td.skip_if_no("tabulate") def test_markdown_options(fsspectest): df = DataFrame({"a": [0]}) df.to_markdown("testmem://afile", storage_options={"test": "md_write"}) From c88b75f2e3038a4115570be775db7d0fa6114309 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 13:52:42 -0400 Subject: [PATCH 14/59] remove extraneous --- pandas/io/excel/_base.py | 6 ++---- pandas/io/sas/sas7bdat.py | 5 +---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 962d974e353e1..2a12f779230b2 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -336,14 +336,12 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer, storage_options=None): + def __init__(self, filepath_or_buffer): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer( - filepath_or_buffer, storage_options=storage_options - ) + filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer diff --git a/pandas/io/sas/sas7bdat.py b/pandas/io/sas/sas7bdat.py index 32ef0b20f8a2a..3d9be7c15726b 100644 --- a/pandas/io/sas/sas7bdat.py +++ b/pandas/io/sas/sas7bdat.py @@ -110,7 +110,6 @@ def __init__( encoding=None, convert_text=True, convert_header_text=True, - storage_options=None, ): self.index = index @@ -138,9 +137,7 @@ def __init__( self._current_row_on_page_index = 0 self._current_row_in_file_index = 0 - self._path_or_buf, _, _, _ = get_filepath_or_buffer( - path_or_buf, storage_options=storage_options - ) + self._path_or_buf, _, _, _ = get_filepath_or_buffer(path_or_buf) if isinstance(self._path_or_buf, str): self._path_or_buf = open(self._path_or_buf, "rb") self.handle = self._path_or_buf From 58481a4bef9ff2abf7ade0b9022929fce2591f4b Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 23 Jul 2020 13:54:35 -0400 Subject: [PATCH 15/59] more extraneous --- 
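to_markdown gains the same keyword (it still needs the optional tabulate dependency, as the test skip above notes); a minimal sketch with an illustrative S3 URL and credentials:

import pandas as pd

df = pd.DataFrame({"a": [0]})
# With buf=None the markdown string is returned; with an fsspec URL it is
# written out via get_filepath_or_buffer, which receives storage_options.
df.to_markdown(
    "s3://my-bucket/afile.md",
    storage_options={"key": "<access-key>", "secret": "<secret-key>"},
)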
pandas/io/sas/sas_xport.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/pandas/io/sas/sas_xport.py b/pandas/io/sas/sas_xport.py index a768589ae6f9a..6cf248b748107 100644 --- a/pandas/io/sas/sas_xport.py +++ b/pandas/io/sas/sas_xport.py @@ -244,12 +244,7 @@ class XportReader(ReaderBase, abc.Iterator): __doc__ = _xport_reader_doc def __init__( - self, - filepath_or_buffer, - index=None, - encoding="ISO-8859-1", - chunksize=None, - storage_options=None, + self, filepath_or_buffer, index=None, encoding="ISO-8859-1", chunksize=None, ): self._encoding = encoding @@ -263,9 +258,7 @@ def __init__( encoding, compression, should_close, - ) = get_filepath_or_buffer( - filepath_or_buffer, encoding=encoding, storage_options=storage_options - ) + ) = get_filepath_or_buffer(filepath_or_buffer, encoding=encoding) if isinstance(filepath_or_buffer, (str, bytes)): self.filepath_or_buffer = open(filepath_or_buffer, "rb") From 704770b3598b0c5dc6ecac6a984ea514ac9682d1 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 24 Jul 2020 13:33:22 -0400 Subject: [PATCH 16/59] Add fsspec options error and docstrings --- pandas/core/frame.py | 22 +++++++++++++++++++++- pandas/core/generic.py | 28 ++++++++++++++++++++++++++++ pandas/io/common.py | 20 ++++++++++++++++---- pandas/io/feather_format.py | 10 ++++++++++ pandas/io/json/_json.py | 10 ++++++++++ pandas/io/parquet.py | 14 +++++++++++++- pandas/io/pickle.py | 20 ++++++++++++++++++++ pandas/io/stata.py | 20 ++++++++++++++++++++ pandas/tests/io/test_fsspec.py | 13 +++++++++++++ 9 files changed, 151 insertions(+), 6 deletions(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4354979fcfe3c..f089bdfea3a4a 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2132,6 +2132,16 @@ def to_stata( .. versionadded:: 1.1.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.1.0 + Raises ------ NotImplementedError @@ -2242,7 +2252,7 @@ def to_feather(self, path, **kwargs) -> None: def to_markdown( self, buf: Optional[Union[IO[str], str]] = None, - mode: Optional[str] = "wt", + mode: str = "wt", index: bool = True, storage_options: Optional[Dict[str, Any]] = None, **kwargs, @@ -2329,6 +2339,16 @@ def to_parquet( .. versionadded:: 0.24.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.1.0 + **kwargs Additional arguments passed to the parquet library. See :ref:`pandas io ` for more details. diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 315ffc4a84fe3..f4854ece0dea4 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2126,6 +2126,16 @@ def to_json( .. versionadded:: 1.0.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. 
+ host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.1.0 + Returns ------- None or str @@ -2640,6 +2650,16 @@ def to_pickle( .. [1] https://docs.python.org/3/library/pickle.html. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.1.0 + See Also -------- read_pickle : Load pickled pandas object (or any object) from file. @@ -3119,6 +3139,14 @@ def to_csv( See the errors argument for :func:`open` for a full list of options. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + .. versionadded:: 1.1.0 Returns diff --git a/pandas/io/common.py b/pandas/io/common.py index 56041481a9d34..152d09f569371 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -167,8 +167,16 @@ def get_filepath_or_buffer( compression : {{'gzip', 'bz2', 'zip', 'xz', None}}, optional encoding : the encoding to use to decode bytes, default is 'utf-8' mode : str, optional - storage_options: dict, optional - passed on to fsspec, if using it; this is not yet accessed by the public API + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.1.0 Returns ------- @@ -181,7 +189,9 @@ def get_filepath_or_buffer( if isinstance(filepath_or_buffer, str) and is_url(filepath_or_buffer): # TODO: fsspec can also handle HTTP via requests, but leaving this unchanged if storage_options: - raise ValueError("storage_options passed with non-fsspec URL") + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) req = urlopen(filepath_or_buffer) content_encoding = req.headers.get("Content-Encoding", None) if content_encoding == "gzip": @@ -237,7 +247,9 @@ def get_filepath_or_buffer( return file_obj, encoding, compression, True elif storage_options: - raise ValueError("storage_options passed with non-fsspec URL") + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) if isinstance(filepath_or_buffer, (str, bytes, mmap.mmap)): return _expand_user(filepath_or_buffer), None, compression, False diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 80a3ee1496a2e..f6760bb3c772f 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -15,6 +15,16 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs): ---------- df : DataFrame path : string file path, or file-like object + + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.1.0 **kwargs : Additional keywords passed to `pyarrow.feather.write_feather`. diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 3f10c4694d10f..119ba47096df9 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -516,6 +516,16 @@ def read_json( .. versionadded:: 1.1 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.1.0 + Returns ------- Series or DataFrame diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 7a66ca1b19587..6f0c6caaeb385 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -110,7 +110,9 @@ def write( kwargs["filesystem"] = fs else: if storage_options: - raise ValueError("storage_options passed with non-fsspec URL") + raise ValueError( + "storage_options passed with file object or non-fsspec file path" + ) path = _expand_user(path) if partition_cols is not None: # writes to multiple files under the given path @@ -282,6 +284,16 @@ def to_parquet( .. versionadded:: 0.24.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.1.0 + kwargs Additional keyword arguments passed to the engine """ diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 44b15391da6a4..6e91389d567a8 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -43,6 +43,16 @@ def to_pickle( protocol parameter is equivalent to setting its value to HIGHEST_PROTOCOL. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.1.0 + .. [1] https://docs.python.org/3/library/pickle.html See Also @@ -127,6 +137,16 @@ def read_pickle( compression) If 'infer' and 'path_or_url' is not path-like, then use None (= no decompression). + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.1.0 + Returns ------- unpickled : same type as object stored in file diff --git a/pandas/io/stata.py b/pandas/io/stata.py index cc59343fc0534..7bb83a07e5f16 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1952,6 +1952,16 @@ def _open_file_binary_write( compression : {str, dict, None} The compression method to use. + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.1.0 + Returns ------- file : file-like object @@ -2167,6 +2177,16 @@ class StataWriter(StataParser): .. versionadded:: 1.1.0 + storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.1.0 + Returns ------- writer : StataWriter instance diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 09d82244ca13f..4282884885f4d 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -1,3 +1,4 @@ +import io import numpy as np import pytest @@ -204,3 +205,15 @@ def test_markdown_options(fsspectest): df.to_markdown("testmem://afile", storage_options={"test": "md_write"}) assert fsspectest.test[0] == "md_write" assert fsspectest.cat("afile") + + +def test_non_fsspec_options(): + with pytest.raises(ValueError, match="storage_options"): + read_csv("localfile", storage_options={"a": True}) + with pytest.raises(ValueError, match="storage_options"): + # separate test for parquet, which has a different code path + read_parquet("localfile", storage_options={"a": True}) + by = io.BytesIO() + + with pytest.raises(ValueError, match="storage_options"): + read_csv(by, storage_options={"a": True}) From 1b8637e78dfb8041d523939bac93794872245a42 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 24 Jul 2020 14:04:08 -0400 Subject: [PATCH 17/59] fix that --- pandas/io/parquet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 6f0c6caaeb385..46fc18868b1ab 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -141,6 +141,8 @@ def read( fs, path = fsspec.core.url_to_fs(path, **(storage_options or {})) should_close = False else: + if storage_options: + raise ValueError("storage_options passed with buffer or non-fsspec filepath") fs = kwargs.pop("filesystem", None) should_close = False path = _expand_user(path) From bbcef17f0f6d68afe39487b43fb17bce4a8ca098 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 24 Jul 2020 14:04:21 -0400 Subject: [PATCH 18/59] black --- pandas/io/parquet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 46fc18868b1ab..1188574519a5c 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -142,7 +142,9 @@ def read( should_close = False else: if storage_options: - raise ValueError("storage_options passed with buffer or non-fsspec filepath") + raise ValueError( + "storage_options passed with buffer or non-fsspec filepath" + ) fs = kwargs.pop("filesystem", None) should_close = False path = _expand_user(path) From a18686cba287750e6aa260bbdaee8d09e89ae785 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 24 Jul 2020 14:32:51 -0400 Subject: [PATCH 19/59] fix it again --- pandas/tests/io/test_fsspec.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 4282884885f4d..c05aa8c0db4d4 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -207,6 +207,7 @@ def test_markdown_options(fsspectest): assert fsspectest.cat("afile") +@td.skip_if_no("pyarrow") def test_non_fsspec_options(): with pytest.raises(ValueError, match="storage_options"): read_csv("localfile", storage_options={"a": True}) From fa656cb14b9d92fbda14788247d0cad2a7c99a47 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 24 Jul 2020 15:08:10 -0400 Subject: [PATCH 20/59] more lint --- pandas/core/series.py | 2 +- pandas/tests/io/test_fsspec.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/core/series.py b/pandas/core/series.py index ef3be854bc3bb..b6755364bf5e6 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1421,7 +1421,7 @@ def to_string( def 
to_markdown( self, buf: Optional[IO[str]] = None, - mode: Optional[str] = None, + mode: str = "wt", index: bool = True, **kwargs, ) -> Optional[str]: diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index c05aa8c0db4d4..577822dd90bae 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -1,4 +1,5 @@ import io + import numpy as np import pytest From a79a274be7b8c70048a035c089a6445bff2b1978 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 27 Jul 2020 11:12:00 -0400 Subject: [PATCH 21/59] Requested changes --- doc/source/whatsnew/v1.1.0.rst | 2 +- pandas/core/series.py | 2 +- pandas/io/json/_json.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index 788a1465bde04..ef6a5ad92904e 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -269,7 +269,7 @@ Many read/write functions have acquired the `storage_options` optional argument, to pass a dictionary of parameters to the storage backend. This allows, for example, for passing credentials to S3 and GCS storage. The details of what parameters can be passed to which backends can be found in the documentation -of the individual storage backends. +of the individual storage backends (linked from the fsspec docs). .. _Azure Datalake and Blob: https://github.com/dask/adlfs diff --git a/pandas/core/series.py b/pandas/core/series.py index b6755364bf5e6..2d7a571335510 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -1435,7 +1435,7 @@ def to_markdown( buf : str, Path or StringIO-like, optional, default None Buffer to write to. If None, the output is returned as a string. mode : str, optional - Mode in which file is opened. + Mode in which file is opened, "wt" by default. index : bool, optional, default True Add index (row) labels. 
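A minimal sketch of the intended usage once these patches land, assuming an existing S3 bucket and the optional ``tabulate`` dependency; the bucket name and credential values below are placeholders, and the dict is forwarded as-is to ``fsspec``/``s3fs``:

```python
import pandas as pd

df = pd.DataFrame({"a": [1, 2, 3]})

# storage_options is handed through get_filepath_or_buffer to fsspec, so any
# s3fs keyword (here: placeholder key/secret) can be supplied per call.
df.to_markdown(
    "s3://my-bucket/table.md",
    storage_options={"key": "<access-key>", "secret": "<secret-key>"},
)

# Supplying storage_options with a local path or an open file object raises
# ValueError, per the check added to get_filepath_or_buffer above.
```
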
diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 119ba47096df9..22ff5957e5413 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -56,7 +56,7 @@ def to_json( path_or_buf, _, _, should_close = get_filepath_or_buffer( path_or_buf, compression=compression, - mode="w", + mode="wt", storage_options=storage_options, ) From 28d6d38f0a12f8a6dd2e37bf1eba67dbe5a7a2eb Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 28 Jul 2020 10:59:15 -0400 Subject: [PATCH 22/59] Make moto server process instead of monkey --- pandas/tests/io/conftest.py | 26 +++++++++++--- pandas/tests/io/json/test_compression.py | 4 ++- pandas/tests/io/json/test_pandas.py | 4 ++- pandas/tests/io/parser/test_network.py | 46 ++++++++++++++++-------- pandas/tests/io/test_fsspec.py | 15 ++++---- pandas/tests/io/test_parquet.py | 3 ++ 6 files changed, 72 insertions(+), 26 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index fcee25c258efa..0729131561113 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -72,12 +72,29 @@ def add_tips_files(bucket_name): conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) try: - s3 = moto.mock_s3() - s3.start() + import shlex + import subprocess + import requests + import time + + endpoint_uri = 'http://127.0.0.1:5555/' + + proc = subprocess.Popen(shlex.split("moto_server s3 -p 5555")) + + timeout = 5 + while timeout > 0: + try: + r = requests.get(endpoint_uri) + if r.ok: + break + except Exception: + pass + timeout -= 0.1 + time.sleep(0.1) # see gh-16135 bucket = "pandas-test" - conn = boto3.resource("s3", region_name="us-east-1") + conn = boto3.resource("s3", endpoint_url=endpoint_uri) conn.create_bucket(Bucket=bucket) add_tips_files(bucket) @@ -87,4 +104,5 @@ def add_tips_files(bucket_name): s3fs.S3FileSystem.clear_instance_cache() yield conn finally: - s3.stop() + proc.terminate() + proc.wait() diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 182c21ed1d416..29beb654ae626 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -44,7 +44,9 @@ def test_with_s3_url(compression, s3_resource): with open(path, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) - roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression) + roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression, + storage_options=dict(client_kwargs={'endpoint_url': 'http://127.0.0.1:5555/'}) + ) tm.assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index c4db0170ecc90..0c8a88cfa541e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1703,7 +1703,9 @@ def test_to_s3(self, s3_resource): # GH 28375 mock_bucket_name, target_file = "pandas-test", "test.json" df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) - df.to_json(f"s3://{mock_bucket_name}/{target_file}") + df.to_json(f"s3://{mock_bucket_name}/{target_file}", + storage_options=dict(client_kwargs={'endpoint_url': 'http://127.0.0.1:5555/'}) +) timeout = 5 while True: if target_file in ( diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 509ae89909699..aa7d6d49a6a78 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -52,6 +52,9 @@ def tips_df(datapath): return read_csv(datapath("io", "data", 
"csv", "tips.csv")) +s3so = dict(client_kwargs={'endpoint_url': 'http://127.0.0.1:5555/'}) + + @pytest.mark.usefixtures("s3_resource") @td.skip_if_not_us_locale() class TestS3: @@ -61,13 +64,15 @@ def test_parse_public_s3_bucket(self, tips_df): # more of an integration test due to the not-public contents portion # can probably mock this though. for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp) + df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp, + storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents - df = read_csv("s3://cant_get_it/tips.csv") + df = read_csv("s3://cant_get_it/tips.csv", + storage_options=s3fs) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) @@ -75,21 +80,24 @@ def test_parse_public_s3_bucket(self, tips_df): def test_parse_public_s3n_bucket(self, tips_df): # Read from AWS s3 as "s3n" URL - df = read_csv("s3n://pandas-test/tips.csv", nrows=10) + df = read_csv("s3n://pandas-test/tips.csv", nrows=10, + storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) def test_parse_public_s3a_bucket(self, tips_df): # Read from AWS s3 as "s3a" URL - df = read_csv("s3a://pandas-test/tips.csv", nrows=10) + df = read_csv("s3a://pandas-test/tips.csv", nrows=10, + storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) def test_parse_public_s3_bucket_nrows(self, tips_df): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp) + df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp, + storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) @@ -99,7 +107,8 @@ def test_parse_public_s3_bucket_chunked(self, tips_df): chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df_reader = read_csv( - "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp + "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp, + storage_options=s3so ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -120,6 +129,7 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df): chunksize=chunksize, compression=comp, engine="python", + storage_options=s3so ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -133,7 +143,8 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df): def test_parse_public_s3_bucket_python(self, tips_df): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp + "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp, + storage_options=s3so ) assert isinstance(df, DataFrame) assert not df.empty @@ -142,7 +153,8 @@ def test_parse_public_s3_bucket_python(self, tips_df): def test_infer_s3_compression(self, tips_df): for ext in ["", ".gz", ".bz2"]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer" + "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer", + storage_options=s3so ) assert isinstance(df, DataFrame) assert not df.empty @@ -155,6 +167,7 @@ def 
test_parse_public_s3_bucket_nrows_python(self, tips_df): engine="python", nrows=10, compression=comp, + storage_options=s3so ) assert isinstance(df, DataFrame) assert not df.empty @@ -162,7 +175,7 @@ def test_parse_public_s3_bucket_nrows_python(self, tips_df): def test_read_s3_fails(self): with pytest.raises(IOError): - read_csv("s3://nyqpug/asdf.csv") + read_csv("s3://nyqpug/asdf.csv", storage_options=s3so) # Receive a permission error when trying to read a private bucket. # It's irrelevant here that this isn't actually a table. @@ -180,7 +193,8 @@ def test_write_s3_csv_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_csv("s3://an_s3_bucket_data_doesnt_exit/not_real.csv") + tips_df.to_csv("s3://an_s3_bucket_data_doesnt_exit/not_real.csv", + storage_options=s3so) @td.skip_if_no("pyarrow") def test_write_s3_parquet_fails(self, tips_df): @@ -194,7 +208,8 @@ def test_write_s3_parquet_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_parquet("s3://an_s3_bucket_data_doesnt_exit/not_real.parquet") + tips_df.to_parquet("s3://an_s3_bucket_data_doesnt_exit/not_real.parquet", + storage_options=s3so) def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): # see gh-16135 @@ -230,18 +245,21 @@ def test_read_csv_chunked_download(self, s3_resource, caplog): s3fs.S3FileSystem.clear_instance_cache() with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv("s3://pandas-test/large-file.csv", nrows=5) + read_csv("s3://pandas-test/large-file.csv", nrows=5, + storage_options=s3so) # log of fetch_range (start, stop) assert (0, 5505024) in (x.args[-2:] for x in caplog.records) def test_read_s3_with_hash_in_key(self, tips_df): # GH 25945 - result = read_csv("s3://pandas-test/tips#1.csv") + result = read_csv("s3://pandas-test/tips#1.csv", + storage_options=s3so) tm.assert_frame_equal(tips_df, result) @td.skip_if_no("pyarrow") def test_read_feather_s3_file_path(self, feather_file): # GH 29055 expected = read_feather(feather_file) - res = read_feather("s3://pandas-test/simple_dataset.feather") + res = read_feather("s3://pandas-test/simple_dataset.feather", + storage_options=s3so) tm.assert_frame_equal(expected, res) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 577822dd90bae..0350fc0e1fade 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -27,6 +27,7 @@ # the ignore on the following line accounts for to_csv returning Optional(str) # in general, but always str in the case we give no filename text = df1.to_csv(index=False).encode() # type: ignore +s3so = dict(client_kwargs={'endpoint_url': 'http://127.0.0.1:5555/'}) @pytest.fixture @@ -131,17 +132,18 @@ def test_fastparquet_options(fsspectest): @td.skip_if_no("s3fs") def test_from_s3_csv(s3_resource, tips_file): - tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file)) + tm.assert_equal(read_csv("s3://pandas-test/tips.csv", storage_options=s3so), read_csv(tips_file)) # the following are decompressed by pandas, not fsspec - tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz"), read_csv(tips_file)) - tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2"), read_csv(tips_file)) + tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz", storage_options=s3so), read_csv(tips_file)) + 
tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2", storage_options=s3so), read_csv(tips_file)) @pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"]) @td.skip_if_no("s3fs") def test_s3_protocols(s3_resource, tips_file, protocol): tm.assert_equal( - read_csv("%s://pandas-test/tips.csv" % protocol), read_csv(tips_file) + read_csv("%s://pandas-test/tips.csv" % protocol, + storage_options=s3so), read_csv(tips_file) ) @@ -149,8 +151,9 @@ def test_s3_protocols(s3_resource, tips_file, protocol): @td.skip_if_no("fastparquet") def test_s3_parquet(s3_resource): fn = "s3://pandas-test/test.parquet" - df1.to_parquet(fn, index=False, engine="fastparquet", compression=None) - df2 = read_parquet(fn, engine="fastparquet") + df1.to_parquet(fn, index=False, engine="fastparquet", compression=None, + storage_options=s3so) + df2 = read_parquet(fn, engine="fastparquet", storage_options=s3so) tm.assert_equal(df1, df2) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 82157f3d722a9..61f6fde713f56 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -158,6 +158,9 @@ def check_round_trip( """ write_kwargs = write_kwargs or {"compression": None} read_kwargs = read_kwargs or {} + s3so = dict(client_kwargs={'endpoint_url': 'http://127.0.0.1:5555/'}) + read_kwargs['storage_options'] = s3so + write_kwargs['storage_options'] = s3so if expected is None: expected = df From e99f8ed37f977403e5bfae32efc8d3e4fde5f444 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 29 Jul 2020 10:06:30 -0400 Subject: [PATCH 23/59] Update versions --- doc/source/whatsnew/v1.1.0.rst | 6 ------ doc/source/whatsnew/v1.2.0.rst | 12 +++++++++++- pandas/core/frame.py | 4 ++-- pandas/core/generic.py | 8 +++++--- pandas/io/common.py | 2 +- pandas/io/feather_format.py | 3 ++- pandas/io/json/_json.py | 2 +- pandas/io/parquet.py | 2 +- pandas/io/pickle.py | 4 ++-- pandas/io/stata.py | 4 ++-- 10 files changed, 27 insertions(+), 20 deletions(-) diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst index bf8b09839f442..a49b29d691692 100644 --- a/doc/source/whatsnew/v1.1.0.rst +++ b/doc/source/whatsnew/v1.1.0.rst @@ -265,12 +265,6 @@ SSH, FTP, dropbox and github. For docs and capabilities, see the `fsspec docs`_. The existing capability to interface with S3 and GCS will be unaffected by this change, as ``fsspec`` will still bring in the same packages as before. -Many read/write functions have acquired the `storage_options` optional argument, -to pass a dictionary of parameters to the storage backend. This allows, for -example, for passing credentials to S3 and GCS storage. The details of what -parameters can be passed to which backends can be found in the documentation -of the individual storage backends (linked from the fsspec docs). - .. _Azure Datalake and Blob: https://github.com/dask/adlfs .. _fsspec docs: https://filesystem-spec.readthedocs.io/en/latest/ diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 2066858e5de86..d7116df8904f3 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -13,6 +13,16 @@ including other versions of pandas. Enhancements ~~~~~~~~~~~~ +Passing arguments to fsspec backends +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Many read/write functions have acquired the `storage_options` optional argument, +to pass a dictionary of parameters to the storage backend. This allows, for +example, for passing credentials to S3 and GCS storage. 
The details of what +parameters can be passed to which backends can be found in the documentation +of the individual storage backends (linked from the fsspec docs). + + .. _whatsnew_120.enhancements.other: Other enhancements @@ -165,4 +175,4 @@ Other .. _whatsnew_120.contributors: Contributors -~~~~~~~~~~~~ \ No newline at end of file +~~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index ddd2f7956e699..987e89693c1bb 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2140,7 +2140,7 @@ def to_stata( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 Raises ------ @@ -2347,7 +2347,7 @@ def to_parquet( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 **kwargs Additional arguments passed to the parquet library. See diff --git a/pandas/core/generic.py b/pandas/core/generic.py index f4854ece0dea4..544477cc10a59 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -2134,7 +2134,7 @@ def to_json( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 Returns ------- @@ -2658,7 +2658,7 @@ def to_pickle( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 See Also -------- @@ -3139,6 +3139,8 @@ def to_csv( See the errors argument for :func:`open` for a full list of options. + .. versionadded:: 1.1.0 + storage_options : dict, optional Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc., if using a URL that will @@ -3147,7 +3149,7 @@ def to_csv( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 Returns ------- diff --git a/pandas/io/common.py b/pandas/io/common.py index 880a4f01be175..7387867603b12 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -176,7 +176,7 @@ def get_filepath_or_buffer( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 Returns ------- diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index f6760bb3c772f..2c664e73b9463 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -24,7 +24,8 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs): a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 + **kwargs : Additional keywords passed to `pyarrow.feather.write_feather`. diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py index 22ff5957e5413..080d6ced677b2 100644 --- a/pandas/io/json/_json.py +++ b/pandas/io/json/_json.py @@ -524,7 +524,7 @@ def read_json( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. 
versionadded:: 1.2.0 Returns ------- diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 1188574519a5c..1101261f25184 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -296,7 +296,7 @@ def to_parquet( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 kwargs Additional keyword arguments passed to the engine diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 6e91389d567a8..3aecc9c91d189 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -51,7 +51,7 @@ def to_pickle( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 .. [1] https://docs.python.org/3/library/pickle.html @@ -145,7 +145,7 @@ def read_pickle( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 Returns ------- diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 7bb83a07e5f16..ef5c9b7e1ab83 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1960,7 +1960,7 @@ def _open_file_binary_write( a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 Returns ------- @@ -2185,7 +2185,7 @@ class StataWriter(StataParser): a file-like buffer. See the fsspec and backend storage implementation docs for the set of allowed keys and values - .. versionadded:: 1.1.0 + .. versionadded:: 1.2.0 Returns ------- From 0a2fc29e67d61ea2d54cdb18df0d0bff2c8449bc Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 29 Jul 2020 10:09:56 -0400 Subject: [PATCH 24/59] black and start excel --- pandas/tests/io/conftest.py | 2 +- pandas/tests/io/excel/test_readers.py | 8 ++- pandas/tests/io/json/test_compression.py | 6 +- pandas/tests/io/json/test_pandas.py | 9 ++- pandas/tests/io/parser/test_network.py | 70 ++++++++++++++---------- pandas/tests/io/test_fsspec.py | 25 ++++++--- pandas/tests/io/test_parquet.py | 6 +- 7 files changed, 78 insertions(+), 48 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 0729131561113..78fbc763ffcc6 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -77,7 +77,7 @@ def add_tips_files(bucket_name): import requests import time - endpoint_uri = 'http://127.0.0.1:5555/' + endpoint_uri = "http://127.0.0.1:5555/" proc = subprocess.Popen(shlex.split("moto_server s3 -p 5555")) diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index b610c5ec3a838..9fe3c55eec675 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -630,10 +630,14 @@ def test_read_from_http_url(self, read_ext): def test_read_from_s3_url(self, read_ext, s3_resource): # Bucket "pandas-test" created in tests/io/conftest.py with open("test1" + read_ext, "rb") as f: - s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) + s3_resource.Bucket( + "pandas-test", endpoint_url="http://127.0.0.1:5555/" + ).put_object(Key="test1" + read_ext, Body=f) url = "s3://pandas-test/test1" + read_ext - url_table = pd.read_excel(url) + s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + + url_table = pd.read_excel(url, storage_options=s3so) local_table = pd.read_excel("test1" + read_ext) 
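The moto-server pattern the reworked tests rely on, sketched end to end under the assumption that ``moto``, ``flask``, ``boto3`` and ``s3fs`` are installed and port 5555 is free (the real fixture polls the endpoint rather than sleeping):

```python
import os
import shlex
import subprocess
import time

import boto3
import pandas as pd

# moto accepts any credentials, but boto3/s3fs need some values to be set.
os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key")
os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret")

endpoint = "http://127.0.0.1:5555/"
proc = subprocess.Popen(shlex.split("moto_server s3 -p 5555"))
time.sleep(1)  # crude stand-in for the fixture's readiness polling loop

# Everything below talks to the local mock, not real AWS: boto3 via
# endpoint_url, and pandas/s3fs via the same value inside storage_options.
s3so = {"client_kwargs": {"endpoint_url": endpoint}}
try:
    conn = boto3.resource("s3", endpoint_url=endpoint)
    conn.create_bucket(Bucket="pandas-test")
    conn.Bucket("pandas-test").put_object(Key="tips.csv", Body=b"a,b\n1,2\n")
    df = pd.read_csv("s3://pandas-test/tips.csv", storage_options=s3so)
finally:
    proc.terminate()
    proc.wait()
```
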
tm.assert_frame_equal(url_table, local_table) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 29beb654ae626..5bb205842269e 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -44,8 +44,10 @@ def test_with_s3_url(compression, s3_resource): with open(path, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) - roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression, - storage_options=dict(client_kwargs={'endpoint_url': 'http://127.0.0.1:5555/'}) + roundtripped_df = pd.read_json( + "s3://pandas-test/test-1", + compression=compression, + storage_options=dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}), ) tm.assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 0c8a88cfa541e..0af76457ef24e 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1703,9 +1703,12 @@ def test_to_s3(self, s3_resource): # GH 28375 mock_bucket_name, target_file = "pandas-test", "test.json" df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) - df.to_json(f"s3://{mock_bucket_name}/{target_file}", - storage_options=dict(client_kwargs={'endpoint_url': 'http://127.0.0.1:5555/'}) -) + df.to_json( + f"s3://{mock_bucket_name}/{target_file}", + storage_options=dict( + client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"} + ), + ) timeout = 5 while True: if target_file in ( diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index aa7d6d49a6a78..6bd7c48c90aa9 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -52,7 +52,7 @@ def tips_df(datapath): return read_csv(datapath("io", "data", "csv", "tips.csv")) -s3so = dict(client_kwargs={'endpoint_url': 'http://127.0.0.1:5555/'}) +s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) @pytest.mark.usefixtures("s3_resource") @@ -64,15 +64,17 @@ def test_parse_public_s3_bucket(self, tips_df): # more of an integration test due to the not-public contents portion # can probably mock this though. 
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp, - storage_options=s3so) + df = read_csv( + "s3://pandas-test/tips.csv" + ext, + compression=comp, + storage_options=s3so, + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents - df = read_csv("s3://cant_get_it/tips.csv", - storage_options=s3fs) + df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3fs) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) @@ -80,24 +82,26 @@ def test_parse_public_s3_bucket(self, tips_df): def test_parse_public_s3n_bucket(self, tips_df): # Read from AWS s3 as "s3n" URL - df = read_csv("s3n://pandas-test/tips.csv", nrows=10, - storage_options=s3so) + df = read_csv("s3n://pandas-test/tips.csv", nrows=10, storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) def test_parse_public_s3a_bucket(self, tips_df): # Read from AWS s3 as "s3a" URL - df = read_csv("s3a://pandas-test/tips.csv", nrows=10, - storage_options=s3so) + df = read_csv("s3a://pandas-test/tips.csv", nrows=10, storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) def test_parse_public_s3_bucket_nrows(self, tips_df): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp, - storage_options=s3so) + df = read_csv( + "s3://pandas-test/tips.csv" + ext, + nrows=10, + compression=comp, + storage_options=s3so, + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) @@ -107,8 +111,10 @@ def test_parse_public_s3_bucket_chunked(self, tips_df): chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df_reader = read_csv( - "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp, - storage_options=s3so + "s3://pandas-test/tips.csv" + ext, + chunksize=chunksize, + compression=comp, + storage_options=s3so, ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -129,7 +135,7 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df): chunksize=chunksize, compression=comp, engine="python", - storage_options=s3so + storage_options=s3so, ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -143,8 +149,10 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df): def test_parse_public_s3_bucket_python(self, tips_df): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp, - storage_options=s3so + "s3://pandas-test/tips.csv" + ext, + engine="python", + compression=comp, + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty @@ -153,8 +161,10 @@ def test_parse_public_s3_bucket_python(self, tips_df): def test_infer_s3_compression(self, tips_df): for ext in ["", ".gz", ".bz2"]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer", - storage_options=s3so + "s3://pandas-test/tips.csv" + ext, + engine="python", + compression="infer", + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty @@ -167,7 +177,7 @@ def test_parse_public_s3_bucket_nrows_python(self, tips_df): engine="python", nrows=10, compression=comp, - storage_options=s3so + 
storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty @@ -193,8 +203,9 @@ def test_write_s3_csv_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_csv("s3://an_s3_bucket_data_doesnt_exit/not_real.csv", - storage_options=s3so) + tips_df.to_csv( + "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so + ) @td.skip_if_no("pyarrow") def test_write_s3_parquet_fails(self, tips_df): @@ -208,8 +219,10 @@ def test_write_s3_parquet_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_parquet("s3://an_s3_bucket_data_doesnt_exit/not_real.parquet", - storage_options=s3so) + tips_df.to_parquet( + "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet", + storage_options=s3so, + ) def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): # see gh-16135 @@ -245,21 +258,20 @@ def test_read_csv_chunked_download(self, s3_resource, caplog): s3fs.S3FileSystem.clear_instance_cache() with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv("s3://pandas-test/large-file.csv", nrows=5, - storage_options=s3so) + read_csv("s3://pandas-test/large-file.csv", nrows=5, storage_options=s3so) # log of fetch_range (start, stop) assert (0, 5505024) in (x.args[-2:] for x in caplog.records) def test_read_s3_with_hash_in_key(self, tips_df): # GH 25945 - result = read_csv("s3://pandas-test/tips#1.csv", - storage_options=s3so) + result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so) tm.assert_frame_equal(tips_df, result) @td.skip_if_no("pyarrow") def test_read_feather_s3_file_path(self, feather_file): # GH 29055 expected = read_feather(feather_file) - res = read_feather("s3://pandas-test/simple_dataset.feather", - storage_options=s3so) + res = read_feather( + "s3://pandas-test/simple_dataset.feather", storage_options=s3so + ) tm.assert_frame_equal(expected, res) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 0350fc0e1fade..1579b96c8e9f9 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -27,7 +27,7 @@ # the ignore on the following line accounts for to_csv returning Optional(str) # in general, but always str in the case we give no filename text = df1.to_csv(index=False).encode() # type: ignore -s3so = dict(client_kwargs={'endpoint_url': 'http://127.0.0.1:5555/'}) +s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) @pytest.fixture @@ -132,18 +132,26 @@ def test_fastparquet_options(fsspectest): @td.skip_if_no("s3fs") def test_from_s3_csv(s3_resource, tips_file): - tm.assert_equal(read_csv("s3://pandas-test/tips.csv", storage_options=s3so), read_csv(tips_file)) + tm.assert_equal( + read_csv("s3://pandas-test/tips.csv", storage_options=s3so), read_csv(tips_file) + ) # the following are decompressed by pandas, not fsspec - tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz", storage_options=s3so), read_csv(tips_file)) - tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2", storage_options=s3so), read_csv(tips_file)) + tm.assert_equal( + read_csv("s3://pandas-test/tips.csv.gz", storage_options=s3so), + read_csv(tips_file), + ) + tm.assert_equal( + read_csv("s3://pandas-test/tips.csv.bz2", storage_options=s3so), + read_csv(tips_file), + ) @pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"]) @td.skip_if_no("s3fs") def 
test_s3_protocols(s3_resource, tips_file, protocol): tm.assert_equal( - read_csv("%s://pandas-test/tips.csv" % protocol, - storage_options=s3so), read_csv(tips_file) + read_csv("%s://pandas-test/tips.csv" % protocol, storage_options=s3so), + read_csv(tips_file), ) @@ -151,8 +159,9 @@ def test_s3_protocols(s3_resource, tips_file, protocol): @td.skip_if_no("fastparquet") def test_s3_parquet(s3_resource): fn = "s3://pandas-test/test.parquet" - df1.to_parquet(fn, index=False, engine="fastparquet", compression=None, - storage_options=s3so) + df1.to_parquet( + fn, index=False, engine="fastparquet", compression=None, storage_options=s3so + ) df2 = read_parquet(fn, engine="fastparquet", storage_options=s3so) tm.assert_equal(df1, df2) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 61f6fde713f56..53e520344f72d 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -158,9 +158,9 @@ def check_round_trip( """ write_kwargs = write_kwargs or {"compression": None} read_kwargs = read_kwargs or {} - s3so = dict(client_kwargs={'endpoint_url': 'http://127.0.0.1:5555/'}) - read_kwargs['storage_options'] = s3so - write_kwargs['storage_options'] = s3so + s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + read_kwargs["storage_options"] = s3so + write_kwargs["storage_options"] = s3so if expected is None: expected = df From 6ce6eccadc6f15f055aba77204768851ef85b380 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 29 Jul 2020 12:35:04 -0400 Subject: [PATCH 25/59] Plumb excel --- pandas/io/excel/_base.py | 14 +++++++++----- pandas/io/excel/_odfreader.py | 6 ++++-- pandas/io/excel/_openpyxl.py | 8 ++++++-- pandas/io/excel/_pyxlsb.py | 6 ++++-- pandas/io/excel/_xlrd.py | 6 ++++-- pandas/tests/io/excel/test_readers.py | 4 +--- 6 files changed, 28 insertions(+), 16 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 2a12f779230b2..e5d24882f1d6b 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -298,10 +298,11 @@ def read_excel( skipfooter=0, convert_float=True, mangle_dupe_cols=True, + storage_options=None, ): if not isinstance(io, ExcelFile): - io = ExcelFile(io, engine=engine) + io = ExcelFile(io, storage_options=storage_options, engine=engine) elif engine and engine != io.engine: raise ValueError( "Engine should not be specified when passing " @@ -336,12 +337,14 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer): + def __init__(self, filepath_or_buffer, storage_options=None): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) + filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer, storage_options=storage_options + ) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer @@ -844,7 +847,7 @@ class ExcelFile: "pyxlsb": _PyxlsbReader, } - def __init__(self, path_or_buffer, engine=None): + def __init__(self, path_or_buffer, storage_options=None, engine=None): if engine is None: engine = "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): @@ -858,13 +861,14 @@ def __init__(self, path_or_buffer, engine=None): raise ValueError(f"Unknown engine: {engine}") self.engine = engine + self.storage_options = 
storage_options # Could be a str, ExcelFile, Book, etc. self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) - self._reader = self._engines[engine](self._io) + self._reader = self._engines[engine](self._io, storage_options) def __fspath__(self): return self._io diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 85ec9afaaec25..abfaae01064ac 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -18,11 +18,13 @@ class _ODFReader(_BaseExcelReader): ---------- filepath_or_buffer: string, path to be parsed or an open readable stream. + storage_options: dict (optional) + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ - def __init__(self, filepath_or_buffer: FilePathOrBuffer): + def __init__(self, filepath_or_buffer: FilePathOrBuffer, storage_options=None): import_optional_dependency("odf") - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 0696d82e51f34..0476fa820d36a 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -467,7 +467,9 @@ def write_cells( class _OpenpyxlReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: + def __init__( + self, filepath_or_buffer: FilePathOrBuffer, storage_options=None + ) -> None: """ Reader using openpyxl engine. @@ -475,9 +477,11 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. + storage_options: dict (optional) + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("openpyxl") - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 0d96c8c4acdb8..4c9f03c2ceb3c 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -7,7 +7,7 @@ class _PyxlsbReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer): + def __init__(self, filepath_or_buffer: FilePathOrBuffer, storage_options=None): """ Reader using pyxlsb engine. @@ -15,11 +15,13 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer): ---------- filepath_or_buffer: str, path object, or Workbook Object to be parsed. + storage_options: dict (optional) + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("pyxlsb") # This will call load_workbook on the filepath or buffer # And set the result to the book-attribute - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 8f7d3b1368fc7..2cdfcd530d37c 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -8,7 +8,7 @@ class _XlrdReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer): + def __init__(self, filepath_or_buffer, storage_options=None): """ Reader using xlrd engine. @@ -16,10 +16,12 @@ def __init__(self, filepath_or_buffer): ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. 
+ storage_options: dict (optional) + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ err_msg = "Install xlrd >= 1.0.0 for Excel support" import_optional_dependency("xlrd", extra=err_msg) - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 9fe3c55eec675..3fdacc6bd4911 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -630,9 +630,7 @@ def test_read_from_http_url(self, read_ext): def test_read_from_s3_url(self, read_ext, s3_resource): # Bucket "pandas-test" created in tests/io/conftest.py with open("test1" + read_ext, "rb") as f: - s3_resource.Bucket( - "pandas-test", endpoint_url="http://127.0.0.1:5555/" - ).put_object(Key="test1" + read_ext, Body=f) + s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) url = "s3://pandas-test/test1" + read_ext s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) From a1dba75ff7af8aa509325ad57bda8f481ac61bc8 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 11:32:18 -0400 Subject: [PATCH 26/59] fix merge --- pandas/core/frame.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 21fe476dde2e3..9d0751fcce460 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2298,7 +2298,7 @@ def to_parquet( compression: Optional[str] = "snappy", index: Optional[bool] = None, partition_cols: Optional[List[str]] = None, -(??) + storage_options: StorageOptions = None, **kwargs, ) -> None: """ From e2717dbfe769753893666bc5a465ce20307a6faa Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 11:33:09 -0400 Subject: [PATCH 27/59] isort --- pandas/tests/io/conftest.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 78fbc763ffcc6..3cc9e529525a2 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -74,9 +74,10 @@ def add_tips_files(bucket_name): try: import shlex import subprocess - import requests import time + import requests + endpoint_uri = "http://127.0.0.1:5555/" proc = subprocess.Popen(shlex.split("moto_server s3 -p 5555")) From e9ed76f78389abbeee96587ff1cd10ae73ba34f9 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 11:39:48 -0400 Subject: [PATCH 28/59] option typo --- pandas/tests/io/parser/test_network.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 6bd7c48c90aa9..ada07712008d7 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -74,7 +74,7 @@ def test_parse_public_s3_bucket(self, tips_df): tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents - df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3fs) + df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) From 1fb4b40d331a26ba2bc2ad7d7d43091e1e54fec9 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 12:03:46 -0400 Subject: [PATCH 29/59] remove moto variable --- pandas/tests/io/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 3cc9e529525a2..60c6945212abf 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -55,7 +55,7 @@ def s3_resource(tips_file, jsonl_file, feather_file): os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") - moto = pytest.importorskip("moto") + pytest.importorskip("moto") test_s3_files = [ ("tips#1.csv", tips_file), From e646c1654d1ec070aae9dcf3c4801b9b9bcde176 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 12:52:30 -0400 Subject: [PATCH 30/59] Add flask where there is moto Apparently moto in server mode needs flask https://github.com/spulec/moto/blob/master/setup.py#L88 --- ci/deps/azure-36-locale.yaml | 1 + ci/deps/azure-36-slow.yaml | 1 + ci/deps/azure-37-locale.yaml | 1 + ci/deps/azure-windows-37.yaml | 1 + ci/deps/travis-36-cov.yaml | 1 + ci/deps/travis-36-locale.yaml | 1 + ci/deps/travis-37-arm64.yaml | 1 + environment.yml | 1 + pandas/tests/io/conftest.py | 1 + requirements-dev.txt | 3 ++- 10 files changed, 11 insertions(+), 1 deletion(-) diff --git a/ci/deps/azure-36-locale.yaml b/ci/deps/azure-36-locale.yaml index 3034ed3dc43af..fc8018b014ece 100644 --- a/ci/deps/azure-36-locale.yaml +++ b/ci/deps/azure-36-locale.yaml @@ -36,3 +36,4 @@ dependencies: - xlsxwriter - xlwt - moto + - flask diff --git a/ci/deps/azure-36-slow.yaml b/ci/deps/azure-36-slow.yaml index 87bad59fa4873..2705690e65a70 100644 --- a/ci/deps/azure-36-slow.yaml +++ b/ci/deps/azure-36-slow.yaml @@ -33,3 +33,4 @@ dependencies: - xlsxwriter - xlwt - moto + - flask diff --git a/ci/deps/azure-37-locale.yaml b/ci/deps/azure-37-locale.yaml index 6f64c81f299d1..1b2716211b9df 100644 --- a/ci/deps/azure-37-locale.yaml +++ b/ci/deps/azure-37-locale.yaml @@ -20,6 +20,7 @@ dependencies: - lxml - matplotlib>=3.3.0 - moto + - flask - nomkl - numexpr - numpy diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 5bbd0e2795d7e..75d31c91437fe 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -22,6 +22,7 @@ dependencies: - lxml - matplotlib=2.2.* - moto + - flask - numexpr - numpy=1.18.* - openpyxl diff --git a/ci/deps/travis-36-cov.yaml b/ci/deps/travis-36-cov.yaml index 177e0d3f4c0af..e4c408f7df2d0 100644 --- a/ci/deps/travis-36-cov.yaml +++ b/ci/deps/travis-36-cov.yaml @@ -24,6 +24,7 @@ dependencies: - html5lib - matplotlib - moto + - flask - nomkl - numexpr - numpy=1.15.* diff --git a/ci/deps/travis-36-locale.yaml b/ci/deps/travis-36-locale.yaml index 03a1e751b6a86..ef8a2c26c0f3a 100644 --- a/ci/deps/travis-36-locale.yaml +++ b/ci/deps/travis-36-locale.yaml @@ -22,6 +22,7 @@ dependencies: - lxml=3.8.0 - matplotlib=3.0.* - moto + - flask - nomkl - numexpr - numpy diff --git a/ci/deps/travis-37-arm64.yaml b/ci/deps/travis-37-arm64.yaml index 5cb53489be225..ea29cbef1272b 100644 --- a/ci/deps/travis-37-arm64.yaml +++ b/ci/deps/travis-37-arm64.yaml @@ -17,5 +17,6 @@ dependencies: - python-dateutil - pytz - pip + - flask - pip: - moto diff --git a/environment.yml b/environment.yml index ed9762e5b8893..0822e8976e046 100644 --- a/environment.yml +++ b/environment.yml @@ -52,6 +52,7 @@ dependencies: - botocore>=1.11 - hypothesis>=3.82 - moto # mock S3 + - flask - pytest>=5.0.1 - pytest-cov - pytest-xdist>=1.21 diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 60c6945212abf..fffa7a3e0e1bd 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -56,6 
+56,7 @@ def s3_resource(tips_file, jsonl_file, feather_file): os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") pytest.importorskip("moto") + pytest.importorskip("flask") # server mode needs flask too test_s3_files = [ ("tips#1.csv", tips_file), diff --git a/requirements-dev.txt b/requirements-dev.txt index 6a87b0a99a4f8..bebe99b87ef2d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -32,6 +32,7 @@ boto3 botocore>=1.11 hypothesis>=3.82 moto +flask pytest>=5.0.1 pytest-cov pytest-xdist>=1.21 @@ -74,4 +75,4 @@ pyreadstat tabulate>=0.8.3 git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master git+https://github.com/numpy/numpydoc -pyflakes>=2.2.0 \ No newline at end of file +pyflakes>=2.2.0 From f61bf0b14eabe61b49e875be2332b417439894e6 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 13:21:28 -0400 Subject: [PATCH 31/59] specific options for s3 --- pandas/tests/io/test_parquet.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 53e520344f72d..b635fafeb1241 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -13,6 +13,7 @@ import pandas as pd import pandas._testing as tm +from pandas.io.common import is_fsspec_url from pandas.io.parquet import ( FastParquetImpl, PyArrowImpl, @@ -158,9 +159,10 @@ def check_round_trip( """ write_kwargs = write_kwargs or {"compression": None} read_kwargs = read_kwargs or {} - s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) - read_kwargs["storage_options"] = s3so - write_kwargs["storage_options"] = s3so + if isinstance(path, str) and "s3://" in path: + s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + read_kwargs["storage_options"] = s3so + write_kwargs["storage_options"] = s3so if expected is None: expected = df From d7e5b4af6adc543c229f4f58e11548ca4b51607e Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 14:16:49 -0400 Subject: [PATCH 32/59] skip some; fix unrelated HDF arg order --- pandas/io/pytables.py | 8 ++++---- pandas/tests/io/conftest.py | 5 +++++ pandas/tests/io/test_parquet.py | 1 - 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index aeb7b3e044794..2abc570a04de3 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -320,6 +320,10 @@ def read_hdf( mode : {'r', 'r+', 'a'}, default 'r' Mode to use when opening the file. Ignored if path_or_buf is a :class:`pandas.HDFStore`. Default is 'r'. + errors : str, default 'strict' + Specifies how encoding and decoding errors are to be handled. + See the errors argument for :func:`open` for a full list + of options. where : list, optional A list of Term (or convertible) objects. start : int, optional @@ -332,10 +336,6 @@ def read_hdf( Return an iterator object. chunksize : int, optional Number of rows to include in an iteration when using an iterator. - errors : str, default 'strict' - Specifies how encoding and decoding errors are to be handled. - See the errors argument for :func:`open` for a full list - of options. **kwargs Additional keyword arguments passed to HDFStore. 
diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index fffa7a3e0e1bd..b4517d336eb95 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,4 +1,5 @@ import os +import sys import pytest @@ -30,6 +31,10 @@ def feather_file(datapath): return datapath("io", "data", "feather", "feather-0_3_1.feather") +@pytest.mark.skipif( + sys.version_info.major == 3 and sys.version_info.minor == 6, + reason="moto is too old on py36", +) @pytest.fixture def s3_resource(tips_file, jsonl_file, feather_file): """ diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b635fafeb1241..43745f98d3590 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -13,7 +13,6 @@ import pandas as pd import pandas._testing as tm -from pandas.io.common import is_fsspec_url from pandas.io.parquet import ( FastParquetImpl, PyArrowImpl, From df6d48f949b8033dcd24d9eeac445f4dfe509e8e Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 14:38:28 -0400 Subject: [PATCH 33/59] rerun generate_pip_deps --- requirements-dev.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements-dev.txt b/requirements-dev.txt index bebe99b87ef2d..67a26c4216eac 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -75,4 +75,4 @@ pyreadstat tabulate>=0.8.3 git+https://github.com/pandas-dev/pydata-sphinx-theme.git@master git+https://github.com/numpy/numpydoc -pyflakes>=2.2.0 +pyflakes>=2.2.0 \ No newline at end of file From 84a8149e2dcada1a6bcba52c156bd14d28353b14 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 16:37:41 -0400 Subject: [PATCH 34/59] try simpler --- pandas/tests/io/conftest.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index b4517d336eb95..b142bb0d8e9f7 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,3 +1,4 @@ +from distutils.version import LooseVersion import os import sys @@ -31,10 +32,7 @@ def feather_file(datapath): return datapath("io", "data", "feather", "feather-0_3_1.feather") -@pytest.mark.skipif( - sys.version_info.major == 3 and sys.version_info.minor == 6, - reason="moto is too old on py36", -) +@pytest.mark.skipif(sys.version_info.major < (3, 7),) @pytest.fixture def s3_resource(tips_file, jsonl_file, feather_file): """ From 32cf3e1bdf0dbcffae42f533f88e5e711978f680 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Mon, 10 Aug 2020 16:54:47 -0400 Subject: [PATCH 35/59] try again --- pandas/tests/io/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index b142bb0d8e9f7..2e4f89fbae8a9 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -32,7 +32,7 @@ def feather_file(datapath): return datapath("io", "data", "feather", "feather-0_3_1.feather") -@pytest.mark.skipif(sys.version_info.major < (3, 7),) +@pytest.mark.skipif(sys.version_info < (3, 7),) @pytest.fixture def s3_resource(tips_file, jsonl_file, feather_file): """ From 867c985f7b266d6779582a1cb7965622f415134d Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 11 Aug 2020 09:31:38 -0400 Subject: [PATCH 36/59] Check moto not py --- pandas/tests/io/conftest.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 2e4f89fbae8a9..efda7955841ef 100644 --- 
a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,6 +1,5 @@ from distutils.version import LooseVersion import os -import sys import pytest @@ -32,7 +31,6 @@ def feather_file(datapath): return datapath("io", "data", "feather", "feather-0_3_1.feather") -@pytest.mark.skipif(sys.version_info < (3, 7),) @pytest.fixture def s3_resource(tips_file, jsonl_file, feather_file): """ @@ -58,8 +56,10 @@ def s3_resource(tips_file, jsonl_file, feather_file): os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") - pytest.importorskip("moto") + moto = pytest.importorskip("moto") pytest.importorskip("flask") # server mode needs flask too + if LooseVersion(moto.__version__) < LooseVersion("1.3.14"): + pytest.skip("Moto too old") test_s3_files = [ ("tips#1.csv", tips_file), From 65ec74f49a080856d73d1273885816b3a9564310 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 11 Aug 2020 12:59:11 -0400 Subject: [PATCH 37/59] Suggestions --- pandas/io/excel/_base.py | 18 +++++++++++--- pandas/io/excel/_odfreader.py | 12 ++++++---- pandas/io/excel/_openpyxl.py | 8 ++++--- pandas/io/excel/_pyxlsb.py | 12 ++++++---- pandas/io/excel/_xlrd.py | 5 ++-- pandas/io/feather_format.py | 7 ++++-- pandas/io/parsers.py | 4 ++-- pandas/tests/io/conftest.py | 15 +++++++++++- pandas/tests/io/excel/test_readers.py | 3 +-- pandas/tests/io/parser/test_network.py | 33 ++++++++++++-------------- pandas/tests/io/test_fsspec.py | 7 +++--- 11 files changed, 79 insertions(+), 45 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 49558988d9715..273d9d4707597 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -8,6 +8,7 @@ from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES +from pandas._typing import StorageOptions from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -199,6 +200,15 @@ Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. +storage_options : dict, optional + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. 
versionadded:: 1.2.0 Returns ------- @@ -298,7 +308,7 @@ def read_excel( skipfooter=0, convert_float=True, mangle_dupe_cols=True, - storage_options=None, + storage_options: StorageOptions = None, ): if not isinstance(io, ExcelFile): @@ -337,7 +347,7 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer, storage_options=None): + def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) @@ -847,7 +857,9 @@ class ExcelFile: "pyxlsb": _PyxlsbReader, } - def __init__(self, path_or_buffer, storage_options=None, engine=None): + def __init__( + self, path_or_buffer, storage_options: StorageOptions = None, engine=None + ): if engine is None: engine = "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 9b93e6b7311d1..a6cd8f524503b 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -2,7 +2,7 @@ import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -16,13 +16,17 @@ class _ODFReader(_BaseExcelReader): Parameters ---------- - filepath_or_buffer: string, path to be parsed or + filepath_or_buffer : string, path to be parsed or an open readable stream. - storage_options: dict (optional) + storage_options : StorageOptions passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ - def __init__(self, filepath_or_buffer: FilePathOrBuffer, storage_options=None): + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ): import_optional_dependency("odf") super().__init__(filepath_or_buffer, storage_options=storage_options) diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index b17e74b2cf864..73239190604db 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -2,7 +2,7 @@ import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ExcelWriter, _BaseExcelReader @@ -468,7 +468,9 @@ def write_cells( class _OpenpyxlReader(_BaseExcelReader): def __init__( - self, filepath_or_buffer: FilePathOrBuffer, storage_options=None + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, ) -> None: """ Reader using openpyxl engine. @@ -477,7 +479,7 @@ def __init__( ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. 
- storage_options: dict (optional) + storage_options : StorageOptions passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("openpyxl") diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 4c9f03c2ceb3c..c0e281ff6c2da 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,21 +1,25 @@ from typing import List -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader class _PyxlsbReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer, storage_options=None): + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ): """ Reader using pyxlsb engine. Parameters ---------- - filepath_or_buffer: str, path object, or Workbook + filepath_or_buffer : str, path object, or Workbook Object to be parsed. - storage_options: dict (optional) + storage_options : StorageOptions passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("pyxlsb") diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index 36a15331c16e5..ff1b3c8bdb964 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -2,13 +2,14 @@ import numpy as np +from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader class _XlrdReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer, storage_options=None): + def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): """ Reader using xlrd engine. @@ -16,7 +17,7 @@ def __init__(self, filepath_or_buffer, storage_options=None): ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. - storage_options: dict (optional) + storage_options : StorageOptions passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ err_msg = "Install xlrd >= 1.0.0 for Excel support" diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 2c664e73b9463..2d86fa44f22a4 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -1,5 +1,6 @@ """ feather-format compat """ +from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas import DataFrame, Int64Index, RangeIndex @@ -7,7 +8,7 @@ from pandas.io.common import get_filepath_or_buffer -def to_feather(df: DataFrame, path, storage_options=None, **kwargs): +def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kwargs): """ Write a DataFrame to the binary Feather format. @@ -77,7 +78,9 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs): feather.write_feather(df, path, **kwargs) -def read_feather(path, columns=None, use_threads: bool = True, storage_options=None): +def read_feather( + path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None +): """ Load a feather-format object from the file path. 
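(Reviewer note, not part of the diff.) The ``storage_options`` threading above is what lets a plain fsspec URL carry connection settings down to s3fs. A minimal usage sketch against the local moto endpoint used by the test fixtures — bucket and file names follow what ``pandas/tests/io/conftest.py`` uploads, and s3fs plus an Excel engine are assumed to be installed:

    import pandas as pd

    # extra keywords are forwarded through fsspec to s3fs.S3FileSystem;
    # the endpoint below is the moto server the test suite starts locally
    opts = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555/"}}

    tips = pd.read_csv("s3://pandas-test/tips.csv", storage_options=opts)
    sheet = pd.read_excel("s3://pandas-test/test1.xlsx", storage_options=opts)

Against real S3 the same pattern would carry credentials (e.g. ``key``/``secret`` for s3fs) instead of an endpoint override.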
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9dc0e1f71d13b..1512c2467c7df 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,7 +20,7 @@ import pandas._libs.parsers as parsers from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing -from pandas._typing import FilePathOrBuffer, Union +from pandas._typing import FilePathOrBuffer, StorageOptions, Union from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -596,7 +596,7 @@ def read_csv( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, - storage_options=None, + storage_options: StorageOptions = None, ): # gh-23761 # diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index efda7955841ef..19b3bc8f0d101 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -31,6 +31,11 @@ def feather_file(datapath): return datapath("io", "data", "feather", "feather-0_3_1.feather") +@pytest.fixture +def s3so(): + return dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + + @pytest.fixture def s3_resource(tips_file, jsonl_file, feather_file): """ @@ -76,6 +81,9 @@ def add_tips_files(bucket_name): conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) try: + # Launching moto in server mode, i.e., as a separate process + # with an S3 endpoint on localhost + import shlex import subprocess import time @@ -84,11 +92,15 @@ def add_tips_files(bucket_name): endpoint_uri = "http://127.0.0.1:5555/" - proc = subprocess.Popen(shlex.split("moto_server s3 -p 5555")) + # pipe to null to avoid logging in terminal + proc = subprocess.Popen( + shlex.split("moto_server s3 -p 5555"), stdout=subprocess.DEVNULL + ) timeout = 5 while timeout > 0: try: + # OK to go once server is accepting connections r = requests.get(endpoint_uri) if r.ok: break @@ -109,5 +121,6 @@ def add_tips_files(bucket_name): s3fs.S3FileSystem.clear_instance_cache() yield conn finally: + # shut down external process proc.terminate() proc.wait() diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 3fdacc6bd4911..43302092daa9e 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -627,13 +627,12 @@ def test_read_from_http_url(self, read_ext): tm.assert_frame_equal(url_table, local_table) @td.skip_if_not_us_locale - def test_read_from_s3_url(self, read_ext, s3_resource): + def test_read_from_s3_url(self, read_ext, s3_resource, s3so): # Bucket "pandas-test" created in tests/io/conftest.py with open("test1" + read_ext, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) url = "s3://pandas-test/test1" + read_ext - s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) url_table = pd.read_excel(url, storage_options=s3so) local_table = pd.read_excel("test1" + read_ext) diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index ada07712008d7..5a08216f01ce4 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -52,14 +52,11 @@ def tips_df(datapath): return read_csv(datapath("io", "data", "csv", "tips.csv")) -s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) - - @pytest.mark.usefixtures("s3_resource") @td.skip_if_not_us_locale() class TestS3: @td.skip_if_no("s3fs") - def test_parse_public_s3_bucket(self, tips_df): + def test_parse_public_s3_bucket(self, tips_df, s3so): # more of an integration test due to the 
not-public contents portion # can probably mock this though. @@ -79,7 +76,7 @@ def test_parse_public_s3_bucket(self, tips_df): assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3n_bucket(self, tips_df): + def test_parse_public_s3n_bucket(self, tips_df, s3so): # Read from AWS s3 as "s3n" URL df = read_csv("s3n://pandas-test/tips.csv", nrows=10, storage_options=s3so) @@ -87,14 +84,14 @@ def test_parse_public_s3n_bucket(self, tips_df): assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3a_bucket(self, tips_df): + def test_parse_public_s3a_bucket(self, tips_df, s3so): # Read from AWS s3 as "s3a" URL df = read_csv("s3a://pandas-test/tips.csv", nrows=10, storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self, tips_df): + def test_parse_public_s3_bucket_nrows(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( "s3://pandas-test/tips.csv" + ext, @@ -106,7 +103,7 @@ def test_parse_public_s3_bucket_nrows(self, tips_df): assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self, tips_df): + def test_parse_public_s3_bucket_chunked(self, tips_df, s3so): # Read with a chunksize chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: @@ -126,7 +123,7 @@ def test_parse_public_s3_bucket_chunked(self, tips_df): true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self, tips_df): + def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so): # Read with a chunksize using the Python parser chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: @@ -146,7 +143,7 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df): true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self, tips_df): + def test_parse_public_s3_bucket_python(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( "s3://pandas-test/tips.csv" + ext, @@ -158,7 +155,7 @@ def test_parse_public_s3_bucket_python(self, tips_df): assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_infer_s3_compression(self, tips_df): + def test_infer_s3_compression(self, tips_df, s3so): for ext in ["", ".gz", ".bz2"]: df = read_csv( "s3://pandas-test/tips.csv" + ext, @@ -170,7 +167,7 @@ def test_infer_s3_compression(self, tips_df): assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3_bucket_nrows_python(self, tips_df): + def test_parse_public_s3_bucket_nrows_python(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( "s3://pandas-test/tips.csv" + ext, @@ -183,7 +180,7 @@ def test_parse_public_s3_bucket_nrows_python(self, tips_df): assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_read_s3_fails(self): + def test_read_s3_fails(self, s3so): with pytest.raises(IOError): read_csv("s3://nyqpug/asdf.csv", storage_options=s3so) @@ -192,7 +189,7 @@ def test_read_s3_fails(self): with pytest.raises(IOError): read_csv("s3://cant_get_it/file.csv") - def test_write_s3_csv_fails(self, tips_df): + def test_write_s3_csv_fails(self, tips_df, s3so): # GH 32486 # Attempting to write to an 
invalid S3 path should raise import botocore @@ -208,7 +205,7 @@ def test_write_s3_csv_fails(self, tips_df): ) @td.skip_if_no("pyarrow") - def test_write_s3_parquet_fails(self, tips_df): + def test_write_s3_parquet_fails(self, tips_df, s3so): # GH 27679 # Attempting to write to an invalid S3 path should raise import botocore @@ -238,7 +235,7 @@ def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): expected = read_csv(tips_file) tm.assert_frame_equal(result, expected) - def test_read_csv_chunked_download(self, s3_resource, caplog): + def test_read_csv_chunked_download(self, s3_resource, caplog, s3so): # 8 MB, S3FS usees 5MB chunks import s3fs @@ -262,13 +259,13 @@ def test_read_csv_chunked_download(self, s3_resource, caplog): # log of fetch_range (start, stop) assert (0, 5505024) in (x.args[-2:] for x in caplog.records) - def test_read_s3_with_hash_in_key(self, tips_df): + def test_read_s3_with_hash_in_key(self, tips_df, s3so): # GH 25945 result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so) tm.assert_frame_equal(tips_df, result) @td.skip_if_no("pyarrow") - def test_read_feather_s3_file_path(self, feather_file): + def test_read_feather_s3_file_path(self, feather_file, s3so): # GH 29055 expected = read_feather(feather_file) res = read_feather( diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 990d5087afe7f..666da677d702e 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -28,7 +28,6 @@ # in general, but always str in the case we give no filename # error: Item "None" of "Optional[str]" has no attribute "encode" text = df1.to_csv(index=False).encode() # type: ignore[union-attr] -s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) @pytest.fixture @@ -132,7 +131,7 @@ def test_fastparquet_options(fsspectest): @td.skip_if_no("s3fs") -def test_from_s3_csv(s3_resource, tips_file): +def test_from_s3_csv(s3_resource, tips_file, s3so): tm.assert_equal( read_csv("s3://pandas-test/tips.csv", storage_options=s3so), read_csv(tips_file) ) @@ -149,7 +148,7 @@ def test_from_s3_csv(s3_resource, tips_file): @pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"]) @td.skip_if_no("s3fs") -def test_s3_protocols(s3_resource, tips_file, protocol): +def test_s3_protocols(s3_resource, tips_file, protocol, s3so): tm.assert_equal( read_csv("%s://pandas-test/tips.csv" % protocol, storage_options=s3so), read_csv(tips_file), @@ -158,7 +157,7 @@ def test_s3_protocols(s3_resource, tips_file, protocol): @td.skip_if_no("s3fs") @td.skip_if_no("fastparquet") -def test_s3_parquet(s3_resource): +def test_s3_parquet(s3_resource, s3so): fn = "s3://pandas-test/test.parquet" df1.to_parquet( fn, index=False, engine="fastparquet", compression=None, storage_options=s3so From e5fa34113ec63e5895cb2e776358adbf85c9d804 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 11 Aug 2020 13:40:30 -0400 Subject: [PATCH 38/59] maybe mypy fix --- pandas/io/excel/_base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 273d9d4707597..8b3b2e28d230f 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,7 +3,7 @@ from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Union +from typing import Any, Mapping, Union from pandas._config import config @@ -200,7 +200,7 @@ Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. 
Passing in False will cause data to be overwritten if there are duplicate names in the columns. -storage_options : dict, optional +storage_options : StorageOptions Extra options that make sense for a particular storage connection, e.g. host, port, username, password, etc., if using a URL that will be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error @@ -850,7 +850,7 @@ class ExcelFile: from pandas.io.excel._pyxlsb import _PyxlsbReader from pandas.io.excel._xlrd import _XlrdReader - _engines = { + _engines: Mapping[str, Any] = { "xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader, @@ -880,7 +880,7 @@ def __init__( # Always a string self._io = stringify_path(path_or_buffer) - self._reader = self._engines[engine](self._io, storage_options) + self._reader = self._engines[engine](self._io, storage_options=storage_options) def __fspath__(self): return self._io From 8ab270229f96f9f4d0723c91bd86a4533c4de3a3 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Wed, 12 Aug 2020 11:30:37 -0400 Subject: [PATCH 39/59] Move fsspec fixture imports; add whatsnew note --- doc/source/whatsnew/v1.2.0.rst | 3 +++ pandas/tests/io/conftest.py | 10 ++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v1.2.0.rst b/doc/source/whatsnew/v1.2.0.rst index 86f47a5826214..d91f536a1581e 100644 --- a/doc/source/whatsnew/v1.2.0.rst +++ b/doc/source/whatsnew/v1.2.0.rst @@ -24,6 +24,9 @@ of the individual storage backends (detailed from the fsspec docs for `builtin implementations`_ and linked to `external ones`_). See Section :ref:`io.remote`. +:issue:`35655` added fsspec support (including ``storage_options``) +for reading excel files. + .. _builtin implementations: https://filesystem-spec.readthedocs.io/en/latest/api.html#built-in-implementations .. 
_external ones: https://filesystem-spec.readthedocs.io/en/latest/api.html#other-known-implementations diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 19b3bc8f0d101..82c3cc5052378 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,5 +1,8 @@ from distutils.version import LooseVersion import os +import shlex +import subprocess +import time import pytest @@ -54,6 +57,7 @@ def s3_resource(tips_file, jsonl_file, feather_file): """ s3fs = pytest.importorskip("s3fs") boto3 = pytest.importorskip("boto3") + requests = pytest.importorskip("requests") with tm.ensure_safe_environment_variables(): # temporary workaround as moto fails for botocore >= 1.11 otherwise, @@ -84,12 +88,6 @@ def add_tips_files(bucket_name): # Launching moto in server mode, i.e., as a separate process # with an S3 endpoint on localhost - import shlex - import subprocess - import time - - import requests - endpoint_uri = "http://127.0.0.1:5555/" # pipe to null to avoid logging in terminal From 476e96a773bbf222dd3d9a9bdb6bb4d1b79e0ee3 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 14 Aug 2020 09:16:13 -0400 Subject: [PATCH 40/59] responses --- pandas/io/excel/_base.py | 2 +- pandas/tests/io/conftest.py | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index 8b3b2e28d230f..aaef71910c9ab 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -858,7 +858,7 @@ class ExcelFile: } def __init__( - self, path_or_buffer, storage_options: StorageOptions = None, engine=None + self, path_or_buffer, engine=None, storage_options: StorageOptions = None ): if engine is None: engine = "xlrd" diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 82c3cc5052378..354c6cfe2308f 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -65,10 +65,8 @@ def s3_resource(tips_file, jsonl_file, feather_file): os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") - moto = pytest.importorskip("moto") + moto = pytest.importorskip("moto", minversion="1.3.14") pytest.importorskip("flask") # server mode needs flask too - if LooseVersion(moto.__version__) < LooseVersion("1.3.14"): - pytest.skip("Moto too old") test_s3_files = [ ("tips#1.csv", tips_file), From 8ae91897aaa0c61c52230a96d68e56947761ec0f Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 14 Aug 2020 09:40:40 -0400 Subject: [PATCH 41/59] relint --- pandas/tests/io/conftest.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 354c6cfe2308f..8bd403f222af7 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,4 +1,3 @@ -from distutils.version import LooseVersion import os import shlex import subprocess @@ -65,7 +64,7 @@ def s3_resource(tips_file, jsonl_file, feather_file): os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") - moto = pytest.importorskip("moto", minversion="1.3.14") + pytest.importorskip("moto", minversion="1.3.14") pytest.importorskip("flask") # server mode needs flask too test_s3_files = [ From eab08b9499aacc092133f2e82874a0b59838ac68 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 14 Aug 2020 09:44:57 -0400 Subject: [PATCH 42/59] update moto deps --- ci/deps/azure-37-slow.yaml | 1 + ci/deps/azure-windows-37.yaml | 2 +- 
ci/deps/travis-37-cov.yaml | 2 +- ci/deps/travis-37.yaml | 4 ++-- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/ci/deps/azure-37-slow.yaml b/ci/deps/azure-37-slow.yaml index a8b7fdec3b6ac..d17a8a2b0ed9b 100644 --- a/ci/deps/azure-37-slow.yaml +++ b/ci/deps/azure-37-slow.yaml @@ -27,6 +27,7 @@ dependencies: - python-dateutil - pytz - s3fs>=0.4.0 + - moto>=1.3.14 - scipy - sqlalchemy - xlrd diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index 3e1f4ab2bdc2d..bbe991d6ad804 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -21,7 +21,7 @@ dependencies: - jinja2 - lxml - matplotlib=2.2.* - - moto + - moto>=1.3.14 - flask - numexpr - numpy=1.16.* diff --git a/ci/deps/travis-37-cov.yaml b/ci/deps/travis-37-cov.yaml index e2eb578c12f94..651003b20af07 100644 --- a/ci/deps/travis-37-cov.yaml +++ b/ci/deps/travis-37-cov.yaml @@ -23,7 +23,7 @@ dependencies: - geopandas - html5lib - matplotlib - - moto + - moto>=1.3.14 - flask - nomkl - numexpr diff --git a/ci/deps/travis-37.yaml b/ci/deps/travis-37.yaml index e896233aac63c..26d6c2910a7cc 100644 --- a/ci/deps/travis-37.yaml +++ b/ci/deps/travis-37.yaml @@ -20,8 +20,8 @@ dependencies: - pyarrow - pytz - s3fs>=0.4.0 + - moto>=1.3.14 + - flask - tabulate - pyreadstat - pip - - pip: - - moto From b13614b4e10e0ebb687dde06753b9b208993927c Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 14 Aug 2020 13:19:10 -0400 Subject: [PATCH 43/59] latest on windows env --- ci/deps/azure-windows-37.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ci/deps/azure-windows-37.yaml b/ci/deps/azure-windows-37.yaml index bbe991d6ad804..1d15ca41c0f8e 100644 --- a/ci/deps/azure-windows-37.yaml +++ b/ci/deps/azure-windows-37.yaml @@ -15,7 +15,7 @@ dependencies: # pandas dependencies - beautifulsoup4 - bottleneck - - fsspec>=0.7.4 + - fsspec>=0.8.0 - gcsfs>=0.6.0 - html5lib - jinja2 @@ -30,7 +30,7 @@ dependencies: - pytables - python-dateutil - pytz - - s3fs>=0.4.0 + - s3fs>=0.4.2 - scipy - sqlalchemy - xlrd From 4c2c1a0a2f2191218e58c7eceb23d1d4c1e50dc1 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 14 Aug 2020 14:02:57 -0400 Subject: [PATCH 44/59] Add kwargs These seem to be extra tests that weren't there before? 
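The ``s3so`` argument these tests now receive is the fixture defined in pandas/tests/io/conftest.py; it points s3fs at the local moto server instead of real AWS, which the jsonl test and the explicit-filesystem parquet test would otherwise try to reach. Rough sketch of what gets threaded through, assuming the moto server from the fixtures is up and the test files are uploaded:

    import s3fs
    from pandas import read_json

    # same dict shape as the s3so fixture; client_kwargs is handed to the
    # underlying botocore client when s3fs creates its connection
    s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"})

    # read_json forwards storage_options to fsspec/s3fs
    df = read_json("s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so)

    # the explicit-filesystem parquet test builds the filesystem itself
    fs = s3fs.S3FileSystem(**s3so)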
--- pandas/tests/io/json/test_pandas.py | 6 ++++-- pandas/tests/io/test_parquet.py | 4 ++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1ed6853c9ed28..64a666079876f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1213,10 +1213,12 @@ def test_read_inline_jsonl(self): tm.assert_frame_equal(result, expected) @td.skip_if_not_us_locale - def test_read_s3_jsonl(self, s3_resource): + def test_read_s3_jsonl(self, s3_resource, s3so): # GH17200 - result = read_json("s3n://pandas-test/items.jsonl", lines=True) + result = read_json( + "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so + ) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 43745f98d3590..b2a7739d7d4fc 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -541,9 +541,9 @@ def test_categorical(self, pa): expected = df.astype(object) check_round_trip(df, pa, expected=expected) - def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa): + def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): s3fs = pytest.importorskip("s3fs") - s3 = s3fs.S3FileSystem() + s3 = s3fs.S3FileSystem(**s3so) kw = dict(filesystem=s3) check_round_trip( df_compat, From 9c4124d84f806521cf3ebacbb1cb29ec2d5db685 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 14 Aug 2020 16:14:34 -0400 Subject: [PATCH 45/59] try in win-py38 env --- ci/deps/azure-windows-38.yaml | 4 ++++ pandas/tests/io/test_parquet.py | 1 + 2 files changed, 5 insertions(+) diff --git a/ci/deps/azure-windows-38.yaml b/ci/deps/azure-windows-38.yaml index f428a6dadfaa2..23bede5eb26f1 100644 --- a/ci/deps/azure-windows-38.yaml +++ b/ci/deps/azure-windows-38.yaml @@ -16,7 +16,10 @@ dependencies: - blosc - bottleneck - fastparquet>=0.3.2 + - flask + - fsspec>=0.8.0 - matplotlib=3.1.3 + - moto>=1.3.14 - numba - numexpr - numpy=1.18.* @@ -26,6 +29,7 @@ dependencies: - pytables - python-dateutil - pytz + - s3fs>=0.4.0 - scipy - xlrd - xlsxwriter diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index b2a7739d7d4fc..3f94af65449c7 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -568,6 +568,7 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): expected_df = df_compat.copy() if partition_col: expected_df[partition_col] = expected_df[partition_col].astype("category") + check_round_trip( df_compat, pa, From 09a8e8eb29ef4df56c857a6e73456f7482bd5235 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 18 Aug 2020 11:51:49 -0400 Subject: [PATCH 46/59] Env only Revert all code changes, to see if environments only are responsibble for failures --- pandas/_libs/tslibs/parsing.pyx | 3 +- pandas/_testing.py | 10 +- pandas/core/generic.py | 2 +- pandas/core/groupby/generic.py | 26 +---- pandas/core/internals/managers.py | 32 ++++++ pandas/io/common.py | 5 +- pandas/io/excel/_base.py | 30 ++---- pandas/io/excel/_odfreader.py | 14 +-- pandas/io/excel/_openpyxl.py | 12 +-- pandas/io/excel/_pyxlsb.py | 14 +-- pandas/io/excel/_xlrd.py | 7 +- pandas/io/feather_format.py | 7 +- pandas/io/parsers.py | 4 +- .../tests/frame/methods/test_interpolate.py | 1 + pandas/tests/io/conftest.py | 41 +------ pandas/tests/io/excel/test_readers.py | 5 +- 
pandas/tests/io/json/test_compression.py | 6 +- pandas/tests/io/json/test_pandas.py | 13 +-- pandas/tests/io/parser/test_common.py | 1 + pandas/tests/io/parser/test_network.py | 100 ++++++++---------- pandas/tests/io/test_fsspec.py | 29 ++--- pandas/tests/io/test_parquet.py | 9 +- pandas/tests/io/test_sql.py | 5 +- .../util/test_assert_extension_array_equal.py | 9 ++ pandas/tests/util/test_assert_frame_equal.py | 8 ++ pandas/tests/util/test_assert_series_equal.py | 8 ++ 26 files changed, 169 insertions(+), 232 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 8429aebbd85b8..7478179df3b75 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -381,7 +381,8 @@ cdef inline object _parse_dateabbr_string(object date_string, datetime default, object freq): cdef: object ret - int year, quarter = -1, month, mnum, date_len + # year initialized to prevent compiler warnings + int year = -1, quarter = -1, month, mnum, date_len # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 assert isinstance(date_string, str) diff --git a/pandas/_testing.py b/pandas/_testing.py index 713f29466f097..ef6232fa6d575 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1377,12 +1377,18 @@ def assert_series_equal( ) elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype): assert_extension_array_equal( - left._values, right._values, index_values=np.asarray(left.index) + left._values, + right._values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), ) elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( - left._values, right._values, index_values=np.asarray(left.index) + left._values, + right._values, + check_dtype=check_dtype, + index_values=np.asarray(left.index), ) else: _testing.assert_almost_equal( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 9cbe2f714fd57..fe412bc0ce937 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6893,7 +6893,7 @@ def interpolate( obj = self.T if should_transpose else self if obj.empty: - return self + return self.copy() if method not in fillna_methods: axis = self._info_axis_number diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index b806d9856d20f..166631e69f523 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1031,7 +1031,6 @@ def _cython_agg_blocks( data = data.get_numeric_data(copy=False) agg_blocks: List["Block"] = [] - deleted_items: List[np.ndarray] = [] no_result = object() @@ -1111,6 +1110,7 @@ def blk_func(block: "Block") -> List["Block"]: assert len(locs) == result.shape[1] for i, loc in enumerate(locs): agg_block = result.iloc[:, [i]]._mgr.blocks[0] + agg_block.mgr_locs = [loc] new_blocks.append(agg_block) else: result = result._mgr.blocks[0].values @@ -1124,7 +1124,6 @@ def blk_func(block: "Block") -> List["Block"]: return new_blocks skipped: List[int] = [] - new_items: List[np.ndarray] = [] for i, block in enumerate(data.blocks): try: nbs = blk_func(block) @@ -1133,36 +1132,15 @@ def blk_func(block: "Block") -> List["Block"]: # continue and exclude the block # NotImplementedError -> "ohlc" with wrong dtype skipped.append(i) - deleted_items.append(block.mgr_locs.as_array) else: agg_blocks.extend(nbs) - new_items.append(block.mgr_locs.as_array) if not agg_blocks: raise DataError("No numeric types to aggregate") # reset the locs in the blocks to 
correspond to our # current ordering - indexer = np.concatenate(new_items) - agg_items = data.items.take(np.sort(indexer)) - - if deleted_items: - - # we need to adjust the indexer to account for the - # items we have removed - # really should be done in internals :< - - deleted = np.concatenate(deleted_items) - ai = np.arange(len(data)) - mask = np.zeros(len(data)) - mask[deleted] = 1 - indexer = (ai - mask.cumsum())[indexer] - - offset = 0 - for blk in agg_blocks: - loc = len(blk.mgr_locs) - blk.mgr_locs = indexer[offset : (offset + loc)] - offset += loc + agg_items = data.reset_dropped_locs(agg_blocks, skipped) return agg_blocks, agg_items diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 5a215c4cd5fa3..f05d4cf1c4be6 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1504,6 +1504,38 @@ def unstack(self, unstacker, fill_value) -> "BlockManager": bm = BlockManager(new_blocks, [new_columns, new_index]) return bm + def reset_dropped_locs(self, blocks: List[Block], skipped: List[int]) -> Index: + """ + Decrement the mgr_locs of the given blocks with `skipped` removed. + + Notes + ----- + Alters each block's mgr_locs inplace. + """ + ncols = len(self) + + new_locs = [blk.mgr_locs.as_array for blk in blocks] + indexer = np.concatenate(new_locs) + + new_items = self.items.take(np.sort(indexer)) + + if skipped: + # we need to adjust the indexer to account for the + # items we have removed + deleted_items = [self.blocks[i].mgr_locs.as_array for i in skipped] + deleted = np.concatenate(deleted_items) + ai = np.arange(ncols) + mask = np.zeros(ncols) + mask[deleted] = 1 + indexer = (ai - mask.cumsum())[indexer] + + offset = 0 + for blk in blocks: + loc = len(blk.mgr_locs) + blk.mgr_locs = indexer[offset : (offset + loc)] + offset += loc + return new_items + class SingleBlockManager(BlockManager): """ manage a single block with """ diff --git a/pandas/io/common.py b/pandas/io/common.py index 54f35e689aac8..d1305c9cabe0e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -18,6 +18,7 @@ Optional, Tuple, Type, + Union, ) from urllib.parse import ( urljoin, @@ -452,7 +453,7 @@ def get_handle( except ImportError: need_text_wrapping = (BufferedIOBase, RawIOBase) - handles: List[IO] = list() + handles: List[Union[IO, _MMapWrapper]] = list() f = path_or_buf # Convert pathlib.Path/py.path.local or string @@ -535,6 +536,8 @@ def get_handle( try: wrapped = _MMapWrapper(f) f.close() + handles.remove(f) + handles.append(wrapped) f = wrapped except Exception: # we catch any errors that may have occurred diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index aaef71910c9ab..b1bbda4a4b7e0 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,12 +3,11 @@ from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Any, Mapping, Union +from typing import Union from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES -from pandas._typing import StorageOptions from pandas.errors import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -200,15 +199,6 @@ Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. -storage_options : StorageOptions - Extra options that make sense for a particular storage connection, e.g. 
- host, port, username, password, etc., if using a URL that will - be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error - will be raised if providing this argument with a local path or - a file-like buffer. See the fsspec and backend storage implementation - docs for the set of allowed keys and values - - .. versionadded:: 1.2.0 Returns ------- @@ -308,11 +298,10 @@ def read_excel( skipfooter=0, convert_float=True, mangle_dupe_cols=True, - storage_options: StorageOptions = None, ): if not isinstance(io, ExcelFile): - io = ExcelFile(io, storage_options=storage_options, engine=engine) + io = ExcelFile(io, engine=engine) elif engine and engine != io.engine: raise ValueError( "Engine should not be specified when passing " @@ -347,14 +336,12 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): + def __init__(self, filepath_or_buffer): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer( - filepath_or_buffer, storage_options=storage_options - ) + filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer @@ -850,16 +837,14 @@ class ExcelFile: from pandas.io.excel._pyxlsb import _PyxlsbReader from pandas.io.excel._xlrd import _XlrdReader - _engines: Mapping[str, Any] = { + _engines = { "xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader, "pyxlsb": _PyxlsbReader, } - def __init__( - self, path_or_buffer, engine=None, storage_options: StorageOptions = None - ): + def __init__(self, path_or_buffer, engine=None): if engine is None: engine = "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): @@ -873,14 +858,13 @@ def __init__( raise ValueError(f"Unknown engine: {engine}") self.engine = engine - self.storage_options = storage_options # Could be a str, ExcelFile, Book, etc. self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) - self._reader = self._engines[engine](self._io, storage_options=storage_options) + self._reader = self._engines[engine](self._io) def __fspath__(self): return self._io diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index a6cd8f524503b..44abaf5d3b3c9 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -2,7 +2,7 @@ import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions +from pandas._typing import FilePathOrBuffer, Scalar from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -16,19 +16,13 @@ class _ODFReader(_BaseExcelReader): Parameters ---------- - filepath_or_buffer : string, path to be parsed or + filepath_or_buffer: string, path to be parsed or an open readable stream. 
- storage_options : StorageOptions - passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ - def __init__( - self, - filepath_or_buffer: FilePathOrBuffer, - storage_options: StorageOptions = None, - ): + def __init__(self, filepath_or_buffer: FilePathOrBuffer): import_optional_dependency("odf") - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__(filepath_or_buffer) @property def _workbook_class(self): diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 73239190604db..03a30cbd62f9a 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -2,7 +2,7 @@ import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions +from pandas._typing import FilePathOrBuffer, Scalar from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ExcelWriter, _BaseExcelReader @@ -467,11 +467,7 @@ def write_cells( class _OpenpyxlReader(_BaseExcelReader): - def __init__( - self, - filepath_or_buffer: FilePathOrBuffer, - storage_options: StorageOptions = None, - ) -> None: + def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: """ Reader using openpyxl engine. @@ -479,11 +475,9 @@ def __init__( ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. - storage_options : StorageOptions - passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("openpyxl") - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__(filepath_or_buffer) @property def _workbook_class(self): diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index c0e281ff6c2da..0d96c8c4acdb8 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,31 +1,25 @@ from typing import List -from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions +from pandas._typing import FilePathOrBuffer, Scalar from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader class _PyxlsbReader(_BaseExcelReader): - def __init__( - self, - filepath_or_buffer: FilePathOrBuffer, - storage_options: StorageOptions = None, - ): + def __init__(self, filepath_or_buffer: FilePathOrBuffer): """ Reader using pyxlsb engine. Parameters ---------- - filepath_or_buffer : str, path object, or Workbook + filepath_or_buffer: str, path object, or Workbook Object to be parsed. - storage_options : StorageOptions - passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("pyxlsb") # This will call load_workbook on the filepath or buffer # And set the result to the book-attribute - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__(filepath_or_buffer) @property def _workbook_class(self): diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index ff1b3c8bdb964..af82c15fd6b66 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -2,14 +2,13 @@ import numpy as np -from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader class _XlrdReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): + def __init__(self, filepath_or_buffer): """ Reader using xlrd engine. 
@@ -17,12 +16,10 @@ def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. - storage_options : StorageOptions - passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ err_msg = "Install xlrd >= 1.0.0 for Excel support" import_optional_dependency("xlrd", extra=err_msg) - super().__init__(filepath_or_buffer, storage_options=storage_options) + super().__init__(filepath_or_buffer) @property def _workbook_class(self): diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 2d86fa44f22a4..2c664e73b9463 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -1,6 +1,5 @@ """ feather-format compat """ -from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas import DataFrame, Int64Index, RangeIndex @@ -8,7 +7,7 @@ from pandas.io.common import get_filepath_or_buffer -def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kwargs): +def to_feather(df: DataFrame, path, storage_options=None, **kwargs): """ Write a DataFrame to the binary Feather format. @@ -78,9 +77,7 @@ def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kw feather.write_feather(df, path, **kwargs) -def read_feather( - path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None -): +def read_feather(path, columns=None, use_threads: bool = True, storage_options=None): """ Load a feather-format object from the file path. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 983aa56324083..5d49757ce7d58 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,7 +20,7 @@ import pandas._libs.parsers as parsers from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing -from pandas._typing import FilePathOrBuffer, StorageOptions, Union +from pandas._typing import FilePathOrBuffer, Union from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -596,7 +596,7 @@ def read_csv( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, - storage_options: StorageOptions = None, + storage_options=None, ): # gh-23761 # diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 3c9d79397e4bd..6b86a13fcf1b9 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -38,6 +38,7 @@ def test_interp_empty(self): # https://github.com/pandas-dev/pandas/issues/35598 df = DataFrame() result = df.interpolate() + assert result is not df expected = df tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 8bd403f222af7..fcee25c258efa 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,7 +1,4 @@ import os -import shlex -import subprocess -import time import pytest @@ -33,11 +30,6 @@ def feather_file(datapath): return datapath("io", "data", "feather", "feather-0_3_1.feather") -@pytest.fixture -def s3so(): - return dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) - - @pytest.fixture def s3_resource(tips_file, jsonl_file, feather_file): """ @@ -56,7 +48,6 @@ def s3_resource(tips_file, jsonl_file, feather_file): """ s3fs = pytest.importorskip("s3fs") boto3 = pytest.importorskip("boto3") - requests = pytest.importorskip("requests") with 
tm.ensure_safe_environment_variables(): # temporary workaround as moto fails for botocore >= 1.11 otherwise, @@ -64,8 +55,7 @@ def s3_resource(tips_file, jsonl_file, feather_file): os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") - pytest.importorskip("moto", minversion="1.3.14") - pytest.importorskip("flask") # server mode needs flask too + moto = pytest.importorskip("moto") test_s3_files = [ ("tips#1.csv", tips_file), @@ -82,31 +72,12 @@ def add_tips_files(bucket_name): conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) try: - # Launching moto in server mode, i.e., as a separate process - # with an S3 endpoint on localhost - - endpoint_uri = "http://127.0.0.1:5555/" - - # pipe to null to avoid logging in terminal - proc = subprocess.Popen( - shlex.split("moto_server s3 -p 5555"), stdout=subprocess.DEVNULL - ) - - timeout = 5 - while timeout > 0: - try: - # OK to go once server is accepting connections - r = requests.get(endpoint_uri) - if r.ok: - break - except Exception: - pass - timeout -= 0.1 - time.sleep(0.1) + s3 = moto.mock_s3() + s3.start() # see gh-16135 bucket = "pandas-test" - conn = boto3.resource("s3", endpoint_url=endpoint_uri) + conn = boto3.resource("s3", region_name="us-east-1") conn.create_bucket(Bucket=bucket) add_tips_files(bucket) @@ -116,6 +87,4 @@ def add_tips_files(bucket_name): s3fs.S3FileSystem.clear_instance_cache() yield conn finally: - # shut down external process - proc.terminate() - proc.wait() + s3.stop() diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 431a50477fccc..51fbbf836a03f 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -606,14 +606,13 @@ def test_read_from_http_url(self, read_ext): tm.assert_frame_equal(url_table, local_table) @td.skip_if_not_us_locale - def test_read_from_s3_url(self, read_ext, s3_resource, s3so): + def test_read_from_s3_url(self, read_ext, s3_resource): # Bucket "pandas-test" created in tests/io/conftest.py with open("test1" + read_ext, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) url = "s3://pandas-test/test1" + read_ext - - url_table = pd.read_excel(url, storage_options=s3so) + url_table = pd.read_excel(url) local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 5bb205842269e..182c21ed1d416 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -44,11 +44,7 @@ def test_with_s3_url(compression, s3_resource): with open(path, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) - roundtripped_df = pd.read_json( - "s3://pandas-test/test-1", - compression=compression, - storage_options=dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}), - ) + roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression) tm.assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 64a666079876f..1280d0fd434d5 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1213,12 +1213,10 @@ def test_read_inline_jsonl(self): tm.assert_frame_equal(result, expected) @td.skip_if_not_us_locale - def test_read_s3_jsonl(self, s3_resource, s3so): + def test_read_s3_jsonl(self, s3_resource): # 
GH17200 - result = read_json( - "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so - ) + result = read_json("s3n://pandas-test/items.jsonl", lines=True) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -1708,12 +1706,7 @@ def test_to_s3(self, s3_resource): # GH 28375 mock_bucket_name, target_file = "pandas-test", "test.json" df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) - df.to_json( - f"s3://{mock_bucket_name}/{target_file}", - storage_options=dict( - client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"} - ), - ) + df.to_json(f"s3://{mock_bucket_name}/{target_file}") timeout = 5 while True: if target_file in ( diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 3d5f6ae3a4af9..1d8d5a29686a4 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1836,6 +1836,7 @@ def test_raise_on_no_columns(all_parsers, nrows): parser.read_csv(StringIO(data)) +@td.check_file_leaks def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") parser = all_parsers diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index 5a08216f01ce4..b30a7b1ef34de 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -46,6 +46,21 @@ def check_compressed_urls(salaries_table, compression, extension, mode, engine): tm.assert_frame_equal(url_table, salaries_table) +@tm.network("https://raw.githubusercontent.com/", check_before_test=True) +def test_url_encoding_csv(): + """ + read_csv should honor the requested encoding for URLs. + + GH 10424 + """ + path = ( + "https://raw.githubusercontent.com/pandas-dev/pandas/master/" + + "pandas/tests/io/parser/data/unicode_series.csv" + ) + df = read_csv(path, encoding="latin-1", header=None) + assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)" + + @pytest.fixture def tips_df(datapath): """DataFrame with the tips dataset.""" @@ -56,62 +71,50 @@ def tips_df(datapath): @td.skip_if_not_us_locale() class TestS3: @td.skip_if_no("s3fs") - def test_parse_public_s3_bucket(self, tips_df, s3so): + def test_parse_public_s3_bucket(self, tips_df): # more of an integration test due to the not-public contents portion # can probably mock this though. 
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv( - "s3://pandas-test/tips.csv" + ext, - compression=comp, - storage_options=s3so, - ) + df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents - df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3so) + df = read_csv("s3://cant_get_it/tips.csv") assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3n_bucket(self, tips_df, s3so): + def test_parse_public_s3n_bucket(self, tips_df): # Read from AWS s3 as "s3n" URL - df = read_csv("s3n://pandas-test/tips.csv", nrows=10, storage_options=s3so) + df = read_csv("s3n://pandas-test/tips.csv", nrows=10) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3a_bucket(self, tips_df, s3so): + def test_parse_public_s3a_bucket(self, tips_df): # Read from AWS s3 as "s3a" URL - df = read_csv("s3a://pandas-test/tips.csv", nrows=10, storage_options=s3so) + df = read_csv("s3a://pandas-test/tips.csv", nrows=10) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self, tips_df, s3so): + def test_parse_public_s3_bucket_nrows(self, tips_df): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv( - "s3://pandas-test/tips.csv" + ext, - nrows=10, - compression=comp, - storage_options=s3so, - ) + df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self, tips_df, s3so): + def test_parse_public_s3_bucket_chunked(self, tips_df): # Read with a chunksize chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df_reader = read_csv( - "s3://pandas-test/tips.csv" + ext, - chunksize=chunksize, - compression=comp, - storage_options=s3so, + "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -123,7 +126,7 @@ def test_parse_public_s3_bucket_chunked(self, tips_df, s3so): true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so): + def test_parse_public_s3_bucket_chunked_python(self, tips_df): # Read with a chunksize using the Python parser chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: @@ -132,7 +135,6 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so): chunksize=chunksize, compression=comp, engine="python", - storage_options=s3so, ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -143,53 +145,46 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so): true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self, tips_df, s3so): + def test_parse_public_s3_bucket_python(self, tips_df): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, - engine="python", - compression=comp, - storage_options=s3so, + "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp ) 
assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_infer_s3_compression(self, tips_df, s3so): + def test_infer_s3_compression(self, tips_df): for ext in ["", ".gz", ".bz2"]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, - engine="python", - compression="infer", - storage_options=s3so, + "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer" ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3_bucket_nrows_python(self, tips_df, s3so): + def test_parse_public_s3_bucket_nrows_python(self, tips_df): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( "s3://pandas-test/tips.csv" + ext, engine="python", nrows=10, compression=comp, - storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_read_s3_fails(self, s3so): + def test_read_s3_fails(self): with pytest.raises(IOError): - read_csv("s3://nyqpug/asdf.csv", storage_options=s3so) + read_csv("s3://nyqpug/asdf.csv") # Receive a permission error when trying to read a private bucket. # It's irrelevant here that this isn't actually a table. with pytest.raises(IOError): read_csv("s3://cant_get_it/file.csv") - def test_write_s3_csv_fails(self, tips_df, s3so): + def test_write_s3_csv_fails(self, tips_df): # GH 32486 # Attempting to write to an invalid S3 path should raise import botocore @@ -200,12 +195,10 @@ def test_write_s3_csv_fails(self, tips_df, s3so): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_csv( - "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so - ) + tips_df.to_csv("s3://an_s3_bucket_data_doesnt_exit/not_real.csv") @td.skip_if_no("pyarrow") - def test_write_s3_parquet_fails(self, tips_df, s3so): + def test_write_s3_parquet_fails(self, tips_df): # GH 27679 # Attempting to write to an invalid S3 path should raise import botocore @@ -216,10 +209,7 @@ def test_write_s3_parquet_fails(self, tips_df, s3so): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_parquet( - "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet", - storage_options=s3so, - ) + tips_df.to_parquet("s3://an_s3_bucket_data_doesnt_exit/not_real.parquet") def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): # see gh-16135 @@ -235,7 +225,7 @@ def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): expected = read_csv(tips_file) tm.assert_frame_equal(result, expected) - def test_read_csv_chunked_download(self, s3_resource, caplog, s3so): + def test_read_csv_chunked_download(self, s3_resource, caplog): # 8 MB, S3FS usees 5MB chunks import s3fs @@ -255,20 +245,18 @@ def test_read_csv_chunked_download(self, s3_resource, caplog, s3so): s3fs.S3FileSystem.clear_instance_cache() with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv("s3://pandas-test/large-file.csv", nrows=5, storage_options=s3so) + read_csv("s3://pandas-test/large-file.csv", nrows=5) # log of fetch_range (start, stop) assert (0, 5505024) in (x.args[-2:] for x in caplog.records) - def test_read_s3_with_hash_in_key(self, tips_df, s3so): + def test_read_s3_with_hash_in_key(self, tips_df): # GH 25945 - result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so) + result = read_csv("s3://pandas-test/tips#1.csv") 
tm.assert_frame_equal(tips_df, result) @td.skip_if_no("pyarrow") - def test_read_feather_s3_file_path(self, feather_file, s3so): + def test_read_feather_s3_file_path(self, feather_file): # GH 29055 expected = read_feather(feather_file) - res = read_feather( - "s3://pandas-test/simple_dataset.feather", storage_options=s3so - ) + res = read_feather("s3://pandas-test/simple_dataset.feather") tm.assert_frame_equal(expected, res) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 666da677d702e..3e89f6ca4ae16 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -131,38 +131,27 @@ def test_fastparquet_options(fsspectest): @td.skip_if_no("s3fs") -def test_from_s3_csv(s3_resource, tips_file, s3so): - tm.assert_equal( - read_csv("s3://pandas-test/tips.csv", storage_options=s3so), read_csv(tips_file) - ) +def test_from_s3_csv(s3_resource, tips_file): + tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file)) # the following are decompressed by pandas, not fsspec - tm.assert_equal( - read_csv("s3://pandas-test/tips.csv.gz", storage_options=s3so), - read_csv(tips_file), - ) - tm.assert_equal( - read_csv("s3://pandas-test/tips.csv.bz2", storage_options=s3so), - read_csv(tips_file), - ) + tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz"), read_csv(tips_file)) + tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2"), read_csv(tips_file)) @pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"]) @td.skip_if_no("s3fs") -def test_s3_protocols(s3_resource, tips_file, protocol, s3so): +def test_s3_protocols(s3_resource, tips_file, protocol): tm.assert_equal( - read_csv("%s://pandas-test/tips.csv" % protocol, storage_options=s3so), - read_csv(tips_file), + read_csv("%s://pandas-test/tips.csv" % protocol), read_csv(tips_file) ) @td.skip_if_no("s3fs") @td.skip_if_no("fastparquet") -def test_s3_parquet(s3_resource, s3so): +def test_s3_parquet(s3_resource): fn = "s3://pandas-test/test.parquet" - df1.to_parquet( - fn, index=False, engine="fastparquet", compression=None, storage_options=s3so - ) - df2 = read_parquet(fn, engine="fastparquet", storage_options=s3so) + df1.to_parquet(fn, index=False, engine="fastparquet", compression=None) + df2 = read_parquet(fn, engine="fastparquet") tm.assert_equal(df1, df2) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 3f94af65449c7..82157f3d722a9 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -158,10 +158,6 @@ def check_round_trip( """ write_kwargs = write_kwargs or {"compression": None} read_kwargs = read_kwargs or {} - if isinstance(path, str) and "s3://" in path: - s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) - read_kwargs["storage_options"] = s3so - write_kwargs["storage_options"] = s3so if expected is None: expected = df @@ -541,9 +537,9 @@ def test_categorical(self, pa): expected = df.astype(object) check_round_trip(df, pa, expected=expected) - def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): + def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa): s3fs = pytest.importorskip("s3fs") - s3 = s3fs.S3FileSystem(**s3so) + s3 = s3fs.S3FileSystem() kw = dict(filesystem=s3) check_round_trip( df_compat, @@ -568,7 +564,6 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): expected_df = df_compat.copy() if partition_col: expected_df[partition_col] = expected_df[partition_col].astype("category") - check_round_trip( df_compat, 
pa, diff --git a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index 29b787d39c09d..a7e3162ed7b73 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -263,7 +263,8 @@ def _get_all_tables(self): return table_list def _close_conn(self): - pass + # https://docs.sqlalchemy.org/en/13/core/connections.html#engine-disposal + self.conn.dispose() class PandasSQLTest: @@ -1242,7 +1243,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): def setup_class(cls): cls.setup_import() cls.setup_driver() - conn = cls.connect() + conn = cls.conn = cls.connect() conn.connect() def load_test_data_and_sql(self): diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index d9fdf1491c328..f9259beab5d13 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from pandas import array import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray @@ -102,3 +103,11 @@ def test_assert_extension_array_equal_non_extension_array(side): with pytest.raises(AssertionError, match=msg): tm.assert_extension_array_equal(*args) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_extension_array_equal_ignore_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = array([1, 2, 3], dtype="Int64") + right = array([1, 2, 3], dtype=right_dtype) + tm.assert_extension_array_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index fe3e1ff906919..3aa3c64923b14 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -260,3 +260,11 @@ def test_assert_frame_equal_interval_dtype_mismatch(): with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(left, right, check_dtype=True) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64") + right = pd.DataFrame({"a": [1, 2, 3]}, dtype=right_dtype) + tm.assert_frame_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index a7b5aeac560e4..f3c66052b1904 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -296,3 +296,11 @@ def test_series_equal_exact_for_nonnumeric(): tm.assert_series_equal(s1, s3, check_exact=True) with pytest.raises(AssertionError): tm.assert_series_equal(s3, s1, check_exact=True) + + +@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) +def test_assert_series_equal_ignore_extension_dtype_mismatch(right_dtype): + # https://github.com/pandas-dev/pandas/issues/35715 + left = pd.Series([1, 2, 3], dtype="Int64") + right = pd.Series([1, 2, 3], dtype=right_dtype) + tm.assert_series_equal(left, right, check_dtype=False) From 51bb02b116d547c6d28141efb1561e2857c1f8ad Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 18 Aug 2020 14:51:08 -0400 Subject: [PATCH 47/59] Revert - reintroduce code --- pandas/_libs/tslibs/parsing.pyx | 3 +- pandas/_testing.py | 10 +- pandas/core/generic.py | 2 +- pandas/core/groupby/generic.py | 26 ++++- 
pandas/core/internals/managers.py | 32 ------ pandas/io/common.py | 5 +- pandas/io/excel/_base.py | 30 ++++-- pandas/io/excel/_odfreader.py | 14 ++- pandas/io/excel/_openpyxl.py | 12 ++- pandas/io/excel/_pyxlsb.py | 14 ++- pandas/io/excel/_xlrd.py | 7 +- pandas/io/feather_format.py | 7 +- pandas/io/parsers.py | 4 +- .../tests/frame/methods/test_interpolate.py | 1 - pandas/tests/io/conftest.py | 41 ++++++- pandas/tests/io/excel/test_readers.py | 5 +- pandas/tests/io/json/test_compression.py | 6 +- pandas/tests/io/json/test_pandas.py | 13 ++- pandas/tests/io/parser/test_common.py | 1 - pandas/tests/io/parser/test_network.py | 100 ++++++++++-------- pandas/tests/io/test_fsspec.py | 29 +++-- pandas/tests/io/test_parquet.py | 9 +- pandas/tests/io/test_sql.py | 5 +- .../util/test_assert_extension_array_equal.py | 9 -- pandas/tests/util/test_assert_frame_equal.py | 8 -- pandas/tests/util/test_assert_series_equal.py | 8 -- 26 files changed, 232 insertions(+), 169 deletions(-) diff --git a/pandas/_libs/tslibs/parsing.pyx b/pandas/_libs/tslibs/parsing.pyx index 7478179df3b75..8429aebbd85b8 100644 --- a/pandas/_libs/tslibs/parsing.pyx +++ b/pandas/_libs/tslibs/parsing.pyx @@ -381,8 +381,7 @@ cdef inline object _parse_dateabbr_string(object date_string, datetime default, object freq): cdef: object ret - # year initialized to prevent compiler warnings - int year = -1, quarter = -1, month, mnum, date_len + int year, quarter = -1, month, mnum, date_len # special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1 assert isinstance(date_string, str) diff --git a/pandas/_testing.py b/pandas/_testing.py index ef6232fa6d575..713f29466f097 100644 --- a/pandas/_testing.py +++ b/pandas/_testing.py @@ -1377,18 +1377,12 @@ def assert_series_equal( ) elif is_extension_array_dtype(left.dtype) and is_extension_array_dtype(right.dtype): assert_extension_array_equal( - left._values, - right._values, - check_dtype=check_dtype, - index_values=np.asarray(left.index), + left._values, right._values, index_values=np.asarray(left.index) ) elif needs_i8_conversion(left.dtype) or needs_i8_conversion(right.dtype): # DatetimeArray or TimedeltaArray assert_extension_array_equal( - left._values, - right._values, - check_dtype=check_dtype, - index_values=np.asarray(left.index), + left._values, right._values, index_values=np.asarray(left.index) ) else: _testing.assert_almost_equal( diff --git a/pandas/core/generic.py b/pandas/core/generic.py index fe412bc0ce937..9cbe2f714fd57 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -6893,7 +6893,7 @@ def interpolate( obj = self.T if should_transpose else self if obj.empty: - return self.copy() + return self if method not in fillna_methods: axis = self._info_axis_number diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 166631e69f523..b806d9856d20f 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -1031,6 +1031,7 @@ def _cython_agg_blocks( data = data.get_numeric_data(copy=False) agg_blocks: List["Block"] = [] + deleted_items: List[np.ndarray] = [] no_result = object() @@ -1110,7 +1111,6 @@ def blk_func(block: "Block") -> List["Block"]: assert len(locs) == result.shape[1] for i, loc in enumerate(locs): agg_block = result.iloc[:, [i]]._mgr.blocks[0] - agg_block.mgr_locs = [loc] new_blocks.append(agg_block) else: result = result._mgr.blocks[0].values @@ -1124,6 +1124,7 @@ def blk_func(block: "Block") -> List["Block"]: return new_blocks skipped: List[int] = [] + new_items: List[np.ndarray] = [] for i, 
block in enumerate(data.blocks): try: nbs = blk_func(block) @@ -1132,15 +1133,36 @@ def blk_func(block: "Block") -> List["Block"]: # continue and exclude the block # NotImplementedError -> "ohlc" with wrong dtype skipped.append(i) + deleted_items.append(block.mgr_locs.as_array) else: agg_blocks.extend(nbs) + new_items.append(block.mgr_locs.as_array) if not agg_blocks: raise DataError("No numeric types to aggregate") # reset the locs in the blocks to correspond to our # current ordering - agg_items = data.reset_dropped_locs(agg_blocks, skipped) + indexer = np.concatenate(new_items) + agg_items = data.items.take(np.sort(indexer)) + + if deleted_items: + + # we need to adjust the indexer to account for the + # items we have removed + # really should be done in internals :< + + deleted = np.concatenate(deleted_items) + ai = np.arange(len(data)) + mask = np.zeros(len(data)) + mask[deleted] = 1 + indexer = (ai - mask.cumsum())[indexer] + + offset = 0 + for blk in agg_blocks: + loc = len(blk.mgr_locs) + blk.mgr_locs = indexer[offset : (offset + loc)] + offset += loc return agg_blocks, agg_items diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index f05d4cf1c4be6..5a215c4cd5fa3 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -1504,38 +1504,6 @@ def unstack(self, unstacker, fill_value) -> "BlockManager": bm = BlockManager(new_blocks, [new_columns, new_index]) return bm - def reset_dropped_locs(self, blocks: List[Block], skipped: List[int]) -> Index: - """ - Decrement the mgr_locs of the given blocks with `skipped` removed. - - Notes - ----- - Alters each block's mgr_locs inplace. - """ - ncols = len(self) - - new_locs = [blk.mgr_locs.as_array for blk in blocks] - indexer = np.concatenate(new_locs) - - new_items = self.items.take(np.sort(indexer)) - - if skipped: - # we need to adjust the indexer to account for the - # items we have removed - deleted_items = [self.blocks[i].mgr_locs.as_array for i in skipped] - deleted = np.concatenate(deleted_items) - ai = np.arange(ncols) - mask = np.zeros(ncols) - mask[deleted] = 1 - indexer = (ai - mask.cumsum())[indexer] - - offset = 0 - for blk in blocks: - loc = len(blk.mgr_locs) - blk.mgr_locs = indexer[offset : (offset + loc)] - offset += loc - return new_items - class SingleBlockManager(BlockManager): """ manage a single block with """ diff --git a/pandas/io/common.py b/pandas/io/common.py index d1305c9cabe0e..54f35e689aac8 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -18,7 +18,6 @@ Optional, Tuple, Type, - Union, ) from urllib.parse import ( urljoin, @@ -453,7 +452,7 @@ def get_handle( except ImportError: need_text_wrapping = (BufferedIOBase, RawIOBase) - handles: List[Union[IO, _MMapWrapper]] = list() + handles: List[IO] = list() f = path_or_buf # Convert pathlib.Path/py.path.local or string @@ -536,8 +535,6 @@ def get_handle( try: wrapped = _MMapWrapper(f) f.close() - handles.remove(f) - handles.append(wrapped) f = wrapped except Exception: # we catch any errors that may have occurred diff --git a/pandas/io/excel/_base.py b/pandas/io/excel/_base.py index b1bbda4a4b7e0..aaef71910c9ab 100644 --- a/pandas/io/excel/_base.py +++ b/pandas/io/excel/_base.py @@ -3,11 +3,12 @@ from io import BufferedIOBase, BytesIO, RawIOBase import os from textwrap import fill -from typing import Union +from typing import Any, Mapping, Union from pandas._config import config from pandas._libs.parsers import STR_NA_VALUES +from pandas._typing import StorageOptions from pandas.errors 
import EmptyDataError from pandas.util._decorators import Appender, deprecate_nonkeyword_arguments @@ -199,6 +200,15 @@ Duplicate columns will be specified as 'X', 'X.1', ...'X.N', rather than 'X'...'X'. Passing in False will cause data to be overwritten if there are duplicate names in the columns. +storage_options : StorageOptions + Extra options that make sense for a particular storage connection, e.g. + host, port, username, password, etc., if using a URL that will + be parsed by ``fsspec``, e.g., starting "s3://", "gcs://". An error + will be raised if providing this argument with a local path or + a file-like buffer. See the fsspec and backend storage implementation + docs for the set of allowed keys and values + + .. versionadded:: 1.2.0 Returns ------- @@ -298,10 +308,11 @@ def read_excel( skipfooter=0, convert_float=True, mangle_dupe_cols=True, + storage_options: StorageOptions = None, ): if not isinstance(io, ExcelFile): - io = ExcelFile(io, engine=engine) + io = ExcelFile(io, storage_options=storage_options, engine=engine) elif engine and engine != io.engine: raise ValueError( "Engine should not be specified when passing " @@ -336,12 +347,14 @@ def read_excel( class _BaseExcelReader(metaclass=abc.ABCMeta): - def __init__(self, filepath_or_buffer): + def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): # If filepath_or_buffer is a url, load the data into a BytesIO if is_url(filepath_or_buffer): filepath_or_buffer = BytesIO(urlopen(filepath_or_buffer).read()) elif not isinstance(filepath_or_buffer, (ExcelFile, self._workbook_class)): - filepath_or_buffer, _, _, _ = get_filepath_or_buffer(filepath_or_buffer) + filepath_or_buffer, _, _, _ = get_filepath_or_buffer( + filepath_or_buffer, storage_options=storage_options + ) if isinstance(filepath_or_buffer, self._workbook_class): self.book = filepath_or_buffer @@ -837,14 +850,16 @@ class ExcelFile: from pandas.io.excel._pyxlsb import _PyxlsbReader from pandas.io.excel._xlrd import _XlrdReader - _engines = { + _engines: Mapping[str, Any] = { "xlrd": _XlrdReader, "openpyxl": _OpenpyxlReader, "odf": _ODFReader, "pyxlsb": _PyxlsbReader, } - def __init__(self, path_or_buffer, engine=None): + def __init__( + self, path_or_buffer, engine=None, storage_options: StorageOptions = None + ): if engine is None: engine = "xlrd" if isinstance(path_or_buffer, (BufferedIOBase, RawIOBase)): @@ -858,13 +873,14 @@ def __init__(self, path_or_buffer, engine=None): raise ValueError(f"Unknown engine: {engine}") self.engine = engine + self.storage_options = storage_options # Could be a str, ExcelFile, Book, etc. self.io = path_or_buffer # Always a string self._io = stringify_path(path_or_buffer) - self._reader = self._engines[engine](self._io) + self._reader = self._engines[engine](self._io, storage_options=storage_options) def __fspath__(self): return self._io diff --git a/pandas/io/excel/_odfreader.py b/pandas/io/excel/_odfreader.py index 44abaf5d3b3c9..a6cd8f524503b 100644 --- a/pandas/io/excel/_odfreader.py +++ b/pandas/io/excel/_odfreader.py @@ -2,7 +2,7 @@ import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency import pandas as pd @@ -16,13 +16,19 @@ class _ODFReader(_BaseExcelReader): Parameters ---------- - filepath_or_buffer: string, path to be parsed or + filepath_or_buffer : string, path to be parsed or an open readable stream. 
+ storage_options : StorageOptions + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ - def __init__(self, filepath_or_buffer: FilePathOrBuffer): + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ): import_optional_dependency("odf") - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_openpyxl.py b/pandas/io/excel/_openpyxl.py index 03a30cbd62f9a..73239190604db 100644 --- a/pandas/io/excel/_openpyxl.py +++ b/pandas/io/excel/_openpyxl.py @@ -2,7 +2,7 @@ import numpy as np -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import ExcelWriter, _BaseExcelReader @@ -467,7 +467,11 @@ def write_cells( class _OpenpyxlReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ) -> None: """ Reader using openpyxl engine. @@ -475,9 +479,11 @@ def __init__(self, filepath_or_buffer: FilePathOrBuffer) -> None: ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. + storage_options : StorageOptions + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("openpyxl") - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_pyxlsb.py b/pandas/io/excel/_pyxlsb.py index 0d96c8c4acdb8..c0e281ff6c2da 100644 --- a/pandas/io/excel/_pyxlsb.py +++ b/pandas/io/excel/_pyxlsb.py @@ -1,25 +1,31 @@ from typing import List -from pandas._typing import FilePathOrBuffer, Scalar +from pandas._typing import FilePathOrBuffer, Scalar, StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader class _PyxlsbReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer: FilePathOrBuffer): + def __init__( + self, + filepath_or_buffer: FilePathOrBuffer, + storage_options: StorageOptions = None, + ): """ Reader using pyxlsb engine. Parameters ---------- - filepath_or_buffer: str, path object, or Workbook + filepath_or_buffer : str, path object, or Workbook Object to be parsed. + storage_options : StorageOptions + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ import_optional_dependency("pyxlsb") # This will call load_workbook on the filepath or buffer # And set the result to the book-attribute - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/excel/_xlrd.py b/pandas/io/excel/_xlrd.py index af82c15fd6b66..ff1b3c8bdb964 100644 --- a/pandas/io/excel/_xlrd.py +++ b/pandas/io/excel/_xlrd.py @@ -2,13 +2,14 @@ import numpy as np +from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas.io.excel._base import _BaseExcelReader class _XlrdReader(_BaseExcelReader): - def __init__(self, filepath_or_buffer): + def __init__(self, filepath_or_buffer, storage_options: StorageOptions = None): """ Reader using xlrd engine. 
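# A minimal usage sketch of the storage_options plumbing documented above; the
# bucket, key and credential values are hypothetical, and the only behaviour
# assumed is that pandas forwards the dict to fsspec/s3fs unchanged.
import pandas as pd

df = pd.read_excel(
    "s3://example-bucket/report.xlsx",  # hypothetical path
    storage_options={"key": "<access-key>", "secret": "<secret-key>"},
)

# The same mechanism carries backend-specific settings, e.g. pointing s3fs at
# a non-AWS endpoint, which is how the test suite talks to a local moto server:
df = pd.read_excel(
    "s3://example-bucket/report.xlsx",
    storage_options={"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555/"}},
)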
@@ -16,10 +17,12 @@ def __init__(self, filepath_or_buffer): ---------- filepath_or_buffer : string, path object or Workbook Object to be parsed. + storage_options : StorageOptions + passed to fsspec for appropriate URLs (see ``get_filepath_or_buffer``) """ err_msg = "Install xlrd >= 1.0.0 for Excel support" import_optional_dependency("xlrd", extra=err_msg) - super().__init__(filepath_or_buffer) + super().__init__(filepath_or_buffer, storage_options=storage_options) @property def _workbook_class(self): diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index 2c664e73b9463..2d86fa44f22a4 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -1,5 +1,6 @@ """ feather-format compat """ +from pandas._typing import StorageOptions from pandas.compat._optional import import_optional_dependency from pandas import DataFrame, Int64Index, RangeIndex @@ -7,7 +8,7 @@ from pandas.io.common import get_filepath_or_buffer -def to_feather(df: DataFrame, path, storage_options=None, **kwargs): +def to_feather(df: DataFrame, path, storage_options: StorageOptions = None, **kwargs): """ Write a DataFrame to the binary Feather format. @@ -77,7 +78,9 @@ def to_feather(df: DataFrame, path, storage_options=None, **kwargs): feather.write_feather(df, path, **kwargs) -def read_feather(path, columns=None, use_threads: bool = True, storage_options=None): +def read_feather( + path, columns=None, use_threads: bool = True, storage_options: StorageOptions = None +): """ Load a feather-format object from the file path. diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 5d49757ce7d58..983aa56324083 100644 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -20,7 +20,7 @@ import pandas._libs.parsers as parsers from pandas._libs.parsers import STR_NA_VALUES from pandas._libs.tslibs import parsing -from pandas._typing import FilePathOrBuffer, Union +from pandas._typing import FilePathOrBuffer, StorageOptions, Union from pandas.errors import ( AbstractMethodError, EmptyDataError, @@ -596,7 +596,7 @@ def read_csv( low_memory=_c_parser_defaults["low_memory"], memory_map=False, float_precision=None, - storage_options=None, + storage_options: StorageOptions = None, ): # gh-23761 # diff --git a/pandas/tests/frame/methods/test_interpolate.py b/pandas/tests/frame/methods/test_interpolate.py index 6b86a13fcf1b9..3c9d79397e4bd 100644 --- a/pandas/tests/frame/methods/test_interpolate.py +++ b/pandas/tests/frame/methods/test_interpolate.py @@ -38,7 +38,6 @@ def test_interp_empty(self): # https://github.com/pandas-dev/pandas/issues/35598 df = DataFrame() result = df.interpolate() - assert result is not df expected = df tm.assert_frame_equal(result, expected) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index fcee25c258efa..8bd403f222af7 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -1,4 +1,7 @@ import os +import shlex +import subprocess +import time import pytest @@ -30,6 +33,11 @@ def feather_file(datapath): return datapath("io", "data", "feather", "feather-0_3_1.feather") +@pytest.fixture +def s3so(): + return dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + + @pytest.fixture def s3_resource(tips_file, jsonl_file, feather_file): """ @@ -48,6 +56,7 @@ def s3_resource(tips_file, jsonl_file, feather_file): """ s3fs = pytest.importorskip("s3fs") boto3 = pytest.importorskip("boto3") + requests = pytest.importorskip("requests") with tm.ensure_safe_environment_variables(): # temporary workaround as moto 
fails for botocore >= 1.11 otherwise, @@ -55,7 +64,8 @@ def s3_resource(tips_file, jsonl_file, feather_file): os.environ.setdefault("AWS_ACCESS_KEY_ID", "foobar_key") os.environ.setdefault("AWS_SECRET_ACCESS_KEY", "foobar_secret") - moto = pytest.importorskip("moto") + pytest.importorskip("moto", minversion="1.3.14") + pytest.importorskip("flask") # server mode needs flask too test_s3_files = [ ("tips#1.csv", tips_file), @@ -72,12 +82,31 @@ def add_tips_files(bucket_name): conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) try: - s3 = moto.mock_s3() - s3.start() + # Launching moto in server mode, i.e., as a separate process + # with an S3 endpoint on localhost + + endpoint_uri = "http://127.0.0.1:5555/" + + # pipe to null to avoid logging in terminal + proc = subprocess.Popen( + shlex.split("moto_server s3 -p 5555"), stdout=subprocess.DEVNULL + ) + + timeout = 5 + while timeout > 0: + try: + # OK to go once server is accepting connections + r = requests.get(endpoint_uri) + if r.ok: + break + except Exception: + pass + timeout -= 0.1 + time.sleep(0.1) # see gh-16135 bucket = "pandas-test" - conn = boto3.resource("s3", region_name="us-east-1") + conn = boto3.resource("s3", endpoint_url=endpoint_uri) conn.create_bucket(Bucket=bucket) add_tips_files(bucket) @@ -87,4 +116,6 @@ def add_tips_files(bucket_name): s3fs.S3FileSystem.clear_instance_cache() yield conn finally: - s3.stop() + # shut down external process + proc.terminate() + proc.wait() diff --git a/pandas/tests/io/excel/test_readers.py b/pandas/tests/io/excel/test_readers.py index 51fbbf836a03f..431a50477fccc 100644 --- a/pandas/tests/io/excel/test_readers.py +++ b/pandas/tests/io/excel/test_readers.py @@ -606,13 +606,14 @@ def test_read_from_http_url(self, read_ext): tm.assert_frame_equal(url_table, local_table) @td.skip_if_not_us_locale - def test_read_from_s3_url(self, read_ext, s3_resource): + def test_read_from_s3_url(self, read_ext, s3_resource, s3so): # Bucket "pandas-test" created in tests/io/conftest.py with open("test1" + read_ext, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test1" + read_ext, Body=f) url = "s3://pandas-test/test1" + read_ext - url_table = pd.read_excel(url) + + url_table = pd.read_excel(url, storage_options=s3so) local_table = pd.read_excel("test1" + read_ext) tm.assert_frame_equal(url_table, local_table) diff --git a/pandas/tests/io/json/test_compression.py b/pandas/tests/io/json/test_compression.py index 182c21ed1d416..5bb205842269e 100644 --- a/pandas/tests/io/json/test_compression.py +++ b/pandas/tests/io/json/test_compression.py @@ -44,7 +44,11 @@ def test_with_s3_url(compression, s3_resource): with open(path, "rb") as f: s3_resource.Bucket("pandas-test").put_object(Key="test-1", Body=f) - roundtripped_df = pd.read_json("s3://pandas-test/test-1", compression=compression) + roundtripped_df = pd.read_json( + "s3://pandas-test/test-1", + compression=compression, + storage_options=dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}), + ) tm.assert_frame_equal(df, roundtripped_df) diff --git a/pandas/tests/io/json/test_pandas.py b/pandas/tests/io/json/test_pandas.py index 1280d0fd434d5..64a666079876f 100644 --- a/pandas/tests/io/json/test_pandas.py +++ b/pandas/tests/io/json/test_pandas.py @@ -1213,10 +1213,12 @@ def test_read_inline_jsonl(self): tm.assert_frame_equal(result, expected) @td.skip_if_not_us_locale - def test_read_s3_jsonl(self, s3_resource): + def test_read_s3_jsonl(self, s3_resource, s3so): # GH17200 - result = read_json("s3n://pandas-test/items.jsonl", 
lines=True) + result = read_json( + "s3n://pandas-test/items.jsonl", lines=True, storage_options=s3so + ) expected = DataFrame([[1, 2], [1, 2]], columns=["a", "b"]) tm.assert_frame_equal(result, expected) @@ -1706,7 +1708,12 @@ def test_to_s3(self, s3_resource): # GH 28375 mock_bucket_name, target_file = "pandas-test", "test.json" df = DataFrame({"x": [1, 2, 3], "y": [2, 4, 6]}) - df.to_json(f"s3://{mock_bucket_name}/{target_file}") + df.to_json( + f"s3://{mock_bucket_name}/{target_file}", + storage_options=dict( + client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"} + ), + ) timeout = 5 while True: if target_file in ( diff --git a/pandas/tests/io/parser/test_common.py b/pandas/tests/io/parser/test_common.py index 1d8d5a29686a4..3d5f6ae3a4af9 100644 --- a/pandas/tests/io/parser/test_common.py +++ b/pandas/tests/io/parser/test_common.py @@ -1836,7 +1836,6 @@ def test_raise_on_no_columns(all_parsers, nrows): parser.read_csv(StringIO(data)) -@td.check_file_leaks def test_memory_map(all_parsers, csv_dir_path): mmap_file = os.path.join(csv_dir_path, "test_mmap.csv") parser = all_parsers diff --git a/pandas/tests/io/parser/test_network.py b/pandas/tests/io/parser/test_network.py index b30a7b1ef34de..5a08216f01ce4 100644 --- a/pandas/tests/io/parser/test_network.py +++ b/pandas/tests/io/parser/test_network.py @@ -46,21 +46,6 @@ def check_compressed_urls(salaries_table, compression, extension, mode, engine): tm.assert_frame_equal(url_table, salaries_table) -@tm.network("https://raw.githubusercontent.com/", check_before_test=True) -def test_url_encoding_csv(): - """ - read_csv should honor the requested encoding for URLs. - - GH 10424 - """ - path = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/master/" - + "pandas/tests/io/parser/data/unicode_series.csv" - ) - df = read_csv(path, encoding="latin-1", header=None) - assert df.loc[15, 1] == "Á köldum klaka (Cold Fever) (1994)" - - @pytest.fixture def tips_df(datapath): """DataFrame with the tips dataset.""" @@ -71,50 +56,62 @@ def tips_df(datapath): @td.skip_if_not_us_locale() class TestS3: @td.skip_if_no("s3fs") - def test_parse_public_s3_bucket(self, tips_df): + def test_parse_public_s3_bucket(self, tips_df, s3so): # more of an integration test due to the not-public contents portion # can probably mock this though. 
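# Sketch of what the s3so fixture (defined in tests/io/conftest.py) provides
# and how it reaches the filesystem layer; an illustration of the wiring
# rather than an exact trace of the pandas internals.
from pandas import read_csv

s3so = {"client_kwargs": {"endpoint_url": "http://127.0.0.1:5555/"}}
df = read_csv("s3://pandas-test/tips.csv", storage_options=s3so)

# Roughly equivalent, going through s3fs directly:
import s3fs

fs = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"})
with fs.open("pandas-test/tips.csv", "rb") as f:
    df = read_csv(f)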
for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv("s3://pandas-test/tips.csv" + ext, compression=comp) + df = read_csv( + "s3://pandas-test/tips.csv" + ext, + compression=comp, + storage_options=s3so, + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) # Read public file from bucket with not-public contents - df = read_csv("s3://cant_get_it/tips.csv") + df = read_csv("s3://cant_get_it/tips.csv", storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3n_bucket(self, tips_df): + def test_parse_public_s3n_bucket(self, tips_df, s3so): # Read from AWS s3 as "s3n" URL - df = read_csv("s3n://pandas-test/tips.csv", nrows=10) + df = read_csv("s3n://pandas-test/tips.csv", nrows=10, storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3a_bucket(self, tips_df): + def test_parse_public_s3a_bucket(self, tips_df, s3so): # Read from AWS s3 as "s3a" URL - df = read_csv("s3a://pandas-test/tips.csv", nrows=10) + df = read_csv("s3a://pandas-test/tips.csv", nrows=10, storage_options=s3so) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_nrows(self, tips_df): + def test_parse_public_s3_bucket_nrows(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: - df = read_csv("s3://pandas-test/tips.csv" + ext, nrows=10, compression=comp) + df = read_csv( + "s3://pandas-test/tips.csv" + ext, + nrows=10, + compression=comp, + storage_options=s3so, + ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_parse_public_s3_bucket_chunked(self, tips_df): + def test_parse_public_s3_bucket_chunked(self, tips_df, s3so): # Read with a chunksize chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df_reader = read_csv( - "s3://pandas-test/tips.csv" + ext, chunksize=chunksize, compression=comp + "s3://pandas-test/tips.csv" + ext, + chunksize=chunksize, + compression=comp, + storage_options=s3so, ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -126,7 +123,7 @@ def test_parse_public_s3_bucket_chunked(self, tips_df): true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_chunked_python(self, tips_df): + def test_parse_public_s3_bucket_chunked_python(self, tips_df, s3so): # Read with a chunksize using the Python parser chunksize = 5 for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: @@ -135,6 +132,7 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df): chunksize=chunksize, compression=comp, engine="python", + storage_options=s3so, ) assert df_reader.chunksize == chunksize for i_chunk in [0, 1, 2]: @@ -145,46 +143,53 @@ def test_parse_public_s3_bucket_chunked_python(self, tips_df): true_df = tips_df.iloc[chunksize * i_chunk : chunksize * (i_chunk + 1)] tm.assert_frame_equal(true_df, df) - def test_parse_public_s3_bucket_python(self, tips_df): + def test_parse_public_s3_bucket_python(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression=comp + "s3://pandas-test/tips.csv" + ext, + engine="python", + compression=comp, + storage_options=s3so, ) assert 
isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_infer_s3_compression(self, tips_df): + def test_infer_s3_compression(self, tips_df, s3so): for ext in ["", ".gz", ".bz2"]: df = read_csv( - "s3://pandas-test/tips.csv" + ext, engine="python", compression="infer" + "s3://pandas-test/tips.csv" + ext, + engine="python", + compression="infer", + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(df, tips_df) - def test_parse_public_s3_bucket_nrows_python(self, tips_df): + def test_parse_public_s3_bucket_nrows_python(self, tips_df, s3so): for ext, comp in [("", None), (".gz", "gzip"), (".bz2", "bz2")]: df = read_csv( "s3://pandas-test/tips.csv" + ext, engine="python", nrows=10, compression=comp, + storage_options=s3so, ) assert isinstance(df, DataFrame) assert not df.empty tm.assert_frame_equal(tips_df.iloc[:10], df) - def test_read_s3_fails(self): + def test_read_s3_fails(self, s3so): with pytest.raises(IOError): - read_csv("s3://nyqpug/asdf.csv") + read_csv("s3://nyqpug/asdf.csv", storage_options=s3so) # Receive a permission error when trying to read a private bucket. # It's irrelevant here that this isn't actually a table. with pytest.raises(IOError): read_csv("s3://cant_get_it/file.csv") - def test_write_s3_csv_fails(self, tips_df): + def test_write_s3_csv_fails(self, tips_df, s3so): # GH 32486 # Attempting to write to an invalid S3 path should raise import botocore @@ -195,10 +200,12 @@ def test_write_s3_csv_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_csv("s3://an_s3_bucket_data_doesnt_exit/not_real.csv") + tips_df.to_csv( + "s3://an_s3_bucket_data_doesnt_exit/not_real.csv", storage_options=s3so + ) @td.skip_if_no("pyarrow") - def test_write_s3_parquet_fails(self, tips_df): + def test_write_s3_parquet_fails(self, tips_df, s3so): # GH 27679 # Attempting to write to an invalid S3 path should raise import botocore @@ -209,7 +216,10 @@ def test_write_s3_parquet_fails(self, tips_df): error = (FileNotFoundError, botocore.exceptions.ClientError) with pytest.raises(error, match="The specified bucket does not exist"): - tips_df.to_parquet("s3://an_s3_bucket_data_doesnt_exit/not_real.parquet") + tips_df.to_parquet( + "s3://an_s3_bucket_data_doesnt_exit/not_real.parquet", + storage_options=s3so, + ) def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): # see gh-16135 @@ -225,7 +235,7 @@ def test_read_csv_handles_boto_s3_object(self, s3_resource, tips_file): expected = read_csv(tips_file) tm.assert_frame_equal(result, expected) - def test_read_csv_chunked_download(self, s3_resource, caplog): + def test_read_csv_chunked_download(self, s3_resource, caplog, s3so): # 8 MB, S3FS usees 5MB chunks import s3fs @@ -245,18 +255,20 @@ def test_read_csv_chunked_download(self, s3_resource, caplog): s3fs.S3FileSystem.clear_instance_cache() with caplog.at_level(logging.DEBUG, logger="s3fs"): - read_csv("s3://pandas-test/large-file.csv", nrows=5) + read_csv("s3://pandas-test/large-file.csv", nrows=5, storage_options=s3so) # log of fetch_range (start, stop) assert (0, 5505024) in (x.args[-2:] for x in caplog.records) - def test_read_s3_with_hash_in_key(self, tips_df): + def test_read_s3_with_hash_in_key(self, tips_df, s3so): # GH 25945 - result = read_csv("s3://pandas-test/tips#1.csv") + result = read_csv("s3://pandas-test/tips#1.csv", storage_options=s3so) tm.assert_frame_equal(tips_df, 
result) @td.skip_if_no("pyarrow") - def test_read_feather_s3_file_path(self, feather_file): + def test_read_feather_s3_file_path(self, feather_file, s3so): # GH 29055 expected = read_feather(feather_file) - res = read_feather("s3://pandas-test/simple_dataset.feather") + res = read_feather( + "s3://pandas-test/simple_dataset.feather", storage_options=s3so + ) tm.assert_frame_equal(expected, res) diff --git a/pandas/tests/io/test_fsspec.py b/pandas/tests/io/test_fsspec.py index 3e89f6ca4ae16..666da677d702e 100644 --- a/pandas/tests/io/test_fsspec.py +++ b/pandas/tests/io/test_fsspec.py @@ -131,27 +131,38 @@ def test_fastparquet_options(fsspectest): @td.skip_if_no("s3fs") -def test_from_s3_csv(s3_resource, tips_file): - tm.assert_equal(read_csv("s3://pandas-test/tips.csv"), read_csv(tips_file)) +def test_from_s3_csv(s3_resource, tips_file, s3so): + tm.assert_equal( + read_csv("s3://pandas-test/tips.csv", storage_options=s3so), read_csv(tips_file) + ) # the following are decompressed by pandas, not fsspec - tm.assert_equal(read_csv("s3://pandas-test/tips.csv.gz"), read_csv(tips_file)) - tm.assert_equal(read_csv("s3://pandas-test/tips.csv.bz2"), read_csv(tips_file)) + tm.assert_equal( + read_csv("s3://pandas-test/tips.csv.gz", storage_options=s3so), + read_csv(tips_file), + ) + tm.assert_equal( + read_csv("s3://pandas-test/tips.csv.bz2", storage_options=s3so), + read_csv(tips_file), + ) @pytest.mark.parametrize("protocol", ["s3", "s3a", "s3n"]) @td.skip_if_no("s3fs") -def test_s3_protocols(s3_resource, tips_file, protocol): +def test_s3_protocols(s3_resource, tips_file, protocol, s3so): tm.assert_equal( - read_csv("%s://pandas-test/tips.csv" % protocol), read_csv(tips_file) + read_csv("%s://pandas-test/tips.csv" % protocol, storage_options=s3so), + read_csv(tips_file), ) @td.skip_if_no("s3fs") @td.skip_if_no("fastparquet") -def test_s3_parquet(s3_resource): +def test_s3_parquet(s3_resource, s3so): fn = "s3://pandas-test/test.parquet" - df1.to_parquet(fn, index=False, engine="fastparquet", compression=None) - df2 = read_parquet(fn, engine="fastparquet") + df1.to_parquet( + fn, index=False, engine="fastparquet", compression=None, storage_options=s3so + ) + df2 = read_parquet(fn, engine="fastparquet", storage_options=s3so) tm.assert_equal(df1, df2) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 82157f3d722a9..3f94af65449c7 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -158,6 +158,10 @@ def check_round_trip( """ write_kwargs = write_kwargs or {"compression": None} read_kwargs = read_kwargs or {} + if isinstance(path, str) and "s3://" in path: + s3so = dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + read_kwargs["storage_options"] = s3so + write_kwargs["storage_options"] = s3so if expected is None: expected = df @@ -537,9 +541,9 @@ def test_categorical(self, pa): expected = df.astype(object) check_round_trip(df, pa, expected=expected) - def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa): + def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): s3fs = pytest.importorskip("s3fs") - s3 = s3fs.S3FileSystem() + s3 = s3fs.S3FileSystem(**s3so) kw = dict(filesystem=s3) check_round_trip( df_compat, @@ -564,6 +568,7 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): expected_df = df_compat.copy() if partition_col: expected_df[partition_col] = expected_df[partition_col].astype("category") + check_round_trip( df_compat, pa, diff --git 
a/pandas/tests/io/test_sql.py b/pandas/tests/io/test_sql.py index a7e3162ed7b73..29b787d39c09d 100644 --- a/pandas/tests/io/test_sql.py +++ b/pandas/tests/io/test_sql.py @@ -263,8 +263,7 @@ def _get_all_tables(self): return table_list def _close_conn(self): - # https://docs.sqlalchemy.org/en/13/core/connections.html#engine-disposal - self.conn.dispose() + pass class PandasSQLTest: @@ -1243,7 +1242,7 @@ class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): def setup_class(cls): cls.setup_import() cls.setup_driver() - conn = cls.conn = cls.connect() + conn = cls.connect() conn.connect() def load_test_data_and_sql(self): diff --git a/pandas/tests/util/test_assert_extension_array_equal.py b/pandas/tests/util/test_assert_extension_array_equal.py index f9259beab5d13..d9fdf1491c328 100644 --- a/pandas/tests/util/test_assert_extension_array_equal.py +++ b/pandas/tests/util/test_assert_extension_array_equal.py @@ -1,7 +1,6 @@ import numpy as np import pytest -from pandas import array import pandas._testing as tm from pandas.core.arrays.sparse import SparseArray @@ -103,11 +102,3 @@ def test_assert_extension_array_equal_non_extension_array(side): with pytest.raises(AssertionError, match=msg): tm.assert_extension_array_equal(*args) - - -@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) -def test_assert_extension_array_equal_ignore_dtype_mismatch(right_dtype): - # https://github.com/pandas-dev/pandas/issues/35715 - left = array([1, 2, 3], dtype="Int64") - right = array([1, 2, 3], dtype=right_dtype) - tm.assert_extension_array_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_frame_equal.py b/pandas/tests/util/test_assert_frame_equal.py index 3aa3c64923b14..fe3e1ff906919 100644 --- a/pandas/tests/util/test_assert_frame_equal.py +++ b/pandas/tests/util/test_assert_frame_equal.py @@ -260,11 +260,3 @@ def test_assert_frame_equal_interval_dtype_mismatch(): with pytest.raises(AssertionError, match=msg): tm.assert_frame_equal(left, right, check_dtype=True) - - -@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) -def test_assert_frame_equal_ignore_extension_dtype_mismatch(right_dtype): - # https://github.com/pandas-dev/pandas/issues/35715 - left = pd.DataFrame({"a": [1, 2, 3]}, dtype="Int64") - right = pd.DataFrame({"a": [1, 2, 3]}, dtype=right_dtype) - tm.assert_frame_equal(left, right, check_dtype=False) diff --git a/pandas/tests/util/test_assert_series_equal.py b/pandas/tests/util/test_assert_series_equal.py index f3c66052b1904..a7b5aeac560e4 100644 --- a/pandas/tests/util/test_assert_series_equal.py +++ b/pandas/tests/util/test_assert_series_equal.py @@ -296,11 +296,3 @@ def test_series_equal_exact_for_nonnumeric(): tm.assert_series_equal(s1, s3, check_exact=True) with pytest.raises(AssertionError): tm.assert_series_equal(s3, s1, check_exact=True) - - -@pytest.mark.parametrize("right_dtype", ["Int32", "int64"]) -def test_assert_series_equal_ignore_extension_dtype_mismatch(right_dtype): - # https://github.com/pandas-dev/pandas/issues/35715 - left = pd.Series([1, 2, 3], dtype="Int64") - right = pd.Series([1, 2, 3], dtype=right_dtype) - tm.assert_series_equal(left, right, check_dtype=False) From 8387ea6fa5b92a09cf76da3f904ceb408ecda6dd Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 18 Aug 2020 14:57:28 -0400 Subject: [PATCH 48/59] Skip test on win --- pandas/tests/io/test_parquet.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 
3f94af65449c7..e4ca6c09edafb 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -564,6 +564,8 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716 # As per pyarrow partitioned columns become 'categorical' dtypes # and are added to back of dataframe on read + if partition_col and pd.comapt.is_platform_windows(): + pytest.skip(reason="pyarrow/win incompatibility #35791") expected_df = df_compat.copy() if partition_col: From 5e2a86f19617014a1ff90ef404f648c1508baf75 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 18 Aug 2020 14:58:09 -0400 Subject: [PATCH 49/59] not "reason" in skip (that would be for skipif) --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index e4ca6c09edafb..36b08f4bba07f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -565,7 +565,7 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): # As per pyarrow partitioned columns become 'categorical' dtypes # and are added to back of dataframe on read if partition_col and pd.comapt.is_platform_windows(): - pytest.skip(reason="pyarrow/win incompatibility #35791") + pytest.skip("pyarrow/win incompatibility #35791") expected_df = df_compat.copy() if partition_col: From 8711c9621312fc7791e98d89a5808a7ce4cfc3ca Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 18 Aug 2020 15:24:42 -0400 Subject: [PATCH 50/59] typo --- pandas/tests/io/test_parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 36b08f4bba07f..6c51486078742 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -564,7 +564,7 @@ def test_s3_roundtrip_for_dir(self, df_compat, s3_resource, pa, partition_col): # https://github.com/apache/arrow/blob/master/python/pyarrow/tests/test_parquet.py#L2716 # As per pyarrow partitioned columns become 'categorical' dtypes # and are added to back of dataframe on read - if partition_col and pd.comapt.is_platform_windows(): + if partition_col and pd.compat.is_platform_windows(): pytest.skip("pyarrow/win incompatibility #35791") expected_df = df_compat.copy() From 6d21fa0974cccedac10b74d397191758becbad12 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 20 Aug 2020 09:40:19 -0400 Subject: [PATCH 51/59] Fewer moto server processes --- pandas/tests/io/conftest.py | 144 +++++++++++++++++++----------------- 1 file changed, 78 insertions(+), 66 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 8bd403f222af7..386d20cfe9f43 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -38,24 +38,15 @@ def s3so(): return dict(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) -@pytest.fixture -def s3_resource(tips_file, jsonl_file, feather_file): +@pytest.fixture(scope="module") +def s3_base(): """ Fixture for mocking S3 interaction. - The primary bucket name is "pandas-test". The following datasets - are loaded. - - - tips.csv - - tips.csv.gz - - tips.csv.bz2 - - items.jsonl - - A private bucket "cant_get_it" is also created. The boto3 s3 resource - is yielded by the fixture. 
+ Sets up moto server in separate process """ - s3fs = pytest.importorskip("s3fs") - boto3 = pytest.importorskip("boto3") + pytest.importorskip("s3fs") + pytest.importorskip("boto3") requests = pytest.importorskip("requests") with tm.ensure_safe_environment_variables(): @@ -67,55 +58,76 @@ def s3_resource(tips_file, jsonl_file, feather_file): pytest.importorskip("moto", minversion="1.3.14") pytest.importorskip("flask") # server mode needs flask too - test_s3_files = [ - ("tips#1.csv", tips_file), - ("tips.csv", tips_file), - ("tips.csv.gz", tips_file + ".gz"), - ("tips.csv.bz2", tips_file + ".bz2"), - ("items.jsonl", jsonl_file), - ("simple_dataset.feather", feather_file), - ] - - def add_tips_files(bucket_name): - for s3_key, file_name in test_s3_files: - with open(file_name, "rb") as f: - conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) - - try: - # Launching moto in server mode, i.e., as a separate process - # with an S3 endpoint on localhost - - endpoint_uri = "http://127.0.0.1:5555/" - - # pipe to null to avoid logging in terminal - proc = subprocess.Popen( - shlex.split("moto_server s3 -p 5555"), stdout=subprocess.DEVNULL - ) - - timeout = 5 - while timeout > 0: - try: - # OK to go once server is accepting connections - r = requests.get(endpoint_uri) - if r.ok: - break - except Exception: - pass - timeout -= 0.1 - time.sleep(0.1) - - # see gh-16135 - bucket = "pandas-test" - conn = boto3.resource("s3", endpoint_url=endpoint_uri) - - conn.create_bucket(Bucket=bucket) - add_tips_files(bucket) - - conn.create_bucket(Bucket="cant_get_it", ACL="private") - add_tips_files("cant_get_it") - s3fs.S3FileSystem.clear_instance_cache() - yield conn - finally: - # shut down external process - proc.terminate() - proc.wait() + # Launching moto in server mode, i.e., as a separate process + # with an S3 endpoint on localhost + + endpoint_uri = "http://127.0.0.1:5555/" + + # pipe to null to avoid logging in terminal + proc = subprocess.Popen( + shlex.split("moto_server s3 -p 5555"), stdout=subprocess.DEVNULL + ) + + timeout = 5 + while timeout > 0: + try: + # OK to go once server is accepting connections + r = requests.get(endpoint_uri) + if r.ok: + break + except Exception: + pass + timeout -= 0.1 + time.sleep(0.1) + yield + + proc.terminate() + proc.wait() + + +@pytest.fixture() +def s3_resource(s3_base, tips_file, jsonl_file, feather_file): + """ + Sets up S3 bucket with contents + + The primary bucket name is "pandas-test". The following datasets + are loaded. + + - tips.csv + - tips.csv.gz + - tips.csv.bz2 + - items.jsonl + + A private bucket "cant_get_it" is also created. The boto3 s3 resource + is yielded by the fixture. 
+ """ + import boto3 + import s3fs + + test_s3_files = [ + ("tips#1.csv", tips_file), + ("tips.csv", tips_file), + ("tips.csv.gz", tips_file + ".gz"), + ("tips.csv.bz2", tips_file + ".bz2"), + ("items.jsonl", jsonl_file), + ("simple_dataset.feather", feather_file), + ] + + def add_tips_files(bucket_name): + for s3_key, file_name in test_s3_files: + with open(file_name, "rb") as f: + conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) + + bucket = "pandas-test" + endpoint_uri = "http://127.0.0.1:5555/" + conn = boto3.resource("s3", endpoint_url=endpoint_uri) + + conn.create_bucket(Bucket=bucket) + add_tips_files(bucket) + + conn.create_bucket(Bucket="cant_get_it", ACL="private") + add_tips_files("cant_get_it") + s3fs.S3FileSystem.clear_instance_cache() + yield conn + s3fs.rm(bucket, recursive=True) + s3fs.rm("cant_get_it", recursive=True) From 1e1d3fef322fa7cd9b34bb21dccc8bb26df2a1dc Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 20 Aug 2020 10:06:19 -0400 Subject: [PATCH 52/59] Dumb --- pandas/tests/io/conftest.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 386d20cfe9f43..61d9fa1b12e55 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -129,5 +129,6 @@ def add_tips_files(bucket_name): add_tips_files("cant_get_it") s3fs.S3FileSystem.clear_instance_cache() yield conn - s3fs.rm(bucket, recursive=True) - s3fs.rm("cant_get_it", recursive=True) + s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) + s3.rm(bucket, recursive=True) + s3.rm("cant_get_it", recursive=True) From 9f8ad5a019340f4c6678aa6d32716ff5b5665a7e Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 20 Aug 2020 10:59:10 -0400 Subject: [PATCH 53/59] With exceptions --- pandas/tests/io/conftest.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 61d9fa1b12e55..9467a9cd8c72d 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -122,13 +122,30 @@ def add_tips_files(bucket_name): endpoint_uri = "http://127.0.0.1:5555/" conn = boto3.resource("s3", endpoint_url=endpoint_uri) - conn.create_bucket(Bucket=bucket) + try: + conn.create_bucket(Bucket=bucket) + except: # noqa + # OK is bucket already exists + pass add_tips_files(bucket) - conn.create_bucket(Bucket="cant_get_it", ACL="private") + try: + conn.create_bucket(Bucket="cant_get_it", ACL="private") + except: # noqa + # OK is bucket already exists + pass + add_tips_files("cant_get_it") s3fs.S3FileSystem.clear_instance_cache() yield conn + s3 = s3fs.S3FileSystem(client_kwargs={"endpoint_url": "http://127.0.0.1:5555/"}) - s3.rm(bucket, recursive=True) - s3.rm("cant_get_it", recursive=True) + + try: + s3.rm(bucket, recursive=True) + except: # noqa + pass + try: + s3.rm("cant_get_it", recursive=True) + except: # noqa + pass From 015512dd7155342fdc38ce2f18eec40e058800f0 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 20 Aug 2020 14:31:56 -0400 Subject: [PATCH 54/59] Skip old pyarrow; remove moto where no s3fs in env --- ci/deps/travis-37-locale.yaml | 2 -- pandas/tests/io/test_parquet.py | 4 ++++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml index 573328f685adb..24799785a25ec 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -21,8 +21,6 @@ dependencies: - jinja2 - lxml=4.3.0 - 
matplotlib=3.0.* - - moto - - flask - nomkl - numexpr - numpy diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index 6c51486078742..be1e7e882e72f 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -543,6 +543,8 @@ def test_categorical(self, pa): def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): s3fs = pytest.importorskip("s3fs") + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.17.0"): + pytest.skip() s3 = s3fs.S3FileSystem(**s3so) kw = dict(filesystem=s3) check_round_trip( @@ -554,6 +556,8 @@ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): ) def test_s3_roundtrip(self, df_compat, s3_resource, pa): + if LooseVersion(pyarrow.__version__) >= LooseVersion("0.17.0"): + pytest.skip() # GH #19134 check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet") From 989a9f12037a03a080173c5e9528a5a8a5c8a369 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 20 Aug 2020 15:11:57 -0400 Subject: [PATCH 55/59] sign error --- pandas/tests/io/test_parquet.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/test_parquet.py b/pandas/tests/io/test_parquet.py index be1e7e882e72f..3a3ba99484a3a 100644 --- a/pandas/tests/io/test_parquet.py +++ b/pandas/tests/io/test_parquet.py @@ -543,7 +543,7 @@ def test_categorical(self, pa): def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): s3fs = pytest.importorskip("s3fs") - if LooseVersion(pyarrow.__version__) >= LooseVersion("0.17.0"): + if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"): pytest.skip() s3 = s3fs.S3FileSystem(**s3so) kw = dict(filesystem=s3) @@ -556,7 +556,7 @@ def test_s3_roundtrip_explicit_fs(self, df_compat, s3_resource, pa, s3so): ) def test_s3_roundtrip(self, df_compat, s3_resource, pa): - if LooseVersion(pyarrow.__version__) >= LooseVersion("0.17.0"): + if LooseVersion(pyarrow.__version__) <= LooseVersion("0.17.0"): pytest.skip() # GH #19134 check_round_trip(df_compat, pa, path="s3://pandas-test/pyarrow.parquet") From 761725676fad7399833aac4cfa16147c7eeae39c Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 20 Aug 2020 16:39:49 -0400 Subject: [PATCH 56/59] Add arrow to env that was getting it anyway (but an old one) --- ci/deps/travis-37-locale.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/ci/deps/travis-37-locale.yaml b/ci/deps/travis-37-locale.yaml index 24799785a25ec..2cf0e12027401 100644 --- a/ci/deps/travis-37-locale.yaml +++ b/ci/deps/travis-37-locale.yaml @@ -26,6 +26,7 @@ dependencies: - numpy - openpyxl - pandas-gbq=0.12.0 + - pyarrow>=0.17 - psycopg2=2.7 - pymysql=0.7.11 - pytables From 4624cefbcfe172d95bfe9c93c7eff17f3091de3f Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 21 Aug 2020 09:00:32 -0400 Subject: [PATCH 57/59] Add reasonable timeouts --- pandas/tests/io/conftest.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 9467a9cd8c72d..2f12a38e5fda2 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -116,25 +116,28 @@ def s3_resource(s3_base, tips_file, jsonl_file, feather_file): def add_tips_files(bucket_name): for s3_key, file_name in test_s3_files: with open(file_name, "rb") as f: - conn.Bucket(bucket_name).put_object(Key=s3_key, Body=f) + conn.put_object(Bucket=bucket_name, Key=s3_key, Body=f) bucket = "pandas-test" endpoint_uri = "http://127.0.0.1:5555/" - conn = 
boto3.resource("s3", endpoint_url=endpoint_uri) + conn = boto3.client("s3", endpoint_url=endpoint_uri) try: conn.create_bucket(Bucket=bucket) except: # noqa # OK is bucket already exists pass - add_tips_files(bucket) - try: conn.create_bucket(Bucket="cant_get_it", ACL="private") except: # noqa # OK is bucket already exists pass + timeout = 2 + while not conn.list_buckets()['Buckets'] and timeout > 0: + time.sleep(0.1) + timeout -= 0.1 + add_tips_files(bucket) add_tips_files("cant_get_it") s3fs.S3FileSystem.clear_instance_cache() yield conn @@ -149,3 +152,7 @@ def add_tips_files(bucket_name): s3.rm("cant_get_it", recursive=True) except: # noqa pass + timeout = 2 + while conn.list_buckets()['Buckets'] and timeout > 0: + time.sleep(0.1) + timeout -= 0.1 From f5c2a44f3c96143ef4d51b8dbc968d6d171ac3da Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 21 Aug 2020 09:00:49 -0400 Subject: [PATCH 58/59] lint --- pandas/tests/io/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 2f12a38e5fda2..972165ae166a7 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -133,7 +133,7 @@ def add_tips_files(bucket_name): # OK is bucket already exists pass timeout = 2 - while not conn.list_buckets()['Buckets'] and timeout > 0: + while not conn.list_buckets()["Buckets"] and timeout > 0: time.sleep(0.1) timeout -= 0.1 @@ -153,6 +153,6 @@ def add_tips_files(bucket_name): except: # noqa pass timeout = 2 - while conn.list_buckets()['Buckets'] and timeout > 0: + while conn.list_buckets()["Buckets"] and timeout > 0: time.sleep(0.1) timeout -= 0.1 From 5d50cda4fdd3dec8f12b15b4aee00e585c85ee13 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Fri, 21 Aug 2020 09:54:04 -0400 Subject: [PATCH 59/59] small revert --- pandas/tests/io/conftest.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/pandas/tests/io/conftest.py b/pandas/tests/io/conftest.py index 972165ae166a7..518f31d73efa9 100644 --- a/pandas/tests/io/conftest.py +++ b/pandas/tests/io/conftest.py @@ -116,24 +116,25 @@ def s3_resource(s3_base, tips_file, jsonl_file, feather_file): def add_tips_files(bucket_name): for s3_key, file_name in test_s3_files: with open(file_name, "rb") as f: - conn.put_object(Bucket=bucket_name, Key=s3_key, Body=f) + cli.put_object(Bucket=bucket_name, Key=s3_key, Body=f) bucket = "pandas-test" endpoint_uri = "http://127.0.0.1:5555/" - conn = boto3.client("s3", endpoint_url=endpoint_uri) + conn = boto3.resource("s3", endpoint_url=endpoint_uri) + cli = boto3.client("s3", endpoint_url=endpoint_uri) try: - conn.create_bucket(Bucket=bucket) + cli.create_bucket(Bucket=bucket) except: # noqa # OK is bucket already exists pass try: - conn.create_bucket(Bucket="cant_get_it", ACL="private") + cli.create_bucket(Bucket="cant_get_it", ACL="private") except: # noqa # OK is bucket already exists pass timeout = 2 - while not conn.list_buckets()["Buckets"] and timeout > 0: + while not cli.list_buckets()["Buckets"] and timeout > 0: time.sleep(0.1) timeout -= 0.1 @@ -153,6 +154,6 @@ def add_tips_files(bucket_name): except: # noqa pass timeout = 2 - while conn.list_buckets()["Buckets"] and timeout > 0: + while cli.list_buckets()["Buckets"] and timeout > 0: time.sleep(0.1) timeout -= 0.1
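
For reference, a minimal sketch of how a test could exercise the pieces above together: the module-scoped moto server (s3_base), the bucket-populating s3_resource fixture, and the s3so endpoint options, with storage_options forwarded down to fsspec/s3fs. This is an illustration only, not part of the patches: the test name, the S3 key, and the DataFrame contents are made up, and it assumes to_csv/read_csv accept storage_options as added earlier in the series.

    import pandas as pd
    import pandas._testing as tm


    def test_csv_roundtrip_with_storage_options(s3_resource, s3so):
        # s3_resource guarantees the moto server is up and the "pandas-test"
        # bucket is populated; s3so carries client_kwargs pointing at the
        # local moto endpoint, so no real AWS credentials are involved.
        df = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
        path = "s3://pandas-test/storage_options_roundtrip.csv"

        # storage_options is handed straight through to s3fs on both the
        # write and the read path.
        df.to_csv(path, index=False, storage_options=s3so)
        result = pd.read_csv(path, storage_options=s3so)

        tm.assert_frame_equal(result, df)

Scoping s3_base to the module keeps a single moto_server process alive across the tests that need it, while the function-scoped s3_resource recreates the bucket contents for each test and cleans them up afterwards.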