From 71a813ddd08d22518598a4f0f8d610b65afeb4fb Mon Sep 17 00:00:00 2001 From: Chris Stadler Date: Tue, 6 Aug 2019 08:38:51 -0400 Subject: [PATCH 1/9] Avoid calling S3File.s3 When reading from s3 using fastparquet. This attribute was removed in s3fs 0.3.0. This change avoids accessing it by using a new method get_file_and_filesystem which returns the filesystem in addition to the file. --- pandas/io/parquet.py | 5 +++-- pandas/io/s3.py | 19 +++++++++++++------ 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 82c460300582b..175ecdac2ef5c 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -8,6 +8,7 @@ from pandas import DataFrame, get_option from pandas.io.common import get_filepath_or_buffer, is_s3_url +from pandas.io.s3 import get_file_and_filesystem def get_engine(engine): @@ -187,9 +188,9 @@ def read(self, path, columns=None, **kwargs): # When path is s3:// an S3File is returned. # We need to retain the original path(str) while also # pass the S3File().open function to fsatparquet impl. - s3, _, _, should_close = get_filepath_or_buffer(path) + s3, filesystem = get_file_and_filesystem(path) try: - parquet_file = self.api.ParquetFile(path, open_with=s3.s3.open) + parquet_file = self.api.ParquetFile(path, open_with=filesystem.open) finally: s3.close() else: diff --git a/pandas/io/s3.py b/pandas/io/s3.py index 0a7c082fec51c..ff3a9d26111ff 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -14,9 +14,7 @@ def _strip_schema(url): return result.netloc + result.path -def get_filepath_or_buffer( - filepath_or_buffer, encoding=None, compression=None, mode=None -): +def get_file_and_filesystem(filepath_or_buffer, encoding=None, mode=None): from botocore.exceptions import NoCredentialsError if mode is None: @@ -24,7 +22,7 @@ def get_filepath_or_buffer( fs = s3fs.S3FileSystem(anon=False) try: - filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) + file = fs.open(_strip_schema(filepath_or_buffer), mode) except (FileNotFoundError, NoCredentialsError): # boto3 has troubles when trying to access a public file # when credentialed... @@ -33,5 +31,14 @@ def get_filepath_or_buffer( # A NoCredentialsError is raised if you don't have creds # for that bucket. fs = s3fs.S3FileSystem(anon=True) - filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer), mode) - return filepath_or_buffer, None, compression, True + file = fs.open(_strip_schema(filepath_or_buffer), mode) + return file, fs + + +def get_filepath_or_buffer( + filepath_or_buffer, encoding=None, compression=None, mode=None +): + file, _fs = get_file_and_filesystem( + filepath_or_buffer, encoding=encoding, mode=mode + ) + return file, None, compression, True From dc17b6656a87cfad904cdeb7bb0b3d346cc98d2a Mon Sep 17 00:00:00 2001 From: Chris Stadler Date: Tue, 6 Aug 2019 09:05:44 -0400 Subject: [PATCH 2/9] Only import s3fs when necessary --- pandas/io/parquet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 175ecdac2ef5c..12d1fdd94f4df 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -8,7 +8,6 @@ from pandas import DataFrame, get_option from pandas.io.common import get_filepath_or_buffer, is_s3_url -from pandas.io.s3 import get_file_and_filesystem def get_engine(engine): @@ -185,6 +184,7 @@ def write( def read(self, path, columns=None, **kwargs): if is_s3_url(path): + from pandas.io.s3 import get_file_and_filesystem # When path is s3:// an S3File is returned. # We need to retain the original path(str) while also # pass the S3File().open function to fsatparquet impl. From 01ea49de34b86b91c354d34db632e449cd70690e Mon Sep 17 00:00:00 2001 From: Chris Stadler Date: Tue, 6 Aug 2019 09:29:59 -0400 Subject: [PATCH 3/9] formatting --- pandas/io/parquet.py | 1 + 1 file changed, 1 insertion(+) diff --git a/pandas/io/parquet.py b/pandas/io/parquet.py index 12d1fdd94f4df..6fc70e9f4a737 100644 --- a/pandas/io/parquet.py +++ b/pandas/io/parquet.py @@ -185,6 +185,7 @@ def write( def read(self, path, columns=None, **kwargs): if is_s3_url(path): from pandas.io.s3 import get_file_and_filesystem + # When path is s3:// an S3File is returned. # We need to retain the original path(str) while also # pass the S3File().open function to fsatparquet impl. From cfc6db0f5ede750f8c76ff80885c70fdfa19888a Mon Sep 17 00:00:00 2001 From: Chris Stadler Date: Tue, 6 Aug 2019 14:21:30 -0400 Subject: [PATCH 4/9] Add whatsnew entry --- doc/source/whatsnew/v0.25.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 943a6adb7944e..48358792a407e 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -103,7 +103,7 @@ MultiIndex I/O ^^^ -- +- Avoid calling ``S3File.s3`` when reading parquet as this was removed in s3fs version 0.3.0 (:issue:`27756`) - - From 6f3958cf8b9b04bc8bf763f4933cb1a72c9d5181 Mon Sep 17 00:00:00 2001 From: Chris Stadler Date: Wed, 7 Aug 2019 09:32:34 -0400 Subject: [PATCH 5/9] Add comma in whatsnew entry Co-Authored-By: gfyoung --- doc/source/whatsnew/v0.25.1.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v0.25.1.rst b/doc/source/whatsnew/v0.25.1.rst index 48358792a407e..7d78a8fe6dd84 100644 --- a/doc/source/whatsnew/v0.25.1.rst +++ b/doc/source/whatsnew/v0.25.1.rst @@ -103,7 +103,7 @@ MultiIndex I/O ^^^ -- Avoid calling ``S3File.s3`` when reading parquet as this was removed in s3fs version 0.3.0 (:issue:`27756`) +- Avoid calling ``S3File.s3`` when reading parquet, as this was removed in s3fs version 0.3.0 (:issue:`27756`) - - From 2a014619213a1473af8f1ad632c55ee09bcef62f Mon Sep 17 00:00:00 2001 From: Chris Stadler Date: Thu, 8 Aug 2019 13:24:28 -0400 Subject: [PATCH 6/9] Add type annotations --- pandas/io/s3.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/pandas/io/s3.py b/pandas/io/s3.py index ff3a9d26111ff..3e971d80e39b5 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -1,11 +1,17 @@ +from typing import Optional, Tuple, TYPE_CHECKING +from pandas._typing import FilePathOrBuffer + """ s3 support for remote file interactivity """ from urllib.parse import urlparse as parse_url from pandas.compat._optional import import_optional_dependency -s3fs = import_optional_dependency( - "s3fs", extra="The s3fs package is required to handle s3 files." -) +if TYPE_CHECKING: + import s3fs +else: + s3fs = import_optional_dependency( + "s3fs", extra="The s3fs package is required to handle s3 files." + ) def _strip_schema(url): @@ -14,7 +20,9 @@ def _strip_schema(url): return result.netloc + result.path -def get_file_and_filesystem(filepath_or_buffer, encoding=None, mode=None): +def get_file_and_filesystem( + filepath_or_buffer: FilePathOrBuffer, mode: Optional[str] = None +) -> Tuple[s3fs.S3File, s3fs.S3FileSystem]: from botocore.exceptions import NoCredentialsError if mode is None: @@ -36,9 +44,10 @@ def get_file_and_filesystem(filepath_or_buffer, encoding=None, mode=None): def get_filepath_or_buffer( - filepath_or_buffer, encoding=None, compression=None, mode=None -): - file, _fs = get_file_and_filesystem( - filepath_or_buffer, encoding=encoding, mode=mode - ) + filepath_or_buffer: FilePathOrBuffer, + encoding: Optional[str] = None, + compression: Optional[str] = None, + mode: Optional[str] = None, +) -> Tuple[s3fs.S3File, Optional[str], Optional[str], bool]: + file, _fs = get_file_and_filesystem(filepath_or_buffer, mode=mode) return file, None, compression, True From 35fb68735cfe55a4c59e353123f46c964f879508 Mon Sep 17 00:00:00 2001 From: Chris Stadler Date: Thu, 8 Aug 2019 15:47:04 -0400 Subject: [PATCH 7/9] Sort imports --- pandas/io/s3.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pandas/io/s3.py b/pandas/io/s3.py index 3e971d80e39b5..633419bf70069 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -1,10 +1,12 @@ -from typing import Optional, Tuple, TYPE_CHECKING +from typing import TYPE_CHECKING, Optional, Tuple +from urllib.parse import urlparse as parse_url + +from pandas.compat._optional import import_optional_dependency + from pandas._typing import FilePathOrBuffer """ s3 support for remote file interactivity """ -from urllib.parse import urlparse as parse_url -from pandas.compat._optional import import_optional_dependency if TYPE_CHECKING: import s3fs From fcda260470433fdd2a0373b3da259970e739fd40 Mon Sep 17 00:00:00 2001 From: Chris Stadler Date: Thu, 8 Aug 2019 16:51:18 -0400 Subject: [PATCH 8/9] Use IO instead of s3fs types --- pandas/io/s3.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/pandas/io/s3.py b/pandas/io/s3.py index 633419bf70069..9fc0103f6ead7 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, Optional, Tuple +from typing import IO, Any, Optional, Tuple from urllib.parse import urlparse as parse_url from pandas.compat._optional import import_optional_dependency @@ -7,13 +7,9 @@ """ s3 support for remote file interactivity """ - -if TYPE_CHECKING: - import s3fs -else: - s3fs = import_optional_dependency( - "s3fs", extra="The s3fs package is required to handle s3 files." - ) +s3fs = import_optional_dependency( + "s3fs", extra="The s3fs package is required to handle s3 files." +) def _strip_schema(url): @@ -24,7 +20,7 @@ def _strip_schema(url): def get_file_and_filesystem( filepath_or_buffer: FilePathOrBuffer, mode: Optional[str] = None -) -> Tuple[s3fs.S3File, s3fs.S3FileSystem]: +) -> Tuple[IO, Any]: from botocore.exceptions import NoCredentialsError if mode is None: @@ -50,6 +46,6 @@ def get_filepath_or_buffer( encoding: Optional[str] = None, compression: Optional[str] = None, mode: Optional[str] = None, -) -> Tuple[s3fs.S3File, Optional[str], Optional[str], bool]: +) -> Tuple[IO, Optional[str], Optional[str], bool]: file, _fs = get_file_and_filesystem(filepath_or_buffer, mode=mode) return file, None, compression, True From 1569685478297046a914a46901041e5f1f26741f Mon Sep 17 00:00:00 2001 From: Chris Stadler Date: Fri, 9 Aug 2019 09:30:40 -0400 Subject: [PATCH 9/9] Move comment to top of file --- pandas/io/s3.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pandas/io/s3.py b/pandas/io/s3.py index 9fc0103f6ead7..7e0a37e8cba20 100644 --- a/pandas/io/s3.py +++ b/pandas/io/s3.py @@ -1,3 +1,4 @@ +""" s3 support for remote file interactivity """ from typing import IO, Any, Optional, Tuple from urllib.parse import urlparse as parse_url @@ -5,8 +6,6 @@ from pandas._typing import FilePathOrBuffer -""" s3 support for remote file interactivity """ - s3fs = import_optional_dependency( "s3fs", extra="The s3fs package is required to handle s3 files." )