Skip to content

Commit 5950249

Browse files
committed
COMPAT/REF: Use s3fs for s3 IO
1 parent 2466ecb commit 5950249

File tree

6 files changed

+36
-107
lines changed

6 files changed

+36
-107
lines changed

doc/source/whatsnew/v0.20.0.txt

+7-2
Original file line numberDiff line numberDiff line change
@@ -61,11 +61,16 @@ Backwards incompatible API changes
6161

6262
.. _whatsnew_0200.api:
6363

64+
.. _whatsnew_0200.api.s3:
6465

65-
- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
66-
66+
S3 File Handling
67+
^^^^^^^^^^^^^^^^
6768

69+
pandas now uses `s3fs <http://s3fs.readthedocs.io/>`_ for handling S3 connections. This shouldn't break
70+
any code, but since s3fs is not a required dependency, you will need to install it separately (like boto
71+
in prior versions of pandas).
6872

73+
- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
6974

7075
Other API Changes
7176
^^^^^^^^^^^^^^^^^

pandas/io/common.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -251,10 +251,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
251251
return tuple(to_return)
252252

253253
if _is_s3_url(filepath_or_buffer):
254-
from pandas.io.s3 import get_filepath_or_buffer
255-
return get_filepath_or_buffer(filepath_or_buffer,
256-
encoding=encoding,
257-
compression=compression)
254+
from pandas.io import s3
255+
return s3.get_filepath_or_buffer(filepath_or_buffer,
256+
encoding=encoding,
257+
compression=compression)
258258

259259
# It is a pathlib.Path/py.path.local or string
260260
filepath_or_buffer = _stringify_path(filepath_or_buffer)

pandas/io/parsers.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,12 @@
4040
import pandas.lib as lib
4141
import pandas.parser as _parser
4242

43+
try:
44+
from s3fs import S3File
45+
need_text_wrapping = (compat.BytesIO, S3File)
46+
except ImportError:
47+
need_text_wrapping = (compat.BytesIO,)
48+
4349

4450
# BOM character (byte order mark)
4551
# This exists at the beginning of a file to indicate endianness
@@ -1899,7 +1905,7 @@ def __init__(self, f, **kwds):
18991905
f = _wrap_compressed(f, self.compression, self.encoding)
19001906
self.handles.append(f)
19011907
# in Python 3, convert BytesIO or fileobjects passed with an encoding
1902-
elif compat.PY3 and isinstance(f, compat.BytesIO):
1908+
elif compat.PY3 and isinstance(f, need_text_wrapping):
19031909
from io import TextIOWrapper
19041910

19051911
f = TextIOWrapper(f, encoding=self.encoding)

pandas/io/s3.py

+13-92
Original file line numberDiff line numberDiff line change
@@ -1,112 +1,33 @@
11
""" s3 support for remote file interactivity """
2-
3-
import os
42
from pandas import compat
5-
from pandas.compat import BytesIO
6-
73
try:
8-
import boto
9-
from boto.s3 import key
4+
import s3fs
105
except:
11-
raise ImportError("boto is required to handle s3 files")
6+
raise ImportError("The s3fs library is required to handle s3 files")
127

138
if compat.PY3:
149
from urllib.parse import urlparse as parse_url
1510
else:
1611
from urlparse import urlparse as parse_url
1712

1813

19-
class BotoFileLikeReader(key.Key):
20-
"""boto Key modified to be more file-like
21-
22-
This modification of the boto Key will read through a supplied
23-
S3 key once, then stop. The unmodified boto Key object will repeatedly
24-
cycle through a file in S3: after reaching the end of the file,
25-
boto will close the file. Then the next call to `read` or `next` will
26-
re-open the file and start reading from the beginning.
27-
28-
Also adds a `readline` function which will split the returned
29-
values by the `\n` character.
30-
"""
31-
32-
def __init__(self, *args, **kwargs):
33-
encoding = kwargs.pop("encoding", None) # Python 2 compat
34-
super(BotoFileLikeReader, self).__init__(*args, **kwargs)
35-
# Add a flag to mark the end of the read.
36-
self.finished_read = False
37-
self.buffer = ""
38-
self.lines = []
39-
if encoding is None and compat.PY3:
40-
encoding = "utf-8"
41-
self.encoding = encoding
42-
self.lines = []
43-
44-
def next(self):
45-
return self.readline()
46-
47-
__next__ = next
48-
49-
def read(self, *args, **kwargs):
50-
if self.finished_read:
51-
return b'' if compat.PY3 else ''
52-
return super(BotoFileLikeReader, self).read(*args, **kwargs)
53-
54-
def close(self, *args, **kwargs):
55-
self.finished_read = True
56-
return super(BotoFileLikeReader, self).close(*args, **kwargs)
57-
58-
def seekable(self):
59-
"""Needed for reading by bz2"""
60-
return False
61-
62-
def readline(self):
63-
"""Split the contents of the Key by '\n' characters."""
64-
if self.lines:
65-
retval = self.lines[0]
66-
self.lines = self.lines[1:]
67-
return retval
68-
if self.finished_read:
69-
if self.buffer:
70-
retval, self.buffer = self.buffer, ""
71-
return retval
72-
else:
73-
raise StopIteration
74-
75-
if self.encoding:
76-
self.buffer = "{}{}".format(
77-
self.buffer, self.read(8192).decode(self.encoding))
78-
else:
79-
self.buffer = "{}{}".format(self.buffer, self.read(8192))
80-
81-
split_buffer = self.buffer.split("\n")
82-
self.lines.extend(split_buffer[:-1])
83-
self.buffer = split_buffer[-1]
84-
85-
return self.readline()
14+
def _strip_schema(url):
15+
"""Returns the url without the s3:// part"""
16+
result = parse_url(url)
17+
return result.netloc + result.path
8618

8719

8820
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
8921
compression=None):
9022

9123
# Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST
9224
# are environment variables
93-
parsed_url = parse_url(filepath_or_buffer)
94-
s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com')
95-
25+
fs = s3fs.S3FileSystem(anon=False)
9626
try:
97-
conn = boto.connect_s3(host=s3_host)
98-
except boto.exception.NoAuthHandlerFound:
99-
conn = boto.connect_s3(host=s3_host, anon=True)
100-
101-
b = conn.get_bucket(parsed_url.netloc, validate=False)
102-
if compat.PY2 and (compression == 'gzip' or
103-
(compression == 'infer' and
104-
filepath_or_buffer.endswith(".gz"))):
105-
k = boto.s3.key.Key(b, parsed_url.path)
106-
filepath_or_buffer = BytesIO(k.get_contents_as_string(
107-
encoding=encoding))
108-
else:
109-
k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
110-
k.open('r') # Expose read errors immediately
111-
filepath_or_buffer = k
27+
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer))
28+
except OSError:
29+
# boto3 has trouble when trying to access a public file
30+
# when credentialed...
31+
fs = s3fs.S3FileSystem(anon=True)
32+
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer))
11233
return filepath_or_buffer, None, compression

pandas/io/tests/parser/test_network.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -39,9 +39,9 @@ class TestS3(tm.TestCase):
3939

4040
def setUp(self):
4141
try:
42-
import boto # noqa
42+
import s3fs # noqa
4343
except ImportError:
44-
raise nose.SkipTest("boto not installed")
44+
raise nose.SkipTest("s3fs not installed")
4545

4646
@tm.network
4747
def test_parse_public_s3_bucket(self):
@@ -175,15 +175,12 @@ def test_parse_public_s3_bucket_nrows_python(self):
175175

176176
@tm.network
177177
def test_s3_fails(self):
178-
import boto
179-
with tm.assertRaisesRegexp(boto.exception.S3ResponseError,
180-
'S3ResponseError: 404 Not Found'):
178+
with tm.assertRaises(IOError):
181179
read_csv('s3://nyqpug/asdf.csv')
182180

183181
# Receive a permission error when trying to read a private bucket.
184182
# It's irrelevant here that this isn't actually a table.
185-
with tm.assertRaisesRegexp(boto.exception.S3ResponseError,
186-
'S3ResponseError: 403 Forbidden'):
183+
with tm.assertRaises(IOError):
187184
read_csv('s3://cant_get_it/')
188185

189186
if __name__ == '__main__':

pandas/io/tests/test_excel.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def _skip_if_no_excelsuite():
6666

6767
def _skip_if_no_boto():
6868
try:
69-
import boto # NOQA
69+
import s3fs # noqa
7070
except ImportError:
7171
raise nose.SkipTest('s3fs not installed, skipping')
7272

0 commit comments

Comments
 (0)