Skip to content

Commit fb32200

Browse files
committed
COMPAT/REF: Use s3fs for s3 IO
1 parent d1b1720 commit fb32200

File tree

5 files changed

+38
-104
lines changed

5 files changed

+38
-104
lines changed

doc/source/whatsnew/v0.20.0.txt

+9-1
Original file line numberDiff line numberDiff line change
@@ -93,12 +93,20 @@ Backwards incompatible API changes
9393

9494
.. _whatsnew_0200.api:
9595

96+
.. _whatsnew_0200.api.s3:
97+
98+
S3 File Handling
99+
^^^^^^^^^^^^^^^^
100+
101+
pandas now uses `s3fs <http://s3fs.readthedocs.io/>`_ for handling S3 connections. This shouldn't break
102+
any code, but since s3fs is not a required dependency, you will need to install it separately (like boto
103+
in prior versions of pandas).
96104

97105
- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
98106
- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
99107

100108

101-
109+
- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
102110

103111
Other API Changes
104112
^^^^^^^^^^^^^^^^^

pandas/io/common.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@
1212
from pandas.core.common import AbstractMethodError
1313
from pandas.types.common import is_number
1414

15+
try:
16+
from s3fs import S3File
17+
need_text_wrapping = (BytesIO, S3File)
18+
except ImportError:
19+
need_text_wrapping = (BytesIO,)
20+
1521
# common NA values
1622
# no longer excluding inf representations
1723
# '1.#INF','-1.#INF', '1.#INF000000',
@@ -212,10 +218,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
212218
return reader, encoding, compression
213219

214220
if _is_s3_url(filepath_or_buffer):
215-
from pandas.io.s3 import get_filepath_or_buffer
216-
return get_filepath_or_buffer(filepath_or_buffer,
217-
encoding=encoding,
218-
compression=compression)
221+
from pandas.io import s3
222+
return s3.get_filepath_or_buffer(filepath_or_buffer,
223+
encoding=encoding,
224+
compression=compression)
219225

220226
# It is a pathlib.Path/py.path.local or string
221227
filepath_or_buffer = _stringify_path(filepath_or_buffer)
@@ -388,7 +394,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
388394
handles.append(f)
389395

390396
# in Python 3, convert BytesIO or fileobjects passed with an encoding
391-
if compat.PY3 and (compression or isinstance(f, compat.BytesIO)):
397+
if compat.PY3 and (compression or isinstance(f, need_text_wrapping)):
392398
from io import TextIOWrapper
393399
f = TextIOWrapper(f, encoding=encoding)
394400
handles.append(f)

pandas/io/s3.py

+13-90
Original file line numberDiff line numberDiff line change
@@ -1,110 +1,33 @@
11
""" s3 support for remote file interactivity """
2-
3-
import os
42
from pandas import compat
5-
from pandas.compat import BytesIO
6-
73
try:
8-
import boto
9-
from boto.s3 import key
4+
import s3fs
105
except:
11-
raise ImportError("boto is required to handle s3 files")
6+
raise ImportError("The s3fs library is required to handle s3 files")
127

138
if compat.PY3:
149
from urllib.parse import urlparse as parse_url
1510
else:
1611
from urlparse import urlparse as parse_url
1712

1813

19-
class BotoFileLikeReader(key.Key):
20-
"""boto Key modified to be more file-like
21-
22-
This modification of the boto Key will read through a supplied
23-
S3 key once, then stop. The unmodified boto Key object will repeatedly
24-
cycle through a file in S3: after reaching the end of the file,
25-
boto will close the file. Then the next call to `read` or `next` will
26-
re-open the file and start reading from the beginning.
27-
28-
Also adds a `readline` function which will split the returned
29-
values by the `\n` character.
30-
"""
31-
32-
def __init__(self, *args, **kwargs):
33-
encoding = kwargs.pop("encoding", None) # Python 2 compat
34-
super(BotoFileLikeReader, self).__init__(*args, **kwargs)
35-
# Add a flag to mark the end of the read.
36-
self.finished_read = False
37-
self.buffer = ""
38-
self.lines = []
39-
if encoding is None and compat.PY3:
40-
encoding = "utf-8"
41-
self.encoding = encoding
42-
self.lines = []
43-
44-
def next(self):
45-
return self.readline()
46-
47-
__next__ = next
48-
49-
def read(self, *args, **kwargs):
50-
if self.finished_read:
51-
return b'' if compat.PY3 else ''
52-
return super(BotoFileLikeReader, self).read(*args, **kwargs)
53-
54-
def close(self, *args, **kwargs):
55-
self.finished_read = True
56-
return super(BotoFileLikeReader, self).close(*args, **kwargs)
57-
58-
def seekable(self):
59-
"""Needed for reading by bz2"""
60-
return False
61-
62-
def readline(self):
63-
"""Split the contents of the Key by '\n' characters."""
64-
if self.lines:
65-
retval = self.lines[0]
66-
self.lines = self.lines[1:]
67-
return retval
68-
if self.finished_read:
69-
if self.buffer:
70-
retval, self.buffer = self.buffer, ""
71-
return retval
72-
else:
73-
raise StopIteration
74-
75-
if self.encoding:
76-
self.buffer = "{}{}".format(
77-
self.buffer, self.read(8192).decode(self.encoding))
78-
else:
79-
self.buffer = "{}{}".format(self.buffer, self.read(8192))
80-
81-
split_buffer = self.buffer.split("\n")
82-
self.lines.extend(split_buffer[:-1])
83-
self.buffer = split_buffer[-1]
84-
85-
return self.readline()
14+
def _strip_schema(url):
15+
"""Returns the url without the s3:// part"""
16+
result = parse_url(url)
17+
return result.netloc + result.path
8618

8719

8820
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
8921
compression=None):
9022

9123
# Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST
9224
# are environment variables
93-
parsed_url = parse_url(filepath_or_buffer)
94-
s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com')
95-
25+
fs = s3fs.S3FileSystem(anon=False)
9626
try:
97-
conn = boto.connect_s3(host=s3_host)
98-
except boto.exception.NoAuthHandlerFound:
99-
conn = boto.connect_s3(host=s3_host, anon=True)
100-
101-
b = conn.get_bucket(parsed_url.netloc, validate=False)
102-
if compat.PY2 and compression:
103-
k = boto.s3.key.Key(b, parsed_url.path)
104-
filepath_or_buffer = BytesIO(k.get_contents_as_string(
105-
encoding=encoding))
106-
else:
107-
k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
108-
k.open('r') # Expose read errors immediately
109-
filepath_or_buffer = k
27+
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer))
28+
except OSError:
29+
# boto3 has troubles when trying to access a public file
30+
# when credentialed...
31+
fs = s3fs.S3FileSystem(anon=True)
32+
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer))
11033
return filepath_or_buffer, None, compression

pandas/io/tests/parser/test_network.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,9 @@ class TestS3(tm.TestCase):
5757

5858
def setUp(self):
5959
try:
60-
import boto # noqa
60+
import s3fs # noqa
6161
except ImportError:
62-
raise nose.SkipTest("boto not installed")
62+
raise nose.SkipTest("s3fs not installed")
6363

6464
@tm.network
6565
def test_parse_public_s3_bucket(self):
@@ -193,15 +193,12 @@ def test_parse_public_s3_bucket_nrows_python(self):
193193

194194
@tm.network
195195
def test_s3_fails(self):
196-
import boto
197-
with tm.assertRaisesRegexp(boto.exception.S3ResponseError,
198-
'S3ResponseError: 404 Not Found'):
196+
with tm.assertRaises(IOError):
199197
read_csv('s3://nyqpug/asdf.csv')
200198

201199
# Receive a permission error when trying to read a private bucket.
202200
# It's irrelevant here that this isn't actually a table.
203-
with tm.assertRaisesRegexp(boto.exception.S3ResponseError,
204-
'S3ResponseError: 403 Forbidden'):
201+
with tm.assertRaises(IOError):
205202
read_csv('s3://cant_get_it/')
206203

207204
if __name__ == '__main__':

pandas/io/tests/test_excel.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ def _skip_if_no_excelsuite():
6666

6767
def _skip_if_no_boto():
6868
try:
69-
import boto # NOQA
69+
import s3fs # noqa
7070
except ImportError:
7171
raise nose.SkipTest('s3fs not installed, skipping')
7272

0 commit comments

Comments
 (0)