Skip to content

Commit dc4b070

Browse files
TomAugspurger and jreback
authored and committed
COMPAT/REF: Use s3fs for s3 IO
closes #11915 Author: Tom Augspurger <[email protected]> Closes #13137 from TomAugspurger/s3fs and squashes the following commits: 92ac063 [Tom Augspurger] CI: Update deps, docs 81690b5 [Tom Augspurger] COMPAT/REF: Use s3fs for s3 IO
1 parent 8c798c0 commit dc4b070

14 files changed

+72
-120
lines changed

asv_bench/benchmarks/io_bench.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ def setup(self, compression, engine):
153153
# The Python 2 C parser can't read bz2 from open files.
154154
raise NotImplementedError
155155
try:
156-
import boto
156+
import s3fs
157157
except ImportError:
158158
# Skip these benchmarks if `boto` is not installed.
159159
raise NotImplementedError

ci/requirements-2.7-64.run

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ sqlalchemy
1111
lxml=3.2.1
1212
scipy
1313
xlsxwriter
14-
boto
14+
s3fs
1515
bottleneck
1616
html5lib
1717
beautiful-soup

ci/requirements-2.7.run

+1-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ sqlalchemy=0.9.6
1111
lxml=3.2.1
1212
scipy
1313
xlsxwriter=0.4.6
14-
boto=2.36.0
14+
s3fs
1515
bottleneck
1616
psycopg2=2.5.2
1717
patsy

ci/requirements-2.7_SLOW.run

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ numexpr
1313
pytables
1414
sqlalchemy
1515
lxml
16-
boto
16+
s3fs
1717
bottleneck
1818
psycopg2
1919
pymysql

ci/requirements-3.5.run

+1-1
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ sqlalchemy
1717
pymysql
1818
psycopg2
1919
xarray
20-
boto
20+
s3fs
2121

2222
# incompat with conda ATM
2323
# beautiful-soup

ci/requirements-3.5_OSX.run

+1-1
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ matplotlib
1212
jinja2
1313
bottleneck
1414
xarray
15-
boto
15+
s3fs
1616

1717
# incompat with conda ATM
1818
# beautiful-soup

doc/source/install.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -262,7 +262,7 @@ Optional Dependencies
262262
* `XlsxWriter <https://pypi.python.org/pypi/XlsxWriter>`__: Alternative Excel writer
263263

264264
* `Jinja2 <http://jinja.pocoo.org/>`__: Template engine for conditional HTML formatting.
265-
* `boto <https://pypi.python.org/pypi/boto>`__: necessary for Amazon S3 access.
265+
* `s3fs <http://s3fs.readthedocs.io/>`__: necessary for Amazon S3 access (s3fs >= 0.0.7).
266266
* `blosc <https://pypi.python.org/pypi/blosc>`__: for msgpack compression using ``blosc``
267267
* One of `PyQt4
268268
<http://www.riverbankcomputing.com/software/pyqt/download>`__, `PySide

doc/source/io.rst

+17
Original file line numberDiff line numberDiff line change
@@ -1487,6 +1487,23 @@ options include:
14871487
Specifying any of the above options will produce a ``ParserWarning`` unless the
14881488
python engine is selected explicitly using ``engine='python'``.
14891489

1490+
Reading remote files
1491+
''''''''''''''''''''
1492+
1493+
You can pass in a URL to a CSV file:
1494+
1495+
.. code-block:: python
1496+
1497+
df = pd.read_csv('https://download.bls.gov/pub/time.series/cu/cu.item',
1498+
sep='\t')
1499+
1500+
S3 URLs are handled as well:
1501+
1502+
.. code-block:: python
1503+
1504+
df = pd.read_csv('s3://pandas-test/tips.csv')
1505+
1506+
14901507
Writing out Data
14911508
''''''''''''''''
14921509

doc/source/whatsnew/v0.20.0.txt

+10-3
Original file line numberDiff line numberDiff line change
@@ -108,12 +108,12 @@ Other enhancements
108108

109109
- ``.select_dtypes()`` now allows `datetimetz` to generically select datetimes with tz (:issue:`14910`)
110110

111+
111112
.. _whatsnew_0200.api_breaking:
112113

113114
Backwards incompatible API changes
114115
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
115116

116-
117117
.. _whatsnew.api_breaking.index_map
118118

119119
Map on Index types now return other Index types
@@ -182,8 +182,16 @@ Map on Index types now return other Index types
182182

183183
s.map(lambda x: x.hour)
184184

185+
.. _whatsnew_0200.s3:
186+
187+
S3 File Handling
188+
^^^^^^^^^^^^^^^^
185189

186-
.. _whatsnew_0200.api:
190+
pandas now uses `s3fs <http://s3fs.readthedocs.io/>`_ for handling S3 connections. This shouldn't break
191+
any code. However, since s3fs is not a required dependency, you will need to install it separately (like boto
192+
in prior versions of pandas) (:issue:`11915`).
193+
194+
.. _whatsnew_0200.api:
187195

188196
- ``CParserError`` has been renamed to ``ParserError`` in ``pd.read_csv`` and will be removed in the future (:issue:`12665`)
189197
- ``SparseArray.cumsum()`` and ``SparseSeries.cumsum()`` will now always return ``SparseArray`` and ``SparseSeries`` respectively (:issue:`12855`)
@@ -194,7 +202,6 @@ Map on Index types now return other Index types
194202
Other API Changes
195203
^^^^^^^^^^^^^^^^^
196204

197-
198205
.. _whatsnew_0200.deprecations:
199206

200207
Deprecations

pandas/io/common.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,12 @@
1212
from pandas.core.common import AbstractMethodError
1313
from pandas.types.common import is_number
1414

15+
try:
16+
from s3fs import S3File
17+
need_text_wrapping = (BytesIO, S3File)
18+
except ImportError:
19+
need_text_wrapping = (BytesIO,)
20+
1521
# common NA values
1622
# no longer excluding inf representations
1723
# '1.#INF','-1.#INF', '1.#INF000000',
@@ -212,10 +218,10 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
212218
return reader, encoding, compression
213219

214220
if _is_s3_url(filepath_or_buffer):
215-
from pandas.io.s3 import get_filepath_or_buffer
216-
return get_filepath_or_buffer(filepath_or_buffer,
217-
encoding=encoding,
218-
compression=compression)
221+
from pandas.io import s3
222+
return s3.get_filepath_or_buffer(filepath_or_buffer,
223+
encoding=encoding,
224+
compression=compression)
219225

220226
# It is a pathlib.Path/py.path.local or string
221227
filepath_or_buffer = _stringify_path(filepath_or_buffer)
@@ -391,7 +397,7 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
391397
handles.append(f)
392398

393399
# in Python 3, convert BytesIO or fileobjects passed with an encoding
394-
if compat.PY3 and (compression or isinstance(f, compat.BytesIO)):
400+
if compat.PY3 and (compression or isinstance(f, need_text_wrapping)):
395401
from io import TextIOWrapper
396402
f = TextIOWrapper(f, encoding=encoding)
397403
handles.append(f)

pandas/io/s3.py

+18-93
Original file line numberDiff line numberDiff line change
@@ -1,110 +1,35 @@
11
""" s3 support for remote file interactivity """
2-
3-
import os
42
from pandas import compat
5-
from pandas.compat import BytesIO
6-
73
try:
8-
import boto
9-
from boto.s3 import key
4+
import s3fs
5+
from botocore.exceptions import NoCredentialsError
106
except:
11-
raise ImportError("boto is required to handle s3 files")
7+
raise ImportError("The s3fs library is required to handle s3 files")
128

139
if compat.PY3:
1410
from urllib.parse import urlparse as parse_url
1511
else:
1612
from urlparse import urlparse as parse_url
1713

1814

19-
class BotoFileLikeReader(key.Key):
20-
"""boto Key modified to be more file-like
21-
22-
This modification of the boto Key will read through a supplied
23-
S3 key once, then stop. The unmodified boto Key object will repeatedly
24-
cycle through a file in S3: after reaching the end of the file,
25-
boto will close the file. Then the next call to `read` or `next` will
26-
re-open the file and start reading from the beginning.
27-
28-
Also adds a `readline` function which will split the returned
29-
values by the `\n` character.
30-
"""
31-
32-
def __init__(self, *args, **kwargs):
33-
encoding = kwargs.pop("encoding", None) # Python 2 compat
34-
super(BotoFileLikeReader, self).__init__(*args, **kwargs)
35-
# Add a flag to mark the end of the read.
36-
self.finished_read = False
37-
self.buffer = ""
38-
self.lines = []
39-
if encoding is None and compat.PY3:
40-
encoding = "utf-8"
41-
self.encoding = encoding
42-
self.lines = []
43-
44-
def next(self):
45-
return self.readline()
46-
47-
__next__ = next
48-
49-
def read(self, *args, **kwargs):
50-
if self.finished_read:
51-
return b'' if compat.PY3 else ''
52-
return super(BotoFileLikeReader, self).read(*args, **kwargs)
53-
54-
def close(self, *args, **kwargs):
55-
self.finished_read = True
56-
return super(BotoFileLikeReader, self).close(*args, **kwargs)
57-
58-
def seekable(self):
59-
"""Needed for reading by bz2"""
60-
return False
61-
62-
def readline(self):
63-
"""Split the contents of the Key by '\n' characters."""
64-
if self.lines:
65-
retval = self.lines[0]
66-
self.lines = self.lines[1:]
67-
return retval
68-
if self.finished_read:
69-
if self.buffer:
70-
retval, self.buffer = self.buffer, ""
71-
return retval
72-
else:
73-
raise StopIteration
74-
75-
if self.encoding:
76-
self.buffer = "{}{}".format(
77-
self.buffer, self.read(8192).decode(self.encoding))
78-
else:
79-
self.buffer = "{}{}".format(self.buffer, self.read(8192))
80-
81-
split_buffer = self.buffer.split("\n")
82-
self.lines.extend(split_buffer[:-1])
83-
self.buffer = split_buffer[-1]
84-
85-
return self.readline()
15+
def _strip_schema(url):
16+
"""Returns the url without the s3:// part"""
17+
result = parse_url(url)
18+
return result.netloc + result.path
8619

8720

8821
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
8922
compression=None):
90-
91-
# Assuming AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and AWS_S3_HOST
92-
# are environment variables
93-
parsed_url = parse_url(filepath_or_buffer)
94-
s3_host = os.environ.get('AWS_S3_HOST', 's3.amazonaws.com')
95-
23+
fs = s3fs.S3FileSystem(anon=False)
9624
try:
97-
conn = boto.connect_s3(host=s3_host)
98-
except boto.exception.NoAuthHandlerFound:
99-
conn = boto.connect_s3(host=s3_host, anon=True)
100-
101-
b = conn.get_bucket(parsed_url.netloc, validate=False)
102-
if compat.PY2 and compression:
103-
k = boto.s3.key.Key(b, parsed_url.path)
104-
filepath_or_buffer = BytesIO(k.get_contents_as_string(
105-
encoding=encoding))
106-
else:
107-
k = BotoFileLikeReader(b, parsed_url.path, encoding=encoding)
108-
k.open('r') # Expose read errors immediately
109-
filepath_or_buffer = k
25+
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer))
26+
except (OSError, NoCredentialsError):
27+
# boto3 has troubles when trying to access a public file
28+
# when credentialed...
29+
# An OSError is raised if you have credentials, but they
30+
# aren't valid for that bucket.
31+
# A NoCredentialsError is raised if you don't have creds
32+
# for that bucket.
33+
fs = s3fs.S3FileSystem(anon=True)
34+
filepath_or_buffer = fs.open(_strip_schema(filepath_or_buffer))
11035
return filepath_or_buffer, None, compression

pandas/io/tests/parser/test_network.py

+4-7
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,9 @@ class TestS3(tm.TestCase):
5656

5757
def setUp(self):
5858
try:
59-
import boto # noqa
59+
import s3fs # noqa
6060
except ImportError:
61-
raise nose.SkipTest("boto not installed")
61+
raise nose.SkipTest("s3fs not installed")
6262

6363
@tm.network
6464
def test_parse_public_s3_bucket(self):
@@ -174,15 +174,12 @@ def test_parse_public_s3_bucket_nrows_python(self):
174174

175175
@tm.network
176176
def test_s3_fails(self):
177-
import boto
178-
with tm.assertRaisesRegexp(boto.exception.S3ResponseError,
179-
'S3ResponseError: 404 Not Found'):
177+
with tm.assertRaises(IOError):
180178
read_csv('s3://nyqpug/asdf.csv')
181179

182180
# Receive a permission error when trying to read a private bucket.
183181
# It's irrelevant here that this isn't actually a table.
184-
with tm.assertRaisesRegexp(boto.exception.S3ResponseError,
185-
'S3ResponseError: 403 Forbidden'):
182+
with tm.assertRaises(IOError):
186183
read_csv('s3://cant_get_it/')
187184

188185
if __name__ == '__main__':

pandas/io/tests/test_excel.py

+4-4
Original file line numberDiff line numberDiff line change
@@ -64,11 +64,11 @@ def _skip_if_no_excelsuite():
6464
_skip_if_no_openpyxl()
6565

6666

67-
def _skip_if_no_boto():
67+
def _skip_if_no_s3fs():
6868
try:
69-
import boto # NOQA
69+
import s3fs # noqa
7070
except ImportError:
71-
raise nose.SkipTest('boto not installed, skipping')
71+
raise nose.SkipTest('s3fs not installed, skipping')
7272

7373

7474
_seriesd = tm.getSeriesData()
@@ -582,7 +582,7 @@ def test_read_from_http_url(self):
582582

583583
@tm.network(check_before_test=True)
584584
def test_read_from_s3_url(self):
585-
_skip_if_no_boto()
585+
_skip_if_no_s3fs()
586586

587587
url = ('s3://pandas-test/test1' + self.ext)
588588
url_table = read_excel(url)

pandas/util/print_versions.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,7 @@ def show_versions(as_json=False):
9494
("pymysql", lambda mod: mod.__version__),
9595
("psycopg2", lambda mod: mod.__version__),
9696
("jinja2", lambda mod: mod.__version__),
97-
("boto", lambda mod: mod.__version__),
97+
("s3fs", lambda mod: mod.__version__),
9898
("pandas_datareader", lambda mod: mod.__version__)
9999
]
100100

0 commit comments

Comments
 (0)