|
1 | 1 | """ s3 support for remote file interactivity """
|
2 |
| - |
3 |
| -import os |
4 | 2 | from pandas import compat
|
5 |
| -from pandas.compat import BytesIO |
6 |
| - |
7 | 3 | try:
|
8 |
| - import boto |
9 |
| - from boto.s3 import key |
| 4 | + import s3fs |
| 5 | + from botocore.exceptions import NoCredentialsError |
10 | 6 | except:
|
11 |
| - raise ImportError("boto is required to handle s3 files") |
| 7 | + raise ImportError("The s3fs library is required to handle s3 files") |
12 | 8 |
|
13 | 9 | if compat.PY3:
|
14 | 10 | from urllib.parse import urlparse as parse_url
|
15 | 11 | else:
|
16 | 12 | from urlparse import urlparse as parse_url
|
17 | 13 |
|
18 | 14 |
|
19 |
| -class BotoFileLikeReader(key.Key): |
20 |
| - """boto Key modified to be more file-like |
21 |
| -
|
22 |
| - This modification of the boto Key will read through a supplied |
23 |
| - S3 key once, then stop. The unmodified boto Key object will repeatedly |
24 |
| - cycle through a file in S3: after reaching the end of the file, |
25 |
| - boto will close the file. Then the next call to `read` or `next` will |
26 |
| - re-open the file and start reading from the beginning. |
27 |
| -
|
28 |
| - Also adds a `readline` function which will split the returned |
29 |
| - values by the `\n` character. |
30 |
| - """ |
31 |
| - |
32 |
| - def __init__(self, *args, **kwargs): |
33 |
| - encoding = kwargs.pop("encoding", None) # Python 2 compat |
34 |
| - super(BotoFileLikeReader, self).__init__(*args, **kwargs) |
35 |
| - # Add a flag to mark the end of the read. |
36 |
| - self.finished_read = False |
37 |
| - self.buffer = "" |
38 |
| - self.lines = [] |
39 |
| - if encoding is None and compat.PY3: |
40 |
| - encoding = "utf-8" |
41 |
| - self.encoding = encoding |
42 |
| - self.lines = [] |
43 |
| - |
44 |
| - def next(self): |
45 |
| - return self.readline() |
46 |
| - |
47 |
| - __next__ = next |
48 |
| - |
49 |
| - def read(self, *args, **kwargs): |
50 |
| - if self.finished_read: |
51 |
| - return b'' if compat.PY3 else '' |
52 |
| - return super(BotoFileLikeReader, self).read(*args, **kwargs) |
53 |
| - |
54 |
| - def close(self, *args, **kwargs): |
55 |
| - self.finished_read = True |
56 |
| - return super(BotoFileLikeReader, self).close(*args, **kwargs) |
57 |
| - |
58 |
| - def seekable(self): |
59 |
| - """Needed for reading by bz2""" |
60 |
| - return False |
61 |
| - |
62 |
| - def readline(self): |
63 |
| - """Split the contents of the Key by '\n' characters.""" |
64 |
| - if self.lines: |
65 |
| - retval = self.lines[0] |
66 |
| - self.lines = self.lines[1:] |
67 |
| - return retval |
68 |
| - if self.finished_read: |
69 |
| - if self.buffer: |
70 |
| - retval, self.buffer = self.buffer, "" |
71 |
| - return retval |
72 |
| - else: |
73 |
| - raise StopIteration |
74 |
| - |
75 |
| - if self.encoding: |
76 |
| - self.buffer = "{}{}".format( |
77 |
| - self.buffer, self.read(8192).decode(self.encoding)) |
78 |
| - else: |
79 |
| - self.buffer = "{}{}".format(self.buffer, self.read(8192)) |
80 |
| - |
81 |
| - split_buffer = self.buffer.split("\n") |
82 |
| - self.lines.extend(split_buffer[:-1]) |
83 |
| - self.buffer = split_buffer[-1] |
84 |
| - |
85 |
| - return self.readline() |
| 15 | +def _strip_schema(url): |
| 16 | + """Returns the url without the s3:// part""" |
| 17 | + result = parse_url(url) |
| 18 | + return result.netloc + result.path |
86 | 19 |
|
87 | 20 |
|
def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                           compression=None):
    """Open an S3 location and return a file handle for it.

    Parameters
    ----------
    filepath_or_buffer : str
        S3 url of the object; the scheme is removed with
        ``_strip_schema`` before the path is handed to s3fs.
    encoding : optional
        Accepted for signature compatibility; not used in this function.
    compression : optional
        Returned to the caller unchanged.

    Returns
    -------
    tuple
        ``(handle, None, compression)`` where ``handle`` is whatever
        ``S3FileSystem.open`` returned.
    """
    # First attempt: a credentialed (non-anonymous) filesystem.
    credentialed_fs = s3fs.S3FileSystem(anon=False)
    try:
        handle = credentialed_fs.open(_strip_schema(filepath_or_buffer))
    except (OSError, NoCredentialsError):
        # boto3 has troubles when trying to access a public file
        # when credentialed...
        # An OSError is raised if you have credentials, but they
        # aren't valid for that bucket.
        # A NoCredentialsError is raised if you don't have creds
        # for that bucket.
        # Either way, retry once as an anonymous client.
        anonymous_fs = s3fs.S3FileSystem(anon=True)
        handle = anonymous_fs.open(_strip_schema(filepath_or_buffer))
    return handle, None, compression
|
0 commit comments