Skip to content

Commit da6ad3f

Browse files
committed
Merge pull request #11074 from stephen-hoover/infer-s3-compression
ENH Add check for inferred compression before `get_filepath_or_buffer`
2 parents 6bdf948 + a49b2cd commit da6ad3f

File tree

5 files changed

+33
-25
lines changed

5 files changed

+33
-25
lines changed

doc/source/whatsnew/v0.17.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -481,6 +481,8 @@ Other enhancements
481481

482482
- Read CSV files from AWS S3 incrementally, instead of first downloading the entire file. (Full file download still required for compressed files in Python 2.) (:issue:`11070`, :issue:`11073`)
483483

484+
- ``pd.read_csv`` is now able to infer compression type for files read from AWS S3 storage (:issue:`11070`, :issue:`11074`).
485+
484486
.. _whatsnew_0170.api:
485487

486488
.. _whatsnew_0170.api_breaking:

pandas/io/common.py

+5-1
Original file line numberDiff line numberDiff line change
@@ -217,6 +217,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
217217
content_encoding = req.headers.get('Content-Encoding', None)
218218
if content_encoding == 'gzip':
219219
compression = 'gzip'
220+
else:
221+
compression = None
220222
# cat on the compression to the tuple returned by the function
221223
to_return = list(maybe_read_encoded_stream(req, encoding, compression)) + \
222224
[compression]
@@ -237,7 +239,9 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
237239
conn = boto.connect_s3(anon=True)
238240

239241
b = conn.get_bucket(parsed_url.netloc, validate=False)
240-
if compat.PY2 and compression == 'gzip':
242+
if compat.PY2 and (compression == 'gzip' or
243+
(compression == 'infer' and
244+
filepath_or_buffer.endswith(".gz"))):
241245
k = boto.s3.key.Key(b, parsed_url.path)
242246
filepath_or_buffer = BytesIO(k.get_contents_as_string(
243247
encoding=encoding))

pandas/io/parsers.py

+17-13
Original file line numberDiff line numberDiff line change
@@ -235,10 +235,25 @@ def _read(filepath_or_buffer, kwds):
235235
if skipfooter is not None:
236236
kwds['skip_footer'] = skipfooter
237237

238+
# If the input could be a filename, check for a recognizable compression extension.
239+
# If we're reading from a URL, the `get_filepath_or_buffer` will use header info
240+
# to determine compression, so use what it finds in that case.
241+
inferred_compression = kwds.get('compression')
242+
if inferred_compression == 'infer':
243+
if isinstance(filepath_or_buffer, compat.string_types):
244+
if filepath_or_buffer.endswith('.gz'):
245+
inferred_compression = 'gzip'
246+
elif filepath_or_buffer.endswith('.bz2'):
247+
inferred_compression = 'bz2'
248+
else:
249+
inferred_compression = None
250+
else:
251+
inferred_compression = None
252+
238253
filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer,
239254
encoding,
240255
compression=kwds.get('compression', None))
241-
kwds['compression'] = compression
256+
kwds['compression'] = inferred_compression if compression == 'infer' else compression
242257

243258
if kwds.get('date_parser', None) is not None:
244259
if isinstance(kwds['parse_dates'], bool):
@@ -301,7 +316,7 @@ def _read(filepath_or_buffer, kwds):
301316
'verbose': False,
302317
'encoding': None,
303318
'squeeze': False,
304-
'compression': 'infer',
319+
'compression': None,
305320
'mangle_dupe_cols': True,
306321
'tupleize_cols': False,
307322
'infer_datetime_format': False,
@@ -1402,17 +1417,6 @@ def __init__(self, f, **kwds):
14021417
self.comment = kwds['comment']
14031418
self._comment_lines = []
14041419

1405-
if self.compression == 'infer':
1406-
if isinstance(f, compat.string_types):
1407-
if f.endswith('.gz'):
1408-
self.compression = 'gzip'
1409-
elif f.endswith('.bz2'):
1410-
self.compression = 'bz2'
1411-
else:
1412-
self.compression = None
1413-
else:
1414-
self.compression = None
1415-
14161420
if isinstance(f, compat.string_types):
14171421
f = com._get_handle(f, 'r', encoding=self.encoding,
14181422
compression=self.compression)

pandas/io/tests/test_parsers.py

+9
Original file line numberDiff line numberDiff line change
@@ -4341,6 +4341,15 @@ def test_parse_public_s3_bucket_python(self):
43414341
self.assertFalse(df.empty)
43424342
tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df)
43434343

4344+
@tm.network
4345+
def test_infer_s3_compression(self):
4346+
for ext in ['', '.gz', '.bz2']:
4347+
df = pd.read_csv('s3://pandas-test/tips.csv' + ext,
4348+
engine='python', compression='infer')
4349+
self.assertTrue(isinstance(df, pd.DataFrame))
4350+
self.assertFalse(df.empty)
4351+
tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df)
4352+
43444353
@tm.network
43454354
def test_parse_public_s3_bucket_nrows_python(self):
43464355
for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:

pandas/parser.pyx

-11
Original file line numberDiff line numberDiff line change
@@ -541,17 +541,6 @@ cdef class TextReader:
541541
self.parser.cb_io = NULL
542542
self.parser.cb_cleanup = NULL
543543

544-
if self.compression == 'infer':
545-
if isinstance(source, basestring):
546-
if source.endswith('.gz'):
547-
self.compression = 'gzip'
548-
elif source.endswith('.bz2'):
549-
self.compression = 'bz2'
550-
else:
551-
self.compression = None
552-
else:
553-
self.compression = None
554-
555544
if self.compression:
556545
if self.compression == 'gzip':
557546
import gzip

0 commit comments

Comments (0)