From a49b2cd8b42d823909d5fcd3d391c3d9529471e3 Mon Sep 17 00:00:00 2001
From: Stephen Hoover
Date: Wed, 9 Sep 2015 20:47:18 -0500
Subject: [PATCH] ENH Move check for inferred compression to before
 `get_filepath_or_buffer`

When reading CSVs, if `compression='infer'`, check the input before
calling `get_filepath_or_buffer` in the `_read` function. This way we
can catch compression extensions on S3 files. We now attempt to infer
compression from an input filename only in the `_read` function,
instead of separately in each parser.
---
 doc/source/whatsnew/v0.17.0.txt |  2 ++
 pandas/io/common.py             |  6 +++++-
 pandas/io/parsers.py            | 30 +++++++++++++++++-------------
 pandas/io/tests/test_parsers.py |  9 +++++++++
 pandas/parser.pyx               | 11 -----------
 5 files changed, 33 insertions(+), 25 deletions(-)

diff --git a/doc/source/whatsnew/v0.17.0.txt b/doc/source/whatsnew/v0.17.0.txt
index 615bfc9e23253..a6e6c9f2f2b3c 100644
--- a/doc/source/whatsnew/v0.17.0.txt
+++ b/doc/source/whatsnew/v0.17.0.txt
@@ -481,6 +481,8 @@ Other enhancements
 - Read CSV files from AWS S3 incrementally, instead of first downloading the entire
   file. (Full file download still required for compressed files in Python 2.) (:issue:`11070`, :issue:`11073`)
 
+- ``pd.read_csv`` is now able to infer compression type for files read from AWS S3 storage (:issue:`11070`, :issue:`11074`).
+
 .. _whatsnew_0170.api:
 
 .. _whatsnew_0170.api_breaking:
diff --git a/pandas/io/common.py b/pandas/io/common.py
index 7095a0fd60f2a..e13c402b454d1 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -217,6 +217,8 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
         content_encoding = req.headers.get('Content-Encoding', None)
         if content_encoding == 'gzip':
             compression = 'gzip'
+        else:
+            compression = None
         # cat on the compression to the tuple returned by the function
         to_return = list(maybe_read_encoded_stream(req, encoding, compression)) + \
             [compression]
@@ -237,7 +239,9 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
             conn = boto.connect_s3(anon=True)
 
         b = conn.get_bucket(parsed_url.netloc, validate=False)
-        if compat.PY2 and compression == 'gzip':
+        if compat.PY2 and (compression == 'gzip' or
+                           (compression == 'infer' and
+                            filepath_or_buffer.endswith(".gz"))):
             k = boto.s3.key.Key(b, parsed_url.path)
             filepath_or_buffer = BytesIO(k.get_contents_as_string(
                 encoding=encoding))
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 736c08f72dee8..15e11193fd1b7 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -235,10 +235,25 @@ def _read(filepath_or_buffer, kwds):
     if skipfooter is not None:
         kwds['skip_footer'] = skipfooter
 
+    # If the input could be a filename, check for a recognizable compression extension.
+    # If we're reading from a URL, `get_filepath_or_buffer` will use header info
+    # to determine compression, so use what it finds in that case.
+    inferred_compression = kwds.get('compression')
+    if inferred_compression == 'infer':
+        if isinstance(filepath_or_buffer, compat.string_types):
+            if filepath_or_buffer.endswith('.gz'):
+                inferred_compression = 'gzip'
+            elif filepath_or_buffer.endswith('.bz2'):
+                inferred_compression = 'bz2'
+            else:
+                inferred_compression = None
+        else:
+            inferred_compression = None
+
     filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer,
                                                                 encoding,
                                                                 compression=kwds.get('compression', None))
-    kwds['compression'] = compression
+    kwds['compression'] = inferred_compression if compression == 'infer' else compression
 
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):
@@ -301,7 +316,7 @@ def _read(filepath_or_buffer, kwds):
     'verbose': False,
     'encoding': None,
     'squeeze': False,
-    'compression': 'infer',
+    'compression': None,
     'mangle_dupe_cols': True,
     'tupleize_cols': False,
     'infer_datetime_format': False,
@@ -1402,17 +1417,6 @@ def __init__(self, f, **kwds):
         self.comment = kwds['comment']
         self._comment_lines = []
 
-        if self.compression == 'infer':
-            if isinstance(f, compat.string_types):
-                if f.endswith('.gz'):
-                    self.compression = 'gzip'
-                elif f.endswith('.bz2'):
-                    self.compression = 'bz2'
-                else:
-                    self.compression = None
-            else:
-                self.compression = None
-
         if isinstance(f, compat.string_types):
             f = com._get_handle(f, 'r', encoding=self.encoding,
                                 compression=self.compression)
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 70a49e6bd6782..6b7132aea3280 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -4341,6 +4341,15 @@ def test_parse_public_s3_bucket_python(self):
             self.assertFalse(df.empty)
             tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df)
 
+    @tm.network
+    def test_infer_s3_compression(self):
+        for ext in ['', '.gz', '.bz2']:
+            df = pd.read_csv('s3://pandas-test/tips.csv' + ext,
+                             engine='python', compression='infer')
+            self.assertTrue(isinstance(df, pd.DataFrame))
+            self.assertFalse(df.empty)
+            tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df)
+
     @tm.network
     def test_parse_public_s3_bucket_nrows_python(self):
         for ext, comp in [('', None), ('.gz', 'gzip'), ('.bz2', 'bz2')]:
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 647e8e72414e9..8ac1f64f2d50e 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -541,17 +541,6 @@ cdef class TextReader:
         self.parser.cb_io = NULL
         self.parser.cb_cleanup = NULL
 
-        if self.compression == 'infer':
-            if isinstance(source, basestring):
-                if source.endswith('.gz'):
-                    self.compression = 'gzip'
-                elif source.endswith('.bz2'):
-                    self.compression = 'bz2'
-                else:
-                    self.compression = None
-            else:
-                self.compression = None
-
         if self.compression:
             if self.compression == 'gzip':
                 import gzip
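
A minimal usage sketch of the behavior this patch enables (not part of the
patch itself; it assumes a local `tips.csv.gz` exists, and uses the public
`pandas-test` bucket that the new test above reads from):

    import pandas as pd

    # Local file: with compression='infer', the '.gz' extension is now
    # detected once in `_read`, rather than separately in each parser engine.
    local_df = pd.read_csv('tips.csv.gz', compression='infer')

    # S3 object: the extension check now runs before `get_filepath_or_buffer`
    # is called, so compressed S3 keys no longer need an explicit
    # compression= argument when 'infer' is requested.
    s3_df = pd.read_csv('s3://pandas-test/tips.csv.bz2', compression='infer')

Note that since the default for `compression` becomes `None`, inference only
happens when the caller opts in with `compression='infer'`; an explicit
`'gzip'` or `'bz2'` is passed through unchanged.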