diff --git a/doc/source/io.rst b/doc/source/io.rst index 1c8a1159ab162..a6c702e1cd874 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -89,6 +89,8 @@ They can take a number of arguments: - ``delim_whitespace``: Parse whitespace-delimited (spaces or tabs) file (much faster than using a regular expression) - ``compression``: decompress ``'gzip'`` and ``'bz2'`` formats on the fly. + Set to ``'infer'`` (the default) to guess a format based on the file + extension. - ``dialect``: string or :class:`python:csv.Dialect` instance to expose more ways to specify the file format - ``dtype``: A data type name or a dict of column name to data type. If not diff --git a/doc/source/whatsnew/v0.16.1.txt b/doc/source/whatsnew/v0.16.1.txt old mode 100644 new mode 100755 index a6e917827b755..659aa6786b366 --- a/doc/source/whatsnew/v0.16.1.txt +++ b/doc/source/whatsnew/v0.16.1.txt @@ -80,6 +80,7 @@ API changes - :meth:`~pandas.DataFrame.assign` now inserts new columns in alphabetical order. Previously the order was arbitrary. (:issue:`9777`) +- By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior (no decompression). (:issue:`9770`) .. _whatsnew_0161.performance: diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py old mode 100644 new mode 100755 index fef02dcb6e0c5..59ecb29146315 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -56,8 +56,11 @@ class ParserWarning(Warning): dtype : Type name or dict of column -> type Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} (Unsupported with engine='python') -compression : {'gzip', 'bz2', None}, default None - For on-the-fly decompression of on-disk data +compression : {'gzip', 'bz2', 'infer', None}, default 'infer' + For on-the-fly decompression of on-disk data. 
If 'infer', then use gzip or + bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2', + respectively, and no decompression otherwise. Set to None for no + decompression. dialect : string or csv.Dialect instance, default None If None defaults to Excel dialect. Ignored if sep longer than 1 char See csv.Dialect documentation for more details @@ -295,7 +298,7 @@ def _read(filepath_or_buffer, kwds): 'verbose': False, 'encoding': None, 'squeeze': False, - 'compression': None, + 'compression': 'infer', 'mangle_dupe_cols': True, 'tupleize_cols': False, 'infer_datetime_format': False, @@ -335,7 +338,7 @@ def _make_parser_function(name, sep=','): def parser_f(filepath_or_buffer, sep=sep, dialect=None, - compression=None, + compression='infer', doublequote=True, escapechar=None, @@ -1317,6 +1320,7 @@ def _wrap_compressed(f, compression, encoding=None): """ compression = compression.lower() encoding = encoding or get_option('display.encoding') + if compression == 'gzip': import gzip @@ -1389,6 +1393,17 @@ def __init__(self, f, **kwds): self.comment = kwds['comment'] self._comment_lines = [] + if self.compression == 'infer': + if isinstance(f, compat.string_types): + if f.endswith('.gz'): + self.compression = 'gzip' + elif f.endswith('.bz2'): + self.compression = 'bz2' + else: + self.compression = None + else: + self.compression = None + if isinstance(f, compat.string_types): f = com._get_handle(f, 'r', encoding=self.encoding, compression=self.compression) diff --git a/pandas/io/tests/data/test1.csv.bz2 b/pandas/io/tests/data/test1.csv.bz2 new file mode 100644 index 0000000000000..f96f26a8e7419 Binary files /dev/null and b/pandas/io/tests/data/test1.csv.bz2 differ diff --git a/pandas/io/tests/data/test1.csv.gz b/pandas/io/tests/data/test1.csv.gz new file mode 100644 index 0000000000000..1336db6e2af7e Binary files /dev/null and b/pandas/io/tests/data/test1.csv.gz differ diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py old mode 100644 new 
mode 100755
index b7016ad6cffae..799872d036c4f
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -1098,6 +1098,27 @@ def test_read_csv_no_index_name(self):
         self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64)
         tm.assert_frame_equal(df, df2)
 
+    def test_read_csv_infer_compression(self):
+        # GH 9770
+        expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)
+
+        # Path inputs: 'infer' picks the codec from the '.gz' / '.bz2'
+        # suffix and uses no decompression for the plain path.
+        paths = [self.csv1, self.csv1 + '.gz', self.csv1 + '.bz2']
+
+        for path in paths:
+            df = self.read_csv(path, index_col=0, parse_dates=True,
+                               compression='infer')
+            tm.assert_frame_equal(expected, df)
+
+        # An open file object is not a path string, so 'infer' falls back
+        # to no decompression. The with-block closes the handle even if
+        # the read or the comparison raises.
+        with open(self.csv1) as handle:
+            df = self.read_csv(handle, index_col=0, parse_dates=True,
+                               compression='infer')
+            tm.assert_frame_equal(expected, df)
+
     def test_read_table_unicode(self):
         fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8'))
         df1 = read_table(fin, sep=";", encoding="utf-8", header=None)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 73a03fc5cef7c..b28e0587264d4 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -541,6 +541,17 @@ cdef class TextReader:
         self.parser.cb_io = NULL
         self.parser.cb_cleanup = NULL
 
+        if self.compression == 'infer':
+            if isinstance(source, basestring):
+                if source.endswith('.gz'):
+                    self.compression = 'gzip'
+                elif source.endswith('.bz2'):
+                    self.compression = 'bz2'
+                else:
+                    self.compression = None
+            else:
+                self.compression = None
+
         if self.compression:
             if self.compression == 'gzip':
                 import gzip