Skip to content

Commit 7fe1c69

Browse files
committed
ENH: Add option in read_csv to infer compression type from filename
1 parent 5dff7df commit 7fe1c69

File tree

7 files changed

+48
-4
lines changed

7 files changed

+48
-4
lines changed

doc/source/io.rst

+2
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,8 @@ They can take a number of arguments:
8989
- ``delim_whitespace``: Parse whitespace-delimited (spaces or tabs) file
9090
(much faster than using a regular expression)
9191
- ``compression``: decompress ``'gzip'`` and ``'bz2'`` formats on the fly.
92+
Set to ``'infer'`` (the default) to guess a format based on the file
93+
extension.
9294
- ``dialect``: string or :class:`python:csv.Dialect` instance to expose more
9395
ways to specify the file format
9496
- ``dtype``: A data type name or a dict of column name to data type. If not

doc/source/whatsnew/v0.16.1.txt

+3
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ Enhancements
1919

2020
- Added ``StringMethods.capitalize()`` and ``StringMethods.swapcase()``, which behave the same as the standard ``str`` methods (:issue:`9766`)
2121

22+
2223
- ``DataFrame.mask()`` and ``Series.mask()`` now support same keywords as ``where`` (:issue:`8801`)
2324

2425

@@ -55,6 +56,8 @@ API changes
5556
- :meth:`~pandas.DataFrame.assign` now inserts new columns in alphabetical order. Previously
5657
the order was arbitrary. (:issue:`9777`)
5758

59+
- By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior. (:issue:`9770`)
60+
5861

5962
.. _whatsnew_0161.performance:
6063

pandas/io/parsers.py

+18-4
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,10 @@ class ParserWarning(Warning):
5555
dtype : Type name or dict of column -> type
5656
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
5757
(Unsupported with engine='python')
58-
compression : {'gzip', 'bz2', None}, default None
59-
For on-the-fly decompression of on-disk data
58+
compression : {'gzip', 'bz2', 'infer', None}, default 'infer'
59+
For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
60+
bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
61+
respectively, and no decompression otherwise.
6062
dialect : string or csv.Dialect instance, default None
6163
If None defaults to Excel dialect. Ignored if sep longer than 1 char
6264
See csv.Dialect documentation for more details
@@ -294,7 +296,7 @@ def _read(filepath_or_buffer, kwds):
294296
'verbose': False,
295297
'encoding': None,
296298
'squeeze': False,
297-
'compression': None,
299+
'compression': 'infer',
298300
'mangle_dupe_cols': True,
299301
'tupleize_cols': False,
300302
'infer_datetime_format': False,
@@ -334,7 +336,7 @@ def _make_parser_function(name, sep=','):
334336
def parser_f(filepath_or_buffer,
335337
sep=sep,
336338
dialect=None,
337-
compression=None,
339+
compression='infer',
338340

339341
doublequote=True,
340342
escapechar=None,
@@ -1316,6 +1318,7 @@ def _wrap_compressed(f, compression, encoding=None):
13161318
"""
13171319
compression = compression.lower()
13181320
encoding = encoding or get_option('display.encoding')
1321+
13191322
if compression == 'gzip':
13201323
import gzip
13211324

@@ -1388,6 +1391,17 @@ def __init__(self, f, **kwds):
13881391
self.comment = kwds['comment']
13891392
self._comment_lines = []
13901393

1394+
if self.compression == 'infer':
1395+
if isinstance(f, compat.string_types):
1396+
if f.endswith('.gz'):
1397+
self.compression = 'gzip'
1398+
elif f.endswith('.bz2'):
1399+
self.compression = 'bz2'
1400+
else:
1401+
self.compression = None
1402+
else:
1403+
self.compression = None
1404+
13911405
if isinstance(f, compat.string_types):
13921406
f = com._get_handle(f, 'r', encoding=self.encoding,
13931407
compression=self.compression)

pandas/io/tests/data/test1.csv.bz2

307 Bytes
Binary file not shown.

pandas/io/tests/data/test1.csv.gz

294 Bytes
Binary file not shown.

pandas/io/tests/test_parsers.py

+14
Original file line numberDiff line numberDiff line change
@@ -1098,6 +1098,20 @@ def test_read_csv_no_index_name(self):
10981098
self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64)
10991099
tm.assert_frame_equal(df, df2)
11001100

1101+
def test_read_csv_infer_compression(self):
1102+
expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)
1103+
1104+
inputs = [self.csv1, self.csv1 + '.gz',
1105+
self.csv1 + '.bz2', open(self.csv1)]
1106+
1107+
for f in inputs:
1108+
df = self.read_csv(f, index_col=0, parse_dates=True,
1109+
compression='infer')
1110+
1111+
tm.assert_frame_equal(expected, df)
1112+
1113+
inputs[3].close()
1114+
11011115
def test_read_table_unicode(self):
11021116
fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8'))
11031117
df1 = read_table(fin, sep=";", encoding="utf-8", header=None)

pandas/parser.pyx

+11
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,17 @@ cdef class TextReader:
541541
self.parser.cb_io = NULL
542542
self.parser.cb_cleanup = NULL
543543

544+
if self.compression == 'infer':
545+
if isinstance(source, basestring):
546+
if source.endswith('.gz'):
547+
self.compression = 'gzip'
548+
elif source.endswith('.bz2'):
549+
self.compression = 'bz2'
550+
else:
551+
self.compression = None
552+
else:
553+
self.compression = None
554+
544555
if self.compression:
545556
if self.compression == 'gzip':
546557
import gzip

0 commit comments

Comments
 (0)