Skip to content

Commit 529cd3d

Browse files
committed
Merge pull request #9770 from evanpw/infer_compression
ENH: Add option in read_csv to infer compression type from filename
2 parents f9f88b2 + 6cb41c6 commit 529cd3d

File tree

7 files changed

+48
-4
lines changed

7 files changed

+48
-4
lines changed

doc/source/io.rst

+2
Original file line number | Diff line number | Diff line change
@@ -89,6 +89,8 @@ They can take a number of arguments:
8989
- ``delim_whitespace``: Parse whitespace-delimited (spaces or tabs) file
9090
(much faster than using a regular expression)
9191
- ``compression``: decompress ``'gzip'`` and ``'bz2'`` formats on the fly.
92+
Set to ``'infer'`` (the default) to guess a format based on the file
93+
extension.
9294
- ``dialect``: string or :class:`python:csv.Dialect` instance to expose more
9395
ways to specify the file format
9496
- ``dtype``: A data type name or a dict of column name to data type. If not

doc/source/whatsnew/v0.16.1.txt

100644 → 100755
+1
Original file line number | Diff line number | Diff line change
@@ -80,6 +80,7 @@ API changes
8080
- :meth:`~pandas.DataFrame.assign` now inserts new columns in alphabetical order. Previously
8181
the order was arbitrary. (:issue:`9777`)
8282

83+
- By default, ``read_csv`` and ``read_table`` will now try to infer the compression type based on the file extension. Set ``compression=None`` to restore the previous behavior (no decompression). (:issue:`9770`)
8384

8485
.. _whatsnew_0161.performance:
8586

pandas/io/parsers.py

100644 → 100755
+19-4
Original file line number | Diff line number | Diff line change
@@ -56,8 +56,11 @@ class ParserWarning(Warning):
5656
dtype : Type name or dict of column -> type
5757
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
5858
(Unsupported with engine='python')
59-
compression : {'gzip', 'bz2', None}, default None
60-
For on-the-fly decompression of on-disk data
59+
compression : {'gzip', 'bz2', 'infer', None}, default 'infer'
60+
For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
61+
bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
62+
respectively, and no decompression otherwise. Set to None for no
63+
decompression.
6164
dialect : string or csv.Dialect instance, default None
6265
If None defaults to Excel dialect. Ignored if sep longer than 1 char
6366
See csv.Dialect documentation for more details
@@ -295,7 +298,7 @@ def _read(filepath_or_buffer, kwds):
295298
'verbose': False,
296299
'encoding': None,
297300
'squeeze': False,
298-
'compression': None,
301+
'compression': 'infer',
299302
'mangle_dupe_cols': True,
300303
'tupleize_cols': False,
301304
'infer_datetime_format': False,
@@ -335,7 +338,7 @@ def _make_parser_function(name, sep=','):
335338
def parser_f(filepath_or_buffer,
336339
sep=sep,
337340
dialect=None,
338-
compression=None,
341+
compression='infer',
339342

340343
doublequote=True,
341344
escapechar=None,
@@ -1317,6 +1320,7 @@ def _wrap_compressed(f, compression, encoding=None):
13171320
"""
13181321
compression = compression.lower()
13191322
encoding = encoding or get_option('display.encoding')
1323+
13201324
if compression == 'gzip':
13211325
import gzip
13221326

@@ -1389,6 +1393,17 @@ def __init__(self, f, **kwds):
13891393
self.comment = kwds['comment']
13901394
self._comment_lines = []
13911395

1396+
if self.compression == 'infer':
1397+
if isinstance(f, compat.string_types):
1398+
if f.endswith('.gz'):
1399+
self.compression = 'gzip'
1400+
elif f.endswith('.bz2'):
1401+
self.compression = 'bz2'
1402+
else:
1403+
self.compression = None
1404+
else:
1405+
self.compression = None
1406+
13921407
if isinstance(f, compat.string_types):
13931408
f = com._get_handle(f, 'r', encoding=self.encoding,
13941409
compression=self.compression)

pandas/io/tests/data/test1.csv.bz2

307 Bytes
Binary file not shown.

pandas/io/tests/data/test1.csv.gz

294 Bytes
Binary file not shown.

pandas/io/tests/test_parsers.py

100644 → 100755
+15
Original file line number | Diff line number | Diff line change
@@ -1098,6 +1098,21 @@ def test_read_csv_no_index_name(self):
10981098
self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64)
10991099
tm.assert_frame_equal(df, df2)
11001100

1101+
def test_read_csv_infer_compression(self):
1102+
# GH 9770
1103+
expected = self.read_csv(self.csv1, index_col=0, parse_dates=True)
1104+
1105+
inputs = [self.csv1, self.csv1 + '.gz',
1106+
self.csv1 + '.bz2', open(self.csv1)]
1107+
1108+
for f in inputs:
1109+
df = self.read_csv(f, index_col=0, parse_dates=True,
1110+
compression='infer')
1111+
1112+
tm.assert_frame_equal(expected, df)
1113+
1114+
inputs[3].close()
1115+
11011116
def test_read_table_unicode(self):
11021117
fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8'))
11031118
df1 = read_table(fin, sep=";", encoding="utf-8", header=None)

pandas/parser.pyx

+11
Original file line number | Diff line number | Diff line change
@@ -541,6 +541,17 @@ cdef class TextReader:
541541
self.parser.cb_io = NULL
542542
self.parser.cb_cleanup = NULL
543543

544+
if self.compression == 'infer':
545+
if isinstance(source, basestring):
546+
if source.endswith('.gz'):
547+
self.compression = 'gzip'
548+
elif source.endswith('.bz2'):
549+
self.compression = 'bz2'
550+
else:
551+
self.compression = None
552+
else:
553+
self.compression = None
554+
544555
if self.compression:
545556
if self.compression == 'gzip':
546557
import gzip

0 commit comments

Comments
 (0)