Skip to content

Commit e235925

Browse files
committed
ENH: Add option in read_csv to infer compression type from filename
1 parent a004c59 commit e235925

File tree

7 files changed

+44
-3
lines changed

7 files changed

+44
-3
lines changed

doc/source/io.rst

+1
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,7 @@ They can take a number of arguments:
8989
- ``delim_whitespace``: Parse whitespace-delimited (spaces or tabs) file
9090
(much faster than using a regular expression)
9191
- ``compression``: decompress ``'gzip'`` and ``'bz2'`` formats on the fly.
92+
Set to ``'infer'`` to guess the format from the file extension (``'.gz'`` for gzip, ``'.bz2'`` for bz2).
9293
- ``dialect``: string or :class:`python:csv.Dialect` instance to expose more
9394
ways to specify the file format
9495
- ``dtype``: A data type name or a dict of column name to data type. If not

doc/source/whatsnew/v0.16.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ We recommend that all users upgrade to this version.
1616

1717
Enhancements
1818
~~~~~~~~~~~~
19-
19+
- Setting the ``compression`` argument of ``read_csv`` or ``read_table`` to ``'infer'`` will now guess the compression type based on the file extension.
2020

2121

2222

pandas/io/parsers.py

+16-2
Original file line numberDiff line numberDiff line change
@@ -55,8 +55,10 @@ class ParserWarning(Warning):
5555
dtype : Type name or dict of column -> type
5656
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
5757
(Unsupported with engine='python')
58-
compression : {'gzip', 'bz2', None}, default None
59-
For on-the-fly decompression of on-disk data
58+
compression : {'gzip', 'bz2', 'infer', None}, default None
59+
For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
60+
bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
61+
respectively, and None (no decompression) otherwise.
6062
dialect : string or csv.Dialect instance, default None
6163
If None defaults to Excel dialect. Ignored if sep longer than 1 char
6264
See csv.Dialect documentation for more details
@@ -1314,6 +1316,7 @@ def _wrap_compressed(f, compression, encoding=None):
13141316
"""
13151317
compression = compression.lower()
13161318
encoding = encoding or get_option('display.encoding')
1319+
13171320
if compression == 'gzip':
13181321
import gzip
13191322

@@ -1386,6 +1389,17 @@ def __init__(self, f, **kwds):
13861389
self.comment = kwds['comment']
13871390
self._comment_lines = []
13881391

1392+
if self.compression == 'infer':
1393+
if isinstance(f, compat.string_types):
1394+
if f.endswith('.gz'):
1395+
self.compression = 'gzip'
1396+
elif f.endswith('.bz2'):
1397+
self.compression = 'bz2'
1398+
else:
1399+
self.compression = None
1400+
else:
1401+
self.compression = None
1402+
13891403
if isinstance(f, compat.string_types):
13901404
f = com._get_handle(f, 'r', encoding=self.encoding,
13911405
compression=self.compression)

pandas/io/tests/data/test1.csv.bz2

307 Bytes
Binary file not shown.

pandas/io/tests/data/test1.csv.gz

294 Bytes
Binary file not shown.

pandas/io/tests/test_parsers.py

+15
Original file line numberDiff line numberDiff line change
@@ -1071,6 +1071,21 @@ def test_read_csv_no_index_name(self):
10711071
self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64)
10721072
tm.assert_frame_equal(df, df2)
10731073

1074+
def test_read_csv_infer_compression(self):
    """Check that ``compression='infer'`` handles every supported input.

    The same CSV is read from a plain path, a ``.gz`` path, a ``.bz2`` path
    and an already-open file object; each read must produce the expected
    columns, index name, index element type and value dtype.
    """
    paths = [self.csv1, self.csv1 + '.gz', self.csv1 + '.bz2']

    # Use a context manager so the handle is closed even if read_csv or one
    # of the assertions below raises part-way through the loop.
    with open(self.csv1) as handle:
        for f in paths + [handle]:
            df = self.read_csv(f, index_col=0, parse_dates=True,
                               compression='infer')

            self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D'])
            self.assertEqual(df.index.name, 'index')
            self.assertIsInstance(df.index[0],
                                  (datetime, np.datetime64, Timestamp))
            self.assertEqual(df.values.dtype, np.float64)
10741089
def test_read_table_unicode(self):
10751090
fin = BytesIO(u('\u0141aski, Jan;1').encode('utf-8'))
10761091
df1 = read_table(fin, sep=";", encoding="utf-8", header=None)

pandas/parser.pyx

+11
Original file line numberDiff line numberDiff line change
@@ -541,6 +541,17 @@ cdef class TextReader:
541541
self.parser.cb_io = NULL
542542
self.parser.cb_cleanup = NULL
543543

544+
if self.compression == 'infer':
545+
if isinstance(source, basestring):
546+
if source.endswith('.gz'):
547+
self.compression = 'gzip'
548+
elif source.endswith('.bz2'):
549+
self.compression = 'bz2'
550+
else:
551+
self.compression = None
552+
else:
553+
self.compression = None
554+
544555
if self.compression:
545556
if self.compression == 'gzip':
546557
import gzip

0 commit comments

Comments
 (0)