Skip to content

Commit 7ee6d08

Browse files
Mahmoud Lababidijreback
Mahmoud Lababidi
authored and committed
ENH: Add ZIP file decompression and TestCompression.
closes #12175 closes #11413
1 parent 4844ac1 commit 7ee6d08

File tree

6 files changed

+252
-117
lines changed

6 files changed

+252
-117
lines changed

doc/source/whatsnew/v0.18.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiI
5656
Other Enhancements
5757
^^^^^^^^^^^^^^^^^^
5858

59+
- ``pd.read_csv()`` now supports opening ZIP files that contain a single CSV, via extension inference or explicit ``compression='zip'`` (:issue:`12175`)
5960
- ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).
6061

6162
.. _whatsnew_0181.api:

pandas/io/common.py

+15
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,21 @@ def _get_handle(path, mode, encoding=None, compression=None):
360360
elif compression == 'bz2':
361361
import bz2
362362
f = bz2.BZ2File(path, mode)
363+
elif compression == 'zip':
364+
import zipfile
365+
zip_file = zipfile.ZipFile(path)
366+
zip_names = zip_file.namelist()
367+
368+
if len(zip_names) == 1:
369+
file_name = zip_names.pop()
370+
f = zip_file.open(file_name)
371+
elif len(zip_names) == 0:
372+
raise ValueError('Zero files found in ZIP file {}'
373+
.format(path))
374+
else:
375+
raise ValueError('Multiple files found in ZIP file.'
376+
' Only one file per ZIP :{}'
377+
.format(zip_names))
363378
else:
364379
raise ValueError('Unrecognized compression type: %s' %
365380
compression)

pandas/io/parsers.py

+30-5
Original file line numberDiff line numberDiff line change
@@ -158,11 +158,16 @@ class ParserWarning(Warning):
158158
information
159159
<http://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_ on
160160
``iterator`` and ``chunksize``.
161-
compression : {'infer', 'gzip', 'bz2', None}, default 'infer'
162-
For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
163-
bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
164-
respectively, and no decompression otherwise. Set to None for no
165-
decompression.
161+
compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer'
162+
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
163+
bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or
164+
'.zip', respectively, and no decompression otherwise. If using 'zip',
165+
the ZIP file must contain only one data file to be read in.
166+
Set to None for no decompression.
167+
168+
.. versionadded:: 0.18.1
169+
170+
support for 'zip' compression.
166171
thousands : str, default None
167172
Thousands separator
168173
decimal : str, default '.'
@@ -273,6 +278,8 @@ def _read(filepath_or_buffer, kwds):
273278
inferred_compression = 'gzip'
274279
elif filepath_or_buffer.endswith('.bz2'):
275280
inferred_compression = 'bz2'
281+
elif filepath_or_buffer.endswith('.zip'):
282+
inferred_compression = 'zip'
276283
else:
277284
inferred_compression = None
278285
else:
@@ -1397,6 +1404,24 @@ def _wrap_compressed(f, compression, encoding=None):
13971404
data = bz2.decompress(f.read())
13981405
f = StringIO(data)
13991406
return f
1407+
elif compression == 'zip':
1408+
import zipfile
1409+
zip_file = zipfile.ZipFile(f)
1410+
zip_names = zip_file.namelist()
1411+
1412+
if len(zip_names) == 1:
1413+
file_name = zip_names.pop()
1414+
f = zip_file.open(file_name)
1415+
return f
1416+
1417+
elif len(zip_names) == 0:
1418+
raise ValueError('Corrupted or zero files found in compressed '
1419+
'zip file %s', zip_file.filename)
1420+
1421+
else:
1422+
raise ValueError('Multiple files found in compressed '
1423+
'zip file %s', str(zip_names))
1424+
14001425
else:
14011426
raise ValueError('do not recognize compression method %s'
14021427
% compression)

0 commit comments

Comments
 (0)