Skip to content

Commit 68eb884

Browse files
author
Mahmoud Lababidi
committed
Add ZIP file decompression and TestCompression.
Fix PEP8 issues. Change Compression to be a Mixin. Add Compression Mixin correctly with current Tests. Add .format, Rename Compression, with-block, empty zip, bad-zip
1 parent 4844ac1 commit 68eb884

File tree

6 files changed

+249
-117
lines changed

6 files changed

+249
-117
lines changed

doc/source/whatsnew/v0.18.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiI
5656
Other Enhancements
5757
^^^^^^^^^^^^^^^^^^
5858

59+
- ``pd.read_csv()`` now supports opening ZIP files that contain a single CSV file (:issue:`12175`)
5960
- ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).
6061

6162
.. _whatsnew_0181.api:

pandas/io/common.py

+15
Original file line numberDiff line numberDiff line change
@@ -360,6 +360,21 @@ def _get_handle(path, mode, encoding=None, compression=None):
360360
elif compression == 'bz2':
361361
import bz2
362362
f = bz2.BZ2File(path, mode)
363+
elif compression == 'zip':
364+
import zipfile
365+
zip_file = zipfile.ZipFile(path)
366+
zip_names = zip_file.namelist()
367+
368+
if len(zip_names) == 1:
369+
file_name = zip_names.pop()
370+
f = zip_file.open(file_name)
371+
elif len(zip_names) == 0:
372+
raise ValueError('Zero files found in ZIP file {}'
373+
.format(path))
374+
else:
375+
raise ValueError('Multiple files found in ZIP file.'
376+
' Only one file per ZIP :{}'
377+
.format(zip_names))
363378
else:
364379
raise ValueError('Unrecognized compression type: %s' %
365380
compression)

pandas/io/parsers.py

+27-5
Original file line numberDiff line numberDiff line change
@@ -158,11 +158,12 @@ class ParserWarning(Warning):
158158
information
159159
<http://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_ on
160160
``iterator`` and ``chunksize``.
161-
compression : {'infer', 'gzip', 'bz2', None}, default 'infer'
162-
For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
163-
bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
164-
respectively, and no decompression otherwise. Set to None for no
165-
decompression.
161+
compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer'
162+
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
163+
bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or
164+
'.zip', respectively, and no decompression otherwise. New in 0.18.1: ZIP
165+
compression. If using 'zip', the ZIP file must contain only one data file
166+
to be read in. Set to None for no decompression.
166167
thousands : str, default None
167168
Thousands separator
168169
decimal : str, default '.'
@@ -273,6 +274,8 @@ def _read(filepath_or_buffer, kwds):
273274
inferred_compression = 'gzip'
274275
elif filepath_or_buffer.endswith('.bz2'):
275276
inferred_compression = 'bz2'
277+
elif filepath_or_buffer.endswith('.zip'):
278+
inferred_compression = 'zip'
276279
else:
277280
inferred_compression = None
278281
else:
@@ -1397,6 +1400,25 @@ def _wrap_compressed(f, compression, encoding=None):
13971400
data = bz2.decompress(f.read())
13981401
f = StringIO(data)
13991402
return f
1403+
elif compression == 'zip':
1404+
import zipfile
1405+
zip_file = zipfile.ZipFile(f)
1406+
zip_names = zip_file.namelist()
1407+
print('ZIPNAMES' + zip_names)
1408+
1409+
if len(zip_names) == 1:
1410+
file_name = zip_names.pop()
1411+
f = zip_file.open(file_name)
1412+
return f
1413+
1414+
elif len(zip_names) == 0:
1415+
raise ValueError('Corrupted or zero files found in compressed '
1416+
'zip file %s', zip_file.filename)
1417+
1418+
else:
1419+
raise ValueError('Multiple files found in compressed '
1420+
'zip file %s', str(zip_names))
1421+
14001422
else:
14011423
raise ValueError('do not recognize compression method %s'
14021424
% compression)

0 commit comments

Comments
 (0)