pandas-dev · lababidi · Jan 29, 2016 · jreback · Mar 18, 2016 · jreback
diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
@@ -56,6 +56,7 @@ Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiI
 Other Enhancements
 ^^^^^^^^^^^^^^^^^^
 
+- ``pd.read_csv()`` now supports opening ZIP files that contains a single CSV (:issue:`12175`)
 - ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).
 
 .. _whatsnew_0181.api:

diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -360,6 +360,21 @@ def _get_handle(path, mode, encoding=None, compression=None):
         elif compression == 'bz2':
             import bz2
             f = bz2.BZ2File(path, mode)
+        elif compression == 'zip':
+            import zipfile
+            zip_file = zipfile.ZipFile(path)
+            zip_names = zip_file.namelist()
+
+            if len(zip_names) == 1:
+                file_name = zip_names.pop()
+                f = zip_file.open(file_name)
+            elif len(zip_names) == 0:
+                raise ValueError('Zero files found in ZIP file {}'
+                                 .format(path))
+            else:
+                raise ValueError('Multiple files found in ZIP file.'
+                                 ' Only one file per ZIP :{}'
+                                 .format(zip_names))
         else:
             raise ValueError('Unrecognized compression type: %s' %
                              compression)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
@@ -158,11 +158,12 @@ class ParserWarning(Warning):
     information
     <http://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_ on
     ``iterator`` and ``chunksize``.
-compression : {'infer', 'gzip', 'bz2', None}, default 'infer'
-    For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
-    bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
-    respectively, and no decompression otherwise. Set to None for no
-    decompression.
+compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer'
+    For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
+    bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or
+    '.zip', respectively, and no decompression otherwise. New in 0.18.1: ZIP
+    compression  If using 'zip', the ZIP file must contain only one data file
+    to be read in. Set to None for no decompression.
 thousands : str, default None
     Thousands separator
 decimal : str, default '.'
@@ -273,6 +274,8 @@ def _read(filepath_or_buffer, kwds):
                 inferred_compression = 'gzip'
             elif filepath_or_buffer.endswith('.bz2'):
                 inferred_compression = 'bz2'
+            elif filepath_or_buffer.endswith('.zip'):
+                inferred_compression = 'zip'
             else:
                 inferred_compression = None
         else:
@@ -1397,6 +1400,25 @@ def _wrap_compressed(f, compression, encoding=None):
             data = bz2.decompress(f.read())
             f = StringIO(data)
         return f
+    elif compression == 'zip':
+        import zipfile
+        zip_file = zipfile.ZipFile(f)
+        zip_names = zip_file.namelist()
+        print('ZIPNAMES' + zip_names)
+
+        if len(zip_names) == 1:
+            file_name = zip_names.pop()
+            f = zip_file.open(file_name)
+            return f
+
+        elif len(zip_names) == 0:
+            raise ValueError('Corrupted or zero files found in compressed '
+                             'zip file %s', zip_file.filename)
+
+        else:
+            raise ValueError('Multiple files found in compressed '
+                             'zip file %s', str(zip_names))
+
     else:
         raise ValueError('do not recognize compression method %s'
                          % compression)