From 68eb884e361cf8f1b3dbc0e628ecf7a7b6fc9418 Mon Sep 17 00:00:00 2001
From: Mahmoud Lababidi
Date: Fri, 29 Jan 2016 10:11:20 -0500
Subject: [PATCH] Add ZIP file decompression and TestCompression. Fix PEP8
 issues. Change Compression to be a Mixin. Add Compression Mixin correctly
 with current Tests. Add .format, Rename Compression, with-block, empty zip,
 bad-zip

---
 doc/source/whatsnew/v0.18.1.txt   |   1 +
 pandas/io/common.py               |  15 ++
 pandas/io/parsers.py              |  32 +++-
 pandas/io/tests/test_parsers.py   | 300 +++++++++++++++++++-----------
 pandas/parser.pyx                 |  15 ++
 pandas/tests/frame/test_to_csv.py |   3 +-
 6 files changed, 249 insertions(+), 117 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
index 1ced927001a94..ff8c3347c64ff 100644
--- a/doc/source/whatsnew/v0.18.1.txt
+++ b/doc/source/whatsnew/v0.18.1.txt
@@ -56,6 +56,7 @@ Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiI
 Other Enhancements
 ^^^^^^^^^^^^^^^^^^

+- ``pd.read_csv()`` now supports opening ZIP files that contain a single CSV (:issue:`12175`)
 - ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).

 .. _whatsnew_0181.api:
diff --git a/pandas/io/common.py b/pandas/io/common.py
index be8c3ccfe08e6..d44057178d27e 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -360,6 +360,21 @@ def _get_handle(path, mode, encoding=None, compression=None):
         elif compression == 'bz2':
             import bz2
             f = bz2.BZ2File(path, mode)
+        elif compression == 'zip':
+            import zipfile
+            zip_file = zipfile.ZipFile(path)
+            zip_names = zip_file.namelist()
+
+            if len(zip_names) == 1:
+                file_name = zip_names.pop()
+                f = zip_file.open(file_name)
+            elif len(zip_names) == 0:
+                raise ValueError('Zero files found in ZIP file {}'
+                                 .format(path))
+            else:
+                raise ValueError('Multiple files found in ZIP file.'
+                                 ' Only one file per ZIP: {}'
+                                 .format(zip_names))
         else:
             raise ValueError('Unrecognized compression type: %s' %
                              compression)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 36a9abdfbca60..49fbadadfb719 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -158,11 +158,12 @@ class ParserWarning(Warning):
     information `_ on ``iterator`` and ``chunksize``.
-compression : {'infer', 'gzip', 'bz2', None}, default 'infer'
-    For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
-    bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
-    respectively, and no decompression otherwise. Set to None for no
-    decompression.
+compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer'
+    For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
+    bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or
+    '.zip', respectively, and no decompression otherwise. New in 0.18.1: ZIP
+    compression. If using 'zip', the ZIP file must contain only one data file
+    to be read in. Set to None for no decompression.
 thousands : str, default None
     Thousands separator
 decimal : str, default '.'
@@ -273,6 +274,8 @@ def _read(filepath_or_buffer, kwds):
             inferred_compression = 'gzip'
         elif filepath_or_buffer.endswith('.bz2'):
             inferred_compression = 'bz2'
+        elif filepath_or_buffer.endswith('.zip'):
+            inferred_compression = 'zip'
         else:
             inferred_compression = None
     else:
@@ -1397,6 +1400,24 @@ def _wrap_compressed(f, compression, encoding=None):
         data = bz2.decompress(f.read())
         f = StringIO(data)
         return f
+    elif compression == 'zip':
+        import zipfile
+        zip_file = zipfile.ZipFile(f)
+        zip_names = zip_file.namelist()
+
+        if len(zip_names) == 1:
+            file_name = zip_names.pop()
+            f = zip_file.open(file_name)
+            return f
+
+        elif len(zip_names) == 0:
+            raise ValueError('Corrupted or zero files found in compressed '
+                             'zip file %s' % zip_file.filename)
+
+        else:
+            raise ValueError('Multiple files found in compressed '
+                             'zip file %s' % str(zip_names))
+
     else:
         raise ValueError('do not recognize compression method %s'
                          % compression)
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 9f53fc1ded882..7c7b40d77e821 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -3,44 +3,39 @@
 # flake8: noqa

-from datetime import datetime
 import csv
 import os
-import sys
-import re
-import nose
 import platform
 from distutils.version import LooseVersion
+import re
+import sys
+from datetime import datetime
 from multiprocessing.pool import ThreadPool

-from numpy import nan
+import nose
 import numpy as np
-from pandas.io.common import DtypeWarning
+import pandas.lib as lib
+import pandas.parser
+from numpy import nan
+from numpy.testing.decorators import slow
+from pandas.lib import Timestamp

+import pandas as pd
+import pandas.io.parsers as parsers
+import pandas.tseries.tools as tools
+import pandas.util.testing as tm
 from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
+from pandas import compat
 from pandas.compat import(
     StringIO, BytesIO, PY3, range, long, lrange, lmap, u
 )
-from pandas.io.common import URLError
-import pandas.io.parsers as parsers
+from pandas.compat import parse_date
+from pandas.core.common import AbstractMethodError
+from pandas.io.common import DtypeWarning, URLError
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                TextFileReader, TextParser)
-
-import pandas.util.testing as tm
-import pandas as pd
-
-from pandas.core.common import AbstractMethodError
-from pandas.compat import parse_date
-import pandas.lib as lib
-from pandas import compat
-from pandas.lib import Timestamp
 from pandas.tseries.index import date_range
-import pandas.tseries.tools as tools
-
-from numpy.testing.decorators import slow
-
-import pandas.parser


 class ParserTests(object):
@@ -2696,7 +2691,166 @@ def test_uneven_lines_with_usecols(self):
         tm.assert_frame_equal(df, expected)


-class TestPythonParser(ParserTests, tm.TestCase):
+class CompressionTests(object):
+    def test_zip(self):
+        try:
+            import zipfile
+        except ImportError:
+            raise nose.SkipTest('need zipfile to run')
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean('test_file.zip') as path:
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.writestr('test_file', data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='zip')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+            if self.engine != 'python':
+                with open(path, 'rb') as f:
+                    result = self.read_csv(f,
+                                           compression='zip')
+                    tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean('combined_zip.zip') as path:
+            inner_file_names = ['test_file', 'second_file']
+            tmp = zipfile.ZipFile(path, mode='w')
+            for file_name in inner_file_names:
+                tmp.writestr(file_name, data)
+            tmp.close()
+
+            self.assertRaisesRegexp(ValueError, 'Multiple files', self.read_csv,
+                                    path, compression='zip')
+
+            self.assertRaisesRegexp(ValueError, 'Multiple files', self.read_csv,
+                                    path, compression='infer')
+
+        with tm.ensure_clean() as path:
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.close()
+
+            self.assertRaisesRegexp(ValueError, 'Zero files', self.read_csv,
+                                    path, compression='zip')
+
+        with tm.ensure_clean() as path:
+            with open(path, 'wb') as f:
+                self.assertRaises(zipfile.BadZipfile, self.read_csv, f, compression='zip')
+
+
+    def test_gzip(self):
+        try:
+            import gzip
+        except ImportError:
+            raise nose.SkipTest('need gzip to run')
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='gzip')
+            tm.assert_frame_equal(result, expected)
+
+            with open(path, 'rb') as f:
+                result = self.read_csv(f, compression='gzip')
+                tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean('test.gz') as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+    def test_bz2(self):
+        try:
+            import bz2
+        except ImportError:
+            raise nose.SkipTest('need bz2 to run')
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='bz2')
+            tm.assert_frame_equal(result, expected)
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+            with open(path, 'rb') as fin:
+                if compat.PY3:
+                    result = self.read_csv(fin, compression='bz2')
+                    tm.assert_frame_equal(result, expected)
+                elif self.engine != 'python':
+                    self.assertRaises(ValueError, self.read_csv,
+                                      fin, compression='bz2')
+
+        with tm.ensure_clean('test.bz2') as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+    def test_decompression_regex_sep(self):
+        try:
+            import gzip
+            import bz2
+        except ImportError:
+            raise nose.SkipTest('need gzip and bz2 to run')
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            data = data.replace(b',', b'::')
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            # GH 6607
+            # Test currently only valid with the python engine because of
+            # regex sep. Temporarily copied to TestPythonParser.
+            # Here test for ValueError when passing regex sep:
+
+            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
+                result = self.read_csv(path, sep='::', compression='gzip', engine='c')
+                tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            # GH 6607
+            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
+                result = self.read_csv(path, sep='::', compression='bz2', engine='c')
+                tm.assert_frame_equal(result, expected)
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+
+class TestPythonParser(ParserTests, CompressionTests, tm.TestCase):
+
+    engine = 'python'

     def test_negative_skipfooter_raises(self):
         text = """#foo,a,b,c
@@ -2716,12 +2870,12 @@ def test_negative_skipfooter_raises(self):

     def read_csv(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'python'
+        kwds['engine'] = self.engine
         return read_csv(*args, **kwds)

     def read_table(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'python'
+        kwds['engine'] = self.engine
         return read_table(*args, **kwds)

     def float_precision_choices(self):
@@ -3521,17 +3675,19 @@ def test_buffer_rd_bytes(self):
         except Exception as e:
             pass

-class TestCParserHighMemory(CParserTests, tm.TestCase):
+
+class TestCParserHighMemory(CParserTests, CompressionTests, tm.TestCase):
+    engine = 'c'

     def read_csv(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'c'
+        kwds['engine'] = self.engine
         kwds['low_memory'] = False
         return read_csv(*args, **kwds)

     def read_table(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'c'
+        kwds['engine'] = self.engine
         kwds['low_memory'] = False
         return read_table(*args, **kwds)

@@ -3832,18 +3988,20 @@ def test_single_char_leading_whitespace(self):
         tm.assert_frame_equal(result, expected)


-class TestCParserLowMemory(CParserTests, tm.TestCase):
+class TestCParserLowMemory(CParserTests, CompressionTests, tm.TestCase):
+
+    engine = 'c'

     def read_csv(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'c'
+        kwds['engine'] = self.engine
         kwds['low_memory'] = True
         kwds['buffer_lines'] = 2
         return read_csv(*args, **kwds)

     def read_table(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'c'
+        kwds['engine'] = self.engine
         kwds['low_memory'] = True
         kwds['buffer_lines'] = 2
         return read_table(*args, **kwds)
@@ -4060,86 +4218,6 @@ def test_pure_python_failover(self):
         expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
         tm.assert_frame_equal(result, expected)

-    def test_decompression(self):
-        try:
-            import gzip
-            import bz2
-        except ImportError:
-            raise nose.SkipTest('need gzip and bz2 to run')
-
-        data = open(self.csv1, 'rb').read()
-        expected = self.read_csv(self.csv1)
-
-        with tm.ensure_clean() as path:
-            tmp = gzip.GzipFile(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='gzip')
-            tm.assert_frame_equal(result, expected)
-
-            result = self.read_csv(open(path, 'rb'), compression='gzip')
-            tm.assert_frame_equal(result, expected)
-
-        with tm.ensure_clean() as path:
-            tmp = bz2.BZ2File(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='bz2')
-            tm.assert_frame_equal(result, expected)
-
-            # result = self.read_csv(open(path, 'rb'), compression='bz2')
-            # tm.assert_frame_equal(result, expected)
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='bz3')
-
-            with open(path, 'rb') as fin:
-                if compat.PY3:
-                    result = self.read_csv(fin, compression='bz2')
-                    tm.assert_frame_equal(result, expected)
-                else:
-                    self.assertRaises(ValueError, self.read_csv,
-                                      fin, compression='bz2')
-
-    def test_decompression_regex_sep(self):
-        try:
-            import gzip
-            import bz2
-        except ImportError:
-            raise nose.SkipTest('need gzip and bz2 to run')
-
-        data = open(self.csv1, 'rb').read()
-        data = data.replace(b',', b'::')
-        expected = self.read_csv(self.csv1)
-
-        with tm.ensure_clean() as path:
-            tmp = gzip.GzipFile(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            # GH 6607
-            # Test currently only valid with the python engine because of
-            # regex sep. Temporarily copied to TestPythonParser.
-            # Here test for ValueError when passing regex sep:
-
-            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
-                result = self.read_csv(path, sep='::', compression='gzip')
-                tm.assert_frame_equal(result, expected)
-
-        with tm.ensure_clean() as path:
-            tmp = bz2.BZ2File(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            # GH 6607
-            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
-                result = self.read_csv(path, sep='::', compression='bz2')
-                tm.assert_frame_equal(result, expected)
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='bz3')

     def test_memory_map(self):
         # it works!
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index e2ba8d9d07ae2..8bfc0ab8d6c56 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -567,6 +567,21 @@ cdef class TextReader:
                 else:
                     raise ValueError('Python 2 cannot read bz2 from open file '
                                      'handle')
+            elif self.compression == 'zip':
+                import zipfile
+                zip_file = zipfile.ZipFile(source)
+                zip_names = zip_file.namelist()
+
+                if len(zip_names) == 1:
+                    file_name = zip_names.pop()
+                    source = zip_file.open(file_name)
+
+                elif len(zip_names) == 0:
+                    raise ValueError('Zero files found in compressed '
+                                     'zip file %s' % source)
+                else:
+                    raise ValueError('Multiple files found in compressed '
+                                     'zip file %s' % str(zip_names))
             else:
                 raise ValueError('Unrecognized compression type: %s' %
                                  self.compression)
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index a5b86b35d330e..4faf67eda6c78 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -994,7 +994,8 @@ def test_to_csv_compression_value_error(self):

         with ensure_clean() as filename:
-            # zip compression is not supported and should raise ValueError
-            self.assertRaises(ValueError, df.to_csv,
+            # zip compression is not supported and raises BadZipfile
+            import zipfile
+            self.assertRaises(zipfile.BadZipfile, df.to_csv,
                               filename, compression="zip")

     def test_to_csv_date_format(self):
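
Usage note (not part of the commit): a minimal sketch of the behaviour this patch adds to pd.read_csv. The file names 'single.zip'/'multi.zip', the member names and the inline CSV bytes are made up for illustration; the error message matched below comes from the new branch in pandas/io/common.py.

    import zipfile

    import pandas as pd

    csv_bytes = b'a,b,c\n1,2,3\n4,5,6\n'

    # A ZIP archive containing exactly one data file can be read either with
    # compression='zip' or, because the path ends in '.zip', via the default
    # compression='infer'.
    with zipfile.ZipFile('single.zip', mode='w') as archive:
        archive.writestr('data.csv', csv_bytes)

    df = pd.read_csv('single.zip', compression='zip')
    df = pd.read_csv('single.zip')  # '.zip' suffix -> inferred

    # An archive with more than one member is rejected with a ValueError.
    with zipfile.ZipFile('multi.zip', mode='w') as archive:
        archive.writestr('part1.csv', csv_bytes)
        archive.writestr('part2.csv', csv_bytes)

    try:
        pd.read_csv('multi.zip', compression='zip')
    except ValueError as err:
        print(err)  # 'Multiple files found in ZIP file. Only one file per ZIP: ...'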