From 7f3461c70e75ed73896845f54883b5bea781cc5e Mon Sep 17 00:00:00 2001
From: Mahmoud Lababidi <mahmoud@thehumangeo.com>
Date: Wed, 20 Jan 2016 15:53:05 -0500
Subject: [PATCH 1/8] Add Zip file functionality. Fixes #11413

---
 pandas/io/parsers.py                  |  21 +++++++++++++++++++--
 pandas/io/tests/data/salary.table.zip | Bin 0 -> 445 bytes
 pandas/io/tests/test_parsers.py       |  15 ++++++++++++++-
 pandas/parser.pyx                     |  12 ++++++++++++
 4 files changed, 45 insertions(+), 3 deletions(-)
 create mode 100644 pandas/io/tests/data/salary.table.zip

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 9d25eaecc6620..8df5390845c0f 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -61,9 +61,9 @@ class ParserWarning(Warning):
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
     (Unsupported with engine='python')
-compression : {'gzip', 'bz2', 'infer', None}, default 'infer'
+compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer'
     For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
-    bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
+    bz2 if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip',
     respectively, and no decompression otherwise. Set to None for no
     decompression.
 dialect : string or csv.Dialect instance, default None
@@ -252,6 +252,8 @@ def _read(filepath_or_buffer, kwds):
                 inferred_compression = 'gzip'
             elif filepath_or_buffer.endswith('.bz2'):
                 inferred_compression = 'bz2'
+            elif filepath_or_buffer.endswith('.zip'):
+                inferred_compression = 'zip'
             else:
                 inferred_compression = None
         else:
@@ -1379,6 +1381,21 @@ def _wrap_compressed(f, compression, encoding=None):
             data = bz2.decompress(f.read())
             f = StringIO(data)
         return f
+    elif compression == 'zip':
+        import zipfile
+        zip_file = zipfile.ZipFile(f)
+        zip_names = zip_file.namelist()
+
+        if len(zip_names) == 1:
+            file_name = zip_names.pop()
+            f = zip_file.open(file_name)
+            return f
+
+        elif len(zip_names)>1:
+            raise ValueError('Multiple files found in compressed '
+                             'zip file %s', str(zip_names))
+        return f
+
     else:
         raise ValueError('do not recognize compression method %s'
                          % compression)
diff --git a/pandas/io/tests/data/salary.table.zip b/pandas/io/tests/data/salary.table.zip
new file mode 100644
index 0000000000000000000000000000000000000000..97a74a99830826f8dac8935688f07fcfdbf8bb5f
GIT binary patch
literal 445
zcmWIWW@Zs#U|`^2_+4t^;a{n{o}ZC{;U^OV0}q1?LvdnGVo{}DNn%n?YG?>219Rq+
z`C$uzxU_<sfsy4aP$^iqZ_w`2BL+OHYlH7_{FyAJkh;p`X8j}&m7}Mt^>`YXG;Y|R
zfB*Sw&Hf38-~LUo*LP|5IPjj`jE{YfMQK+}^@@^`&-I^K9!^pEx~KlbO!m}VL&Jw!
zcPfvZ;H-+9mfbFI?^L$u*T;Cq2|wPc@Ay9PwZFr$&gkc|1%=OCJszD)o-}v4e9pVh
z>bU7s@6`%^es^AtKT`AKS?;*b)BRfK)9R!5{r>iGazm|=+I?1)!u#_LxZ>-l?x^~v
z`qMUnBi70F3)>#sZVm1v{u!OlE1p!&QB~#X&Is<dxKtB%Ypt{3-s$T~Ip!tx?CdoA
zs=8Zp&MQw{zx7t#l^=GVUcQE-PjS`UKhn$VVpyJcoIjFe`S|j2`2cT5COKwY!72d^
mZUzQm&@n7&1hG&9nH3VqXaOAH&B_K+%Ls%)KspqpodEzITdJb~

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index e34f2cb87a2df..dcf825c0e4d78 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -3809,12 +3809,25 @@ def test_decompression(self):
         try:
             import gzip
             import bz2
+            import zipfile
         except ImportError:
-            raise nose.SkipTest('need gzip and bz2 to run')
+            raise nose.SkipTest('need zipfile, gzip and bz2 to run')
 
         data = open(self.csv1, 'rb').read()
         expected = self.read_csv(self.csv1)
 
+        with tm.ensure_clean() as path:
+            file_name = 'test_file'
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.writestr(file_name, data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='zip')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(open(path, 'rb'), compression='zip')
+            tm.assert_frame_equal(result, expected)
+
         with tm.ensure_clean() as path:
             tmp = gzip.GzipFile(path, mode='wb')
             tmp.write(data)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index f9b8d921f02d1..8c3ddc4127b3b 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -563,6 +563,18 @@ cdef class TextReader:
                 else:
                     raise ValueError('Python 2 cannot read bz2 from open file '
                                      'handle')
+            elif self.compression == 'zip':
+                import zipfile
+                zip_file = zipfile.ZipFile(source)
+                zip_names = zip_file.namelist()
+
+                if len(zip_names) == 1:
+                    file_name = zip_names.pop()
+                    source = zip_file.open(file_name)
+
+                elif len(zip_names)>1:
+                    raise ValueError('Multiple files found in compressed '
+                                     'zip file %s', str(zip_names))
             else:
                 raise ValueError('Unrecognized compression type: %s' %
                                  self.compression)

From 2fc43b68c3ef956a3026bf86d2b9efd646280b85 Mon Sep 17 00:00:00 2001
From: Mahmoud Lababidi <mahmoud@thehumangeo.com>
Date: Thu, 21 Jan 2016 10:35:24 -0500
Subject: [PATCH 2/8] Add test to ensure ValueError is thrown when ZIP file
 contains multiple files.

---
 pandas/io/tests/test_parsers.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index dcf825c0e4d78..a333a0b10f383 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -3828,6 +3828,17 @@ def test_decompression(self):
             result = self.read_csv(open(path, 'rb'), compression='zip')
             tm.assert_frame_equal(result, expected)
 
+
+        with tm.ensure_clean() as path:
+            file_names = ['test_file', 'second_file']
+            tmp = zipfile.ZipFile(path, mode='w')
+            for file_name in file_names:
+                tmp.writestr(file_name, data)
+            tmp.close()
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='zip')
+
         with tm.ensure_clean() as path:
             tmp = gzip.GzipFile(path, mode='wb')
             tmp.write(data)

From f5a641d98a8dbb2c0fe37a685773706cf1032e5b Mon Sep 17 00:00:00 2001
From: Mahmoud Lababidi <lababidi@gmail.com>
Date: Tue, 26 Jan 2016 14:42:13 -0500
Subject: [PATCH 3/8] Add parser description warning to handle ZIP files

The description of the parser's `compression` parameter now notes that a ZIP file must contain only one data file to be read in. If more than one file is compressed into the ZIP file, a ValueError is raised.
---
 pandas/io/parsers.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 8df5390845c0f..287c1990ff7f0 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -62,10 +62,10 @@ class ParserWarning(Warning):
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
     (Unsupported with engine='python')
 compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer'
-    For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
-    bz2 if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip',
-    respectively, and no decompression otherwise. Set to None for no
-    decompression.
+    For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
+    bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip',
+    respectively, and no decompression otherwise. If using 'zip', the ZIP file must 
+    contain only one data file to be read in. Set to None for no decompression.
 dialect : string or csv.Dialect instance, default None
     If None defaults to Excel dialect. Ignored if sep longer than 1 char
     See csv.Dialect documentation for more details

From cf7c3473417da0bfcf5f1e03e247814b0c380a81 Mon Sep 17 00:00:00 2001
From: Mahmoud Lababidi <mahmoud@thehumangeo.com>
Date: Tue, 26 Jan 2016 17:54:05 -0500
Subject: [PATCH 4/8] Create TestCompression nose test class. Split
 test_decompression into test_gzip, test_bz2 and test_zip. Add tests for the
 python and c engines.

---
 pandas/io/common.py             |  10 ++
 pandas/io/parsers.py            |   9 +-
 pandas/io/tests/test_parsers.py | 287 ++++++++++++++++++--------------
 pandas/parser.pyx               |   2 +-
 4 files changed, 176 insertions(+), 132 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index e46f609077810..08733c38a1faa 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -339,6 +339,16 @@ def _get_handle(path, mode, encoding=None, compression=None):
         elif compression == 'bz2':
             import bz2
             f = bz2.BZ2File(path, mode)
+        elif compression == 'zip':
+            import zipfile
+            zip_file = zipfile.ZipFile(path)
+            zip_names = zip_file.namelist()
+
+            if len(zip_names) == 1:
+                file_name = zip_names.pop()
+                f = zip_file.open(file_name)
+            else:
+                raise ValueError('ZIP file contains multiple files {}', zip_file.filename)
         else:
             raise ValueError('Unrecognized compression type: %s' %
                              compression)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 8df5390845c0f..ea93fbcccfeb1 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -732,10 +732,10 @@ def _make_engine(self, engine='c'):
         if engine == 'c':
             self._engine = CParserWrapper(self.f, **self.options)
         else:
-            if engine == 'python':
-                klass = PythonParser
-            elif engine == 'python-fwf':
+            if engine == 'python-fwf':
                 klass = FixedWidthFieldParser
+            else:  #default to engine == 'python':
+                klass = PythonParser
             self._engine = klass(self.f, **self.options)
 
     def _failover_to_python(self):
@@ -1391,10 +1391,9 @@ def _wrap_compressed(f, compression, encoding=None):
             f = zip_file.open(file_name)
             return f
 
-        elif len(zip_names)>1:
+        else:
             raise ValueError('Multiple files found in compressed '
                              'zip file %s', str(zip_names))
-        return f
 
     else:
         raise ValueError('do not recognize compression method %s'
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index a333a0b10f383..1dcbf9d194d8b 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -1,42 +1,37 @@
 # -*- coding: utf-8 -*-
 # pylint: disable=E1101
 
-from datetime import datetime
 import csv
 import os
-import sys
-import re
-import nose
 import platform
-
+import re
+import sys
+from datetime import datetime
 from multiprocessing.pool import ThreadPool
 
-from numpy import nan
+import nose
 import numpy as np
-from pandas.io.common import DtypeWarning
+import pandas.lib as lib
+import pandas.parser
+from numpy import nan
+from numpy.testing.decorators import slow
+from pandas.lib import Timestamp
 
+import pandas as pd
+import pandas.io.parsers as parsers
+import pandas.tseries.tools as tools
+import pandas.util.testing as tm
 from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
+from pandas import compat
 from pandas.compat import(
     StringIO, BytesIO, PY3, range, long, lrange, lmap, u
 )
+from pandas.compat import parse_date
+from pandas.io.common import DtypeWarning
 from pandas.io.common import URLError
-import pandas.io.parsers as parsers
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                TextFileReader, TextParser)
-
-import pandas.util.testing as tm
-import pandas as pd
-
-from pandas.compat import parse_date
-import pandas.lib as lib
-from pandas import compat
-from pandas.lib import Timestamp
 from pandas.tseries.index import date_range
-import pandas.tseries.tools as tools
-
-from numpy.testing.decorators import slow
-
-import pandas.parser
 
 
 class ParserTests(object):
@@ -3590,6 +3585,151 @@ def test_single_char_leading_whitespace(self):
                                skipinitialspace=True)
         tm.assert_frame_equal(result, expected)
 
+
+class TestCompression(ParserTests, tm.TestCase):
+
+    def read_csv(self, *args, **kwargs):
+        return read_csv(*args, **kwargs)
+
+    def read_table(self, *args, **kwargs):
+        return read_csv(*args, **kwargs)
+
+    def test_zip(self):
+        try:
+            import zipfile
+        except ImportError:
+            raise nose.SkipTest('need zipfile to run')
+
+        data = open(self.csv1, 'rb').read()
+        expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            file_name = 'test_file'
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.writestr(file_name, data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='zip', engine='c')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(path, compression='zip', engine='python')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(open(path, 'rb'), compression='zip', engine='c')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(open(path, 'rb'), compression='zip', engine='python')
+            tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean() as path:
+            file_names = ['test_file', 'second_file']
+            tmp = zipfile.ZipFile(path, mode='w')
+            for file_name in file_names:
+                tmp.writestr(file_name, data)
+            tmp.close()
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='zip', engine='c')
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='zip', engine='python')
+
+    def test_gzip(self):
+        try:
+            import gzip
+        except ImportError:
+            raise nose.SkipTest('need gzip to run')
+
+        data = open(self.csv1, 'rb').read()
+        expected = self.read_csv(self.csv1)
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='gzip', engine='c')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(path, compression='gzip', engine='python')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(open(path, 'rb'), compression='gzip', engine='c')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(open(path, 'rb'), compression='gzip', engine='python')
+            tm.assert_frame_equal(result, expected)
+
+    def test_bz2(self):
+        try:
+            import bz2
+        except ImportError:
+            raise nose.SkipTest('need bz2 to run')
+
+        data = open(self.csv1, 'rb').read()
+        expected = self.read_csv(self.csv1)
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='bz2', engine='c')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(path, compression='bz2', engine='python')
+            tm.assert_frame_equal(result, expected)
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+            with open(path, 'rb') as fin:
+                if compat.PY3:
+                    result = self.read_csv(fin, compression='bz2', engine='c')
+                    tm.assert_frame_equal(result, expected)
+                    result = self.read_csv(fin, compression='bz2', engine='python')
+                    tm.assert_frame_equal(result, expected)
+                else:
+                    self.assertRaises(ValueError, self.read_csv,
+                                      fin, compression='bz2', engine='c')
+
+    def test_decompression_regex_sep(self):
+        try:
+            import gzip
+            import bz2
+        except ImportError:
+            raise nose.SkipTest('need gzip and bz2 to run')
+
+        data = open(self.csv1, 'rb').read()
+        data = data.replace(b',', b'::')
+        expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            # GH 6607
+            # Test currently only valid with the python engine because of
+            # regex sep. Temporarily copied to TestPythonParser.
+            # Here test for ValueError when passing regex sep:
+
+            with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX
+                result = self.read_csv(path, sep='::', compression='gzip', engine='c')
+                tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            # GH 6607
+            with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX
+                result = self.read_csv(path, sep='::', compression='bz2', engine='c')
+                tm.assert_frame_equal(result, expected)
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+
 class TestCParserLowMemory(ParserTests, tm.TestCase):
 
     def read_csv(self, *args, **kwds):
@@ -3805,111 +3945,6 @@ def test_pure_python_failover(self):
         expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
         tm.assert_frame_equal(result, expected)
 
-    def test_decompression(self):
-        try:
-            import gzip
-            import bz2
-            import zipfile
-        except ImportError:
-            raise nose.SkipTest('need zipfile, gzip and bz2 to run')
-
-        data = open(self.csv1, 'rb').read()
-        expected = self.read_csv(self.csv1)
-
-        with tm.ensure_clean() as path:
-            file_name = 'test_file'
-            tmp = zipfile.ZipFile(path, mode='w')
-            tmp.writestr(file_name, data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='zip')
-            tm.assert_frame_equal(result, expected)
-
-            result = self.read_csv(open(path, 'rb'), compression='zip')
-            tm.assert_frame_equal(result, expected)
-
-
-        with tm.ensure_clean() as path:
-            file_names = ['test_file', 'second_file']
-            tmp = zipfile.ZipFile(path, mode='w')
-            for file_name in file_names:
-                tmp.writestr(file_name, data)
-            tmp.close()
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='zip')
-
-        with tm.ensure_clean() as path:
-            tmp = gzip.GzipFile(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='gzip')
-            tm.assert_frame_equal(result, expected)
-
-            result = self.read_csv(open(path, 'rb'), compression='gzip')
-            tm.assert_frame_equal(result, expected)
-
-        with tm.ensure_clean() as path:
-            tmp = bz2.BZ2File(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='bz2')
-            tm.assert_frame_equal(result, expected)
-
-            # result = self.read_csv(open(path, 'rb'), compression='bz2')
-            # tm.assert_frame_equal(result, expected)
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='bz3')
-
-            with open(path, 'rb') as fin:
-                if compat.PY3:
-                    result = self.read_csv(fin, compression='bz2')
-                    tm.assert_frame_equal(result, expected)
-                else:
-                    self.assertRaises(ValueError, self.read_csv,
-                                      fin, compression='bz2')
-
-    def test_decompression_regex_sep(self):
-        try:
-            import gzip
-            import bz2
-        except ImportError:
-            raise nose.SkipTest('need gzip and bz2 to run')
-
-        data = open(self.csv1, 'rb').read()
-        data = data.replace(b',', b'::')
-        expected = self.read_csv(self.csv1)
-
-        with tm.ensure_clean() as path:
-            tmp = gzip.GzipFile(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            # GH 6607
-            # Test currently only valid with the python engine because of
-            # regex sep. Temporarily copied to TestPythonParser.
-            # Here test for ValueError when passing regex sep:
-
-            with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX
-                result = self.read_csv(path, sep='::', compression='gzip')
-                tm.assert_frame_equal(result, expected)
-
-        with tm.ensure_clean() as path:
-            tmp = bz2.BZ2File(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            # GH 6607
-            with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX
-                result = self.read_csv(path, sep='::', compression='bz2')
-                tm.assert_frame_equal(result, expected)
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='bz3')
-
     def test_memory_map(self):
         # it works!
         result = self.read_csv(self.csv1, memory_map=True)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 8c3ddc4127b3b..cad389f9e2a09 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -572,7 +572,7 @@ cdef class TextReader:
                     file_name = zip_names.pop()
                     source = zip_file.open(file_name)
 
-                elif len(zip_names)>1:
+                else:
                     raise ValueError('Multiple files found in compressed '
                                      'zip file %s', str(zip_names))
             else:

From 56e55a660b6af3f49f2d9a0a23b5d1158c6afe91 Mon Sep 17 00:00:00 2001
From: Mahmoud Lababidi <mahmoud@thehumangeo.com>
Date: Wed, 20 Jan 2016 15:53:05 -0500
Subject: [PATCH 5/8] Add Zip file functionality. Fixes #11413

---
 pandas/io/parsers.py                  |  21 +++++++++++++++++++--
 pandas/io/tests/data/salary.table.zip | Bin 0 -> 445 bytes
 pandas/io/tests/test_parsers.py       |  15 ++++++++++++++-
 pandas/parser.pyx                     |  12 ++++++++++++
 4 files changed, 45 insertions(+), 3 deletions(-)
 create mode 100644 pandas/io/tests/data/salary.table.zip

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index dc6923b752ac7..06b4d80b844fe 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -61,9 +61,9 @@ class ParserWarning(Warning):
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
     (Unsupported with engine='python')
-compression : {'gzip', 'bz2', 'infer', None}, default 'infer'
+compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer'
     For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
-    bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
+    bz2 if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip',
     respectively, and no decompression otherwise. Set to None for no
     decompression.
 dialect : string or csv.Dialect instance, default None
@@ -258,6 +258,8 @@ def _read(filepath_or_buffer, kwds):
                 inferred_compression = 'gzip'
             elif filepath_or_buffer.endswith('.bz2'):
                 inferred_compression = 'bz2'
+            elif filepath_or_buffer.endswith('.zip'):
+                inferred_compression = 'zip'
             else:
                 inferred_compression = None
         else:
@@ -1387,6 +1389,21 @@ def _wrap_compressed(f, compression, encoding=None):
             data = bz2.decompress(f.read())
             f = StringIO(data)
         return f
+    elif compression == 'zip':
+        import zipfile
+        zip_file = zipfile.ZipFile(f)
+        zip_names = zip_file.namelist()
+
+        if len(zip_names) == 1:
+            file_name = zip_names.pop()
+            f = zip_file.open(file_name)
+            return f
+
+        elif len(zip_names)>1:
+            raise ValueError('Multiple files found in compressed '
+                             'zip file %s', str(zip_names))
+        return f
+
     else:
         raise ValueError('do not recognize compression method %s'
                          % compression)
diff --git a/pandas/io/tests/data/salary.table.zip b/pandas/io/tests/data/salary.table.zip
new file mode 100644
index 0000000000000000000000000000000000000000..97a74a99830826f8dac8935688f07fcfdbf8bb5f
GIT binary patch
literal 445
zcmWIWW@Zs#U|`^2_+4t^;a{n{o}ZC{;U^OV0}q1?LvdnGVo{}DNn%n?YG?>219Rq+
z`C$uzxU_<sfsy4aP$^iqZ_w`2BL+OHYlH7_{FyAJkh;p`X8j}&m7}Mt^>`YXG;Y|R
zfB*Sw&Hf38-~LUo*LP|5IPjj`jE{YfMQK+}^@@^`&-I^K9!^pEx~KlbO!m}VL&Jw!
zcPfvZ;H-+9mfbFI?^L$u*T;Cq2|wPc@Ay9PwZFr$&gkc|1%=OCJszD)o-}v4e9pVh
z>bU7s@6`%^es^AtKT`AKS?;*b)BRfK)9R!5{r>iGazm|=+I?1)!u#_LxZ>-l?x^~v
z`qMUnBi70F3)>#sZVm1v{u!OlE1p!&QB~#X&Is<dxKtB%Ypt{3-s$T~Ip!tx?CdoA
zs=8Zp&MQw{zx7t#l^=GVUcQE-PjS`UKhn$VVpyJcoIjFe`S|j2`2cT5COKwY!72d^
mZUzQm&@n7&1hG&9nH3VqXaOAH&B_K+%Ls%)KspqpodEzITdJb~

literal 0
HcmV?d00001

diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 7c68a44874631..5214b8d5c72c5 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -3985,12 +3985,25 @@ def test_decompression(self):
         try:
             import gzip
             import bz2
+            import zipfile
         except ImportError:
-            raise nose.SkipTest('need gzip and bz2 to run')
+            raise nose.SkipTest('need zipfile, gzip and bz2 to run')
 
         data = open(self.csv1, 'rb').read()
         expected = self.read_csv(self.csv1)
 
+        with tm.ensure_clean() as path:
+            file_name = 'test_file'
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.writestr(file_name, data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='zip')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(open(path, 'rb'), compression='zip')
+            tm.assert_frame_equal(result, expected)
+
         with tm.ensure_clean() as path:
             tmp = gzip.GzipFile(path, mode='wb')
             tmp.write(data)
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index f9b8d921f02d1..8c3ddc4127b3b 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -563,6 +563,18 @@ cdef class TextReader:
                 else:
                     raise ValueError('Python 2 cannot read bz2 from open file '
                                      'handle')
+            elif self.compression == 'zip':
+                import zipfile
+                zip_file = zipfile.ZipFile(source)
+                zip_names = zip_file.namelist()
+
+                if len(zip_names) == 1:
+                    file_name = zip_names.pop()
+                    source = zip_file.open(file_name)
+
+                elif len(zip_names)>1:
+                    raise ValueError('Multiple files found in compressed '
+                                     'zip file %s', str(zip_names))
             else:
                 raise ValueError('Unrecognized compression type: %s' %
                                  self.compression)

From 40fe2688e63f9d0f3f03885fff8ec4fe1b8e0acd Mon Sep 17 00:00:00 2001
From: Mahmoud Lababidi <mahmoud@thehumangeo.com>
Date: Thu, 21 Jan 2016 10:35:24 -0500
Subject: [PATCH 6/8] Add test to ensure ValueError is thrown when ZIP file
 contains multiple files.

---
 pandas/io/tests/test_parsers.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 5214b8d5c72c5..0621a5a95aa3f 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -4004,6 +4004,17 @@ def test_decompression(self):
             result = self.read_csv(open(path, 'rb'), compression='zip')
             tm.assert_frame_equal(result, expected)
 
+
+        with tm.ensure_clean() as path:
+            file_names = ['test_file', 'second_file']
+            tmp = zipfile.ZipFile(path, mode='w')
+            for file_name in file_names:
+                tmp.writestr(file_name, data)
+            tmp.close()
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='zip')
+
         with tm.ensure_clean() as path:
             tmp = gzip.GzipFile(path, mode='wb')
             tmp.write(data)

From ee336f1198a935505908c32761e60baa200ef328 Mon Sep 17 00:00:00 2001
From: Mahmoud Lababidi <lababidi@gmail.com>
Date: Tue, 26 Jan 2016 14:42:13 -0500
Subject: [PATCH 7/8] Add parser description warning to handle ZIP files

The description of the parser's `compression` parameter now notes that a ZIP file must contain only one data file to be read in. If more than one file is compressed into the ZIP file, a ValueError is raised.
---
 pandas/io/parsers.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 06b4d80b844fe..745ba54729077 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -62,10 +62,10 @@ class ParserWarning(Warning):
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
     (Unsupported with engine='python')
 compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer'
-    For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
-    bz2 if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip',
-    respectively, and no decompression otherwise. Set to None for no
-    decompression.
+    For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
+    bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip',
+    respectively, and no decompression otherwise. If using 'zip', the ZIP file must 
+    contain only one data file to be read in. Set to None for no decompression.
 dialect : string or csv.Dialect instance, default None
     If None defaults to Excel dialect. Ignored if sep longer than 1 char
     See csv.Dialect documentation for more details

From 1c3ecd73a538c1f9739249146cc0c7ca944bebf1 Mon Sep 17 00:00:00 2001
From: Mahmoud Lababidi <mahmoud@thehumangeo.com>
Date: Tue, 26 Jan 2016 17:54:05 -0500
Subject: [PATCH 8/8] Create TestCompression nose test class. Split
 test_decompression into test_gzip, test_bz2 and test_zip. Add tests for the
 python and c engines.

---
 pandas/io/common.py             |  10 ++
 pandas/io/parsers.py            |   9 +-
 pandas/io/tests/test_parsers.py | 284 ++++++++++++++++++--------------
 pandas/parser.pyx               |   2 +-
 4 files changed, 175 insertions(+), 130 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index 811d42b7b4b9e..11cf45aa47f8e 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -345,6 +345,16 @@ def _get_handle(path, mode, encoding=None, compression=None):
         elif compression == 'bz2':
             import bz2
             f = bz2.BZ2File(path, mode)
+        elif compression == 'zip':
+            import zipfile
+            zip_file = zipfile.ZipFile(path)
+            zip_names = zip_file.namelist()
+            # Exactly one member is supported; close the archive on failure.
+            if len(zip_names) != 1:
+                zip_file.close()
+                raise ValueError('ZIP file %s must contain exactly one file'
+                                 % zip_file.filename)
+            f = zip_file.open(zip_names[0])
         else:
             raise ValueError('Unrecognized compression type: %s' %
                              compression)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 745ba54729077..7eb544ceddad3 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -740,10 +740,10 @@ def _make_engine(self, engine='c'):
         if engine == 'c':
             self._engine = CParserWrapper(self.f, **self.options)
         else:
-            if engine == 'python':
-                klass = PythonParser
-            elif engine == 'python-fwf':
+            if engine == 'python-fwf':
                 klass = FixedWidthFieldParser
+            else:  # default to engine == 'python'
+                klass = PythonParser
             self._engine = klass(self.f, **self.options)
 
     def _failover_to_python(self):
@@ -1399,10 +1399,9 @@ def _wrap_compressed(f, compression, encoding=None):
             f = zip_file.open(file_name)
             return f
 
-        elif len(zip_names)>1:
+        else:
             raise ValueError('Multiple files found in compressed '
-                             'zip file %s', str(zip_names))
+                             'zip file %s' % str(zip_names))
-        return f
 
     else:
         raise ValueError('do not recognize compression method %s'
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 0621a5a95aa3f..a17a7f2e6df6c 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -6,39 +6,35 @@
 from datetime import datetime
 import csv
 import os
-import sys
-import re
-import nose
 import platform
-
+import re
+import sys
+from datetime import datetime
 from multiprocessing.pool import ThreadPool
 
-from numpy import nan
+import nose
 import numpy as np
-from pandas.io.common import DtypeWarning
+import pandas.lib as lib
+import pandas.parser
+from numpy import nan
+from numpy.testing.decorators import slow
+from pandas.lib import Timestamp
 
+import pandas as pd
+import pandas.io.parsers as parsers
+import pandas.tseries.tools as tools
+import pandas.util.testing as tm
 from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
+from pandas import compat
 from pandas.compat import(
     StringIO, BytesIO, PY3, range, long, lrange, lmap, u
 )
+from pandas.compat import parse_date
+from pandas.io.common import DtypeWarning
 from pandas.io.common import URLError
-import pandas.io.parsers as parsers
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                TextFileReader, TextParser)
-
-import pandas.util.testing as tm
-import pandas as pd
-
-from pandas.compat import parse_date
-import pandas.lib as lib
-from pandas import compat
-from pandas.lib import Timestamp
 from pandas.tseries.index import date_range
-import pandas.tseries.tools as tools
-
-from numpy.testing.decorators import slow
-
-import pandas.parser
 
 
 class ParserTests(object):
@@ -3753,6 +3749,150 @@ def test_single_char_leading_whitespace(self):
         tm.assert_frame_equal(result, expected)
 
 
+class TestCompression(ParserTests, tm.TestCase):
+
+    def read_csv(self, *args, **kwargs):
+        return read_csv(*args, **kwargs)
+
+    def read_table(self, *args, **kwargs):
+        return read_table(*args, **kwargs)
+
+    def test_zip(self):
+        try:
+            import zipfile
+        except ImportError:
+            raise nose.SkipTest('need zipfile to run')
+
+        data = open(self.csv1, 'rb').read()
+        expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            file_name = 'test_file'
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.writestr(file_name, data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='zip', engine='c')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(path, compression='zip', engine='python')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(open(path, 'rb'), compression='zip', engine='c')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(open(path, 'rb'), compression='zip', engine='python')
+            tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean() as path:
+            file_names = ['test_file', 'second_file']
+            tmp = zipfile.ZipFile(path, mode='w')
+            for file_name in file_names:
+                tmp.writestr(file_name, data)
+            tmp.close()
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='zip', engine='c')
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='zip', engine='python')
+
+    def test_gzip(self):
+        try:
+            import gzip
+        except ImportError:
+            raise nose.SkipTest('need gzip to run')
+
+        data = open(self.csv1, 'rb').read()
+        expected = self.read_csv(self.csv1)
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='gzip', engine='c')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(path, compression='gzip', engine='python')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(open(path, 'rb'), compression='gzip', engine='c')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(open(path, 'rb'), compression='gzip', engine='python')
+            tm.assert_frame_equal(result, expected)
+
+    def test_bz2(self):
+        try:
+            import bz2
+        except ImportError:
+            raise nose.SkipTest('need bz2 to run')
+
+        data = open(self.csv1, 'rb').read()
+        expected = self.read_csv(self.csv1)
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='bz2', engine='c')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(path, compression='bz2', engine='python')
+            tm.assert_frame_equal(result, expected)
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+            with open(path, 'rb') as fin:
+                if compat.PY3:
+                    result = self.read_csv(fin, compression='bz2', engine='c')
+                    tm.assert_frame_equal(result, expected)
+                    result = self.read_csv(fin, compression='bz2', engine='python')
+                    tm.assert_frame_equal(result, expected)
+                else:
+                    self.assertRaises(ValueError, self.read_csv,
+                                      fin, compression='bz2', engine='c')
+
+    def test_decompression_regex_sep(self):
+        try:
+            import gzip
+            import bz2
+        except ImportError:
+            raise nose.SkipTest('need gzip and bz2 to run')
+
+        data = open(self.csv1, 'rb').read()
+        data = data.replace(b',', b'::')
+        expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            # GH 6607
+            # Test currently only valid with the python engine because of
+            # regex sep. Temporarily copied to TestPythonParser.
+            # Here test for ValueError when passing regex sep:
+
+            with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX
+                result = self.read_csv(path, sep='::', compression='gzip', engine='c')
+                tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            # GH 6607
+            with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX
+                result = self.read_csv(path, sep='::', compression='bz2', engine='c')
+                tm.assert_frame_equal(result, expected)
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+
 class TestCParserLowMemory(CParserTests, tm.TestCase):
 
     def read_csv(self, *args, **kwds):
@@ -3981,110 +4121,6 @@ def test_pure_python_failover(self):
         expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
         tm.assert_frame_equal(result, expected)
 
-    def test_decompression(self):
-        try:
-            import gzip
-            import bz2
-            import zipfile
-        except ImportError:
-            raise nose.SkipTest('need zipfile, gzip and bz2 to run')
-
-        data = open(self.csv1, 'rb').read()
-        expected = self.read_csv(self.csv1)
-
-        with tm.ensure_clean() as path:
-            file_name = 'test_file'
-            tmp = zipfile.ZipFile(path, mode='w')
-            tmp.writestr(file_name, data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='zip')
-            tm.assert_frame_equal(result, expected)
-
-            result = self.read_csv(open(path, 'rb'), compression='zip')
-            tm.assert_frame_equal(result, expected)
-
-
-        with tm.ensure_clean() as path:
-            file_names = ['test_file', 'second_file']
-            tmp = zipfile.ZipFile(path, mode='w')
-            for file_name in file_names:
-                tmp.writestr(file_name, data)
-            tmp.close()
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='zip')
-
-        with tm.ensure_clean() as path:
-            tmp = gzip.GzipFile(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='gzip')
-            tm.assert_frame_equal(result, expected)
-
-            result = self.read_csv(open(path, 'rb'), compression='gzip')
-            tm.assert_frame_equal(result, expected)
-
-        with tm.ensure_clean() as path:
-            tmp = bz2.BZ2File(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='bz2')
-            tm.assert_frame_equal(result, expected)
-
-            # result = self.read_csv(open(path, 'rb'), compression='bz2')
-            # tm.assert_frame_equal(result, expected)
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='bz3')
-
-            with open(path, 'rb') as fin:
-                if compat.PY3:
-                    result = self.read_csv(fin, compression='bz2')
-                    tm.assert_frame_equal(result, expected)
-                else:
-                    self.assertRaises(ValueError, self.read_csv,
-                                      fin, compression='bz2')
-
-    def test_decompression_regex_sep(self):
-        try:
-            import gzip
-            import bz2
-        except ImportError:
-            raise nose.SkipTest('need gzip and bz2 to run')
-
-        data = open(self.csv1, 'rb').read()
-        data = data.replace(b',', b'::')
-        expected = self.read_csv(self.csv1)
-
-        with tm.ensure_clean() as path:
-            tmp = gzip.GzipFile(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            # GH 6607
-            # Test currently only valid with the python engine because of
-            # regex sep. Temporarily copied to TestPythonParser.
-            # Here test for ValueError when passing regex sep:
-
-            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
-                result = self.read_csv(path, sep='::', compression='gzip')
-                tm.assert_frame_equal(result, expected)
-
-        with tm.ensure_clean() as path:
-            tmp = bz2.BZ2File(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            # GH 6607
-            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
-                result = self.read_csv(path, sep='::', compression='bz2')
-                tm.assert_frame_equal(result, expected)
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='bz3')
 
     def test_memory_map(self):
         # it works!
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index 8c3ddc4127b3b..cad389f9e2a09 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -572,7 +572,7 @@ cdef class TextReader:
                     file_name = zip_names.pop()
                     source = zip_file.open(file_name)
 
-                elif len(zip_names)>1:
+                else:
                     raise ValueError('Multiple files found in compressed '
-                                     'zip file %s', str(zip_names))
+                                     'zip file %s' % str(zip_names))
             else: