diff --git a/pandas/io/common.py b/pandas/io/common.py
index 811d42b7b4b9e..11cf45aa47f8e 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -345,6 +345,21 @@ def _get_handle(path, mode, encoding=None, compression=None):
         elif compression == 'bz2':
             import bz2
             f = bz2.BZ2File(path, mode)
+        elif compression == 'zip':
+            import zipfile
+            zip_file = zipfile.ZipFile(path)
+            zip_names = zip_file.namelist()
+
+            # exactly one member is required; there is no way to pick among
+            # several, and an empty archive has nothing to read
+            if len(zip_names) == 1:
+                f = zip_file.open(zip_names.pop())
+            elif not zip_names:
+                raise ValueError('Zero files found in ZIP file %s'
+                                 % zip_file.filename)
+            else:
+                raise ValueError('ZIP file contains multiple files %s'
+                                 % zip_file.filename)
         else:
             raise ValueError('Unrecognized compression type: %s' %
                              compression)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index dc6923b752ac7..7eb544ceddad3 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -61,11 +61,12 @@ class ParserWarning(Warning):
 dtype : Type name or dict of column -> type, default None
     Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
     (Unsupported with engine='python')
-compression : {'gzip', 'bz2', 'infer', None}, default 'infer'
-    For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
-    bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
-    respectively, and no decompression otherwise. Set to None for no
-    decompression.
+compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer'
+    For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
+    bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or
+    '.zip', respectively, and no decompression otherwise. If using 'zip', the
+    ZIP file must contain only one data file to be read in. Set to None for no
+    decompression.
 dialect : string or csv.Dialect instance, default None
     If None defaults to Excel dialect. Ignored if sep longer than 1 char
     See csv.Dialect documentation for more details
@@ -258,6 +259,8 @@ def _read(filepath_or_buffer, kwds):
                 inferred_compression = 'gzip'
             elif filepath_or_buffer.endswith('.bz2'):
                 inferred_compression = 'bz2'
+            elif filepath_or_buffer.endswith('.zip'):
+                inferred_compression = 'zip'
             else:
                 inferred_compression = None
         else:
@@ -738,10 +741,13 @@ def _make_engine(self, engine='c'):
         if engine == 'c':
             self._engine = CParserWrapper(self.f, **self.options)
         else:
             if engine == 'python':
                 klass = PythonParser
             elif engine == 'python-fwf':
                 klass = FixedWidthFieldParser
+            else:
+                raise ValueError('Unknown engine: %r (valid options are '
+                                 '"c", "python", or "python-fwf")' % engine)
             self._engine = klass(self.f, **self.options)
 
     def _failover_to_python(self):
@@ -1387,6 +1393,25 @@ def _wrap_compressed(f, compression, encoding=None):
             data = bz2.decompress(f.read())
             f = StringIO(data)
         return f
+    elif compression == 'zip':
+        import sys
+        import zipfile
+        zip_file = zipfile.ZipFile(f)
+        zip_names = zip_file.namelist()
+
+        if len(zip_names) == 1:
+            f = zip_file.open(zip_names.pop())
+            if sys.version_info[0] >= 3:
+                # ZipFile.open yields bytes; wrap so the consumer gets text
+                from io import TextIOWrapper
+                f = TextIOWrapper(f, encoding=encoding)
+            return f
+        elif not zip_names:
+            raise ValueError('Zero files found in compressed zip file')
+        else:
+            raise ValueError('Multiple files found in compressed '
+                             'zip file %s' % str(zip_names))
+
     else:
         raise ValueError('do not recognize compression method %s'
                          % compression)
diff --git a/pandas/io/tests/data/salary.table.zip b/pandas/io/tests/data/salary.table.zip
new file mode 100644
index 0000000000000..97a74a9983082
Binary files /dev/null and b/pandas/io/tests/data/salary.table.zip differ
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 7c68a44874631..a17a7f2e6df6c 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -3753,6 +3753,146 @@ def test_single_char_leading_whitespace(self):
         tm.assert_frame_equal(result, expected)
 
 
+class TestCompression(ParserTests, tm.TestCase):
+
+    _engines = ('c', 'python')
+
+    def read_csv(self, *args, **kwargs):
+        return read_csv(*args, **kwargs)
+
+    def read_table(self, *args, **kwargs):
+        return read_table(*args, **kwargs)
+
+    def _raw_csv(self):
+        # close the fixture file promptly instead of leaking the handle
+        with open(self.csv1, 'rb') as f:
+            return f.read()
+
+    def _check_path(self, path, compression, expected):
+        for engine in self._engines:
+            result = self.read_csv(path, compression=compression,
+                                   engine=engine)
+            tm.assert_frame_equal(result, expected)
+
+    def _check_handle(self, path, compression, expected):
+        for engine in self._engines:
+            # fresh handle per engine: a read exhausts the stream, and a
+            # handle left open can block removal of the temporary file
+            with open(path, 'rb') as f:
+                result = self.read_csv(f, compression=compression,
+                                       engine=engine)
+            tm.assert_frame_equal(result, expected)
+
+    def test_zip(self):
+        try:
+            import zipfile
+        except ImportError:
+            raise nose.SkipTest('need zipfile to run')
+
+        data = self._raw_csv()
+        expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.writestr('test_file', data)
+            tmp.close()
+
+            self._check_path(path, 'zip', expected)
+            self._check_handle(path, 'zip', expected)
+
+        with tm.ensure_clean() as path:
+            tmp = zipfile.ZipFile(path, mode='w')
+            for file_name in ['test_file', 'second_file']:
+                tmp.writestr(file_name, data)
+            tmp.close()
+
+            for engine in self._engines:
+                self.assertRaises(ValueError, self.read_csv, path,
+                                  compression='zip', engine=engine)
+
+    def test_gzip(self):
+        try:
+            import gzip
+        except ImportError:
+            raise nose.SkipTest('need gzip to run')
+
+        data = self._raw_csv()
+        expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            self._check_path(path, 'gzip', expected)
+            self._check_handle(path, 'gzip', expected)
+
+    def test_bz2(self):
+        try:
+            import bz2
+        except ImportError:
+            raise nose.SkipTest('need bz2 to run')
+
+        data = self._raw_csv()
+        expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            self._check_path(path, 'bz2', expected)
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+            if compat.PY3:
+                self._check_handle(path, 'bz2', expected)
+            else:
+                with open(path, 'rb') as fin:
+                    self.assertRaises(ValueError, self.read_csv,
+                                      fin, compression='bz2', engine='c')
+
+    def test_decompression_regex_sep(self):
+        try:
+            import gzip
+            import bz2
+        except ImportError:
+            raise nose.SkipTest('need gzip and bz2 to run')
+
+        data = self._raw_csv().replace(b',', b'::')
+        expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            # GH 6607
+            # Test currently only valid with the python engine because of
+            # regex sep. Temporarily copied to TestPythonParser.
+            # Here test for ValueError when passing regex sep:
+
+            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
+                result = self.read_csv(path, sep='::', compression='gzip',
+                                       engine='c')
+                tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            # GH 6607
+            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
+                result = self.read_csv(path, sep='::', compression='bz2',
+                                       engine='c')
+                tm.assert_frame_equal(result, expected)
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+
 class TestCParserLowMemory(CParserTests, tm.TestCase):
 
     def read_csv(self, *args, **kwds):
@@ -3981,86 +4121,6 @@ def test_pure_python_failover(self):
         expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
         tm.assert_frame_equal(result, expected)
 
-    def test_decompression(self):
-        try:
-            import gzip
-            import bz2
-        except ImportError:
-            raise nose.SkipTest('need gzip and bz2 to run')
-
-        data = open(self.csv1, 'rb').read()
-        expected = self.read_csv(self.csv1)
-
-        with tm.ensure_clean() as path:
-            tmp = gzip.GzipFile(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='gzip')
-            tm.assert_frame_equal(result, expected)
-
-            result = self.read_csv(open(path, 'rb'), compression='gzip')
-            tm.assert_frame_equal(result, expected)
-
-        with tm.ensure_clean() as path:
-            tmp = bz2.BZ2File(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='bz2')
-            tm.assert_frame_equal(result, expected)
-
-            # result = self.read_csv(open(path, 'rb'), compression='bz2')
-            # tm.assert_frame_equal(result, expected)
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='bz3')
-
-            with open(path, 'rb') as fin:
-                if compat.PY3:
-                    result = self.read_csv(fin, compression='bz2')
-                    tm.assert_frame_equal(result, expected)
-                else:
-                    self.assertRaises(ValueError, self.read_csv,
-                                      fin, compression='bz2')
-
-    def test_decompression_regex_sep(self):
-        try:
-            import gzip
-            import bz2
-        except ImportError:
-            raise nose.SkipTest('need gzip and bz2 to run')
-
-        data = open(self.csv1, 'rb').read()
-        data = data.replace(b',', b'::')
-        expected = self.read_csv(self.csv1)
-
-        with tm.ensure_clean() as path:
-            tmp = gzip.GzipFile(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            # GH 6607
-            # Test currently only valid with the python engine because of
-            # regex sep. Temporarily copied to TestPythonParser.
-            # Here test for ValueError when passing regex sep:
-
-            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
-                result = self.read_csv(path, sep='::', compression='gzip')
-                tm.assert_frame_equal(result, expected)
-
-        with tm.ensure_clean() as path:
-            tmp = bz2.BZ2File(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            # GH 6607
-            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
-                result = self.read_csv(path, sep='::', compression='bz2')
-                tm.assert_frame_equal(result, expected)
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='bz3')
 
     def test_memory_map(self):
         # it works!
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index f9b8d921f02d1..cad389f9e2a09 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -563,6 +563,20 @@ cdef class TextReader:
                 else:
                     raise ValueError('Python 2 cannot read bz2 from open file '
                                      'handle')
+            elif self.compression == 'zip':
+                import zipfile
+                zip_file = zipfile.ZipFile(source)
+                zip_names = zip_file.namelist()
+
+                # the archive must hold exactly one member to read
+                if len(zip_names) == 1:
+                    source = zip_file.open(zip_names.pop())
+                elif not zip_names:
+                    raise ValueError('Zero files found in compressed '
+                                     'zip file')
+                else:
+                    raise ValueError('Multiple files found in compressed '
+                                     'zip file %s' % str(zip_names))
             else:
                 raise ValueError('Unrecognized compression type: %s' %
                                  self.compression)