From 7f3461c70e75ed73896845f54883b5bea781cc5e Mon Sep 17 00:00:00 2001 From: Mahmoud Lababidi Date: Wed, 20 Jan 2016 15:53:05 -0500 Subject: [PATCH 1/8] Add Zip file functionality. Fixes #11413 --- pandas/io/parsers.py | 21 +++++++++++++++++++-- pandas/io/tests/data/salary.table.zip | Bin 0 -> 445 bytes pandas/io/tests/test_parsers.py | 15 ++++++++++++++- pandas/parser.pyx | 12 ++++++++++++ 4 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 pandas/io/tests/data/salary.table.zip diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9d25eaecc6620..8df5390845c0f 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -61,9 +61,9 @@ class ParserWarning(Warning): dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} (Unsupported with engine='python') -compression : {'gzip', 'bz2', 'infer', None}, default 'infer' +compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip or - bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2', + bz2 if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip', respectively, and no decompression otherwise. Set to None for no decompression. dialect : string or csv.Dialect instance, default None @@ -252,6 +252,8 @@ def _read(filepath_or_buffer, kwds): inferred_compression = 'gzip' elif filepath_or_buffer.endswith('.bz2'): inferred_compression = 'bz2' + elif filepath_or_buffer.endswith('.zip'): + inferred_compression = 'zip' else: inferred_compression = None else: @@ -1379,6 +1381,21 @@ def _wrap_compressed(f, compression, encoding=None): data = bz2.decompress(f.read()) f = StringIO(data) return f + elif compression == 'zip': + import zipfile + zip_file = zipfile.ZipFile(f) + zip_names = zip_file.namelist() + + if len(zip_names) == 1: + file_name = zip_names.pop() + f = zip_file.open(file_name) + return f + + elif len(zip_names)>1: + raise ValueError('Multiple files found in compressed ' + 'zip file %s', str(zip_names)) + return f + else: raise ValueError('do not recognize compression method %s' % compression) diff --git a/pandas/io/tests/data/salary.table.zip b/pandas/io/tests/data/salary.table.zip new file mode 100644 index 0000000000000000000000000000000000000000..97a74a99830826f8dac8935688f07fcfdbf8bb5f GIT binary patch literal 445 zcmWIWW@Zs#U|`^2_+4t^;a{n{o}ZC{;U^OV0}q1?LvdnGVo{}DNn%n?YG?>219Rq+ z`C$uzxU_`YXG;Y|R zfB*Sw&Hf38-~LUo*LP|5IPjj`jE{YfMQK+}^@@^`&-I^K9!^pEx~KlbO!m}VL&Jw! zcPfvZ;H-+9mfbFI?^L$u*T;Cq2|wPc@Ay9PwZFr$&gkc|1%=OCJszD)o-}v4e9pVh z>bU7s@6`%^es^AtKT`AKS?;*b)BRfK)9R!5{r>iGazm|=+I?1)!u#_LxZ>-l?x^~v z`qMUnBi70F3)>#sZVm1v{u!OlE1p!&QB~#X&Is1: + raise ValueError('Multiple files found in compressed ' + 'zip file %s', str(zip_names)) else: raise ValueError('Unrecognized compression type: %s' % self.compression) From 2fc43b68c3ef956a3026bf86d2b9efd646280b85 Mon Sep 17 00:00:00 2001 From: Mahmoud Lababidi Date: Thu, 21 Jan 2016 10:35:24 -0500 Subject: [PATCH 2/8] Add test to ensure ValueError is thrown when ZIP file contains multiple files. --- pandas/io/tests/test_parsers.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index dcf825c0e4d78..a333a0b10f383 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -3828,6 +3828,17 @@ def test_decompression(self): result = self.read_csv(open(path, 'rb'), compression='zip') tm.assert_frame_equal(result, expected) + + with tm.ensure_clean() as path: + file_names = ['test_file', 'second_file'] + tmp = zipfile.ZipFile(path, mode='w') + for file_name in file_names: + tmp.writestr(file_name, data) + tmp.close() + + self.assertRaises(ValueError, self.read_csv, + path, compression='zip') + with tm.ensure_clean() as path: tmp = gzip.GzipFile(path, mode='wb') tmp.write(data) From f5a641d98a8dbb2c0fe37a685773706cf1032e5b Mon Sep 17 00:00:00 2001 From: Mahmoud Lababidi Date: Tue, 26 Jan 2016 14:42:13 -0500 Subject: [PATCH 3/8] Add parser description warning to handle ZIP files In the description for the parser, a warning/comment is made that a zip file may only contain one file that needs to be read in. If more than one file is compressed into the ZIP file, a ValueError is thrown. --- pandas/io/parsers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8df5390845c0f..287c1990ff7f0 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -62,10 +62,10 @@ class ParserWarning(Warning): Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} (Unsupported with engine='python') compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer', then use gzip or - bz2 if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip', - respectively, and no decompression otherwise. Set to None for no - decompression. + For on-the-fly decompression of on-disk data. If 'infer', then use gzip, + bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip', + respectively, and no decompression otherwise. If using 'zip', the ZIP file must + contain only one data file to be read in. Set to None for no decompression. dialect : string or csv.Dialect instance, default None If None defaults to Excel dialect. Ignored if sep longer than 1 char See csv.Dialect documentation for more details From cf7c3473417da0bfcf5f1e03e247814b0c380a81 Mon Sep 17 00:00:00 2001 From: Mahmoud Lababidi Date: Tue, 26 Jan 2016 17:54:05 -0500 Subject: [PATCH 4/8] Create TestCompression Nose Test Class. Split test_compression into test_gzip, test_bz2, test_zip. Add tests for python and c engines. --- pandas/io/common.py | 10 ++ pandas/io/parsers.py | 9 +- pandas/io/tests/test_parsers.py | 287 ++++++++++++++++++-------------- pandas/parser.pyx | 2 +- 4 files changed, 176 insertions(+), 132 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index e46f609077810..08733c38a1faa 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -339,6 +339,16 @@ def _get_handle(path, mode, encoding=None, compression=None): elif compression == 'bz2': import bz2 f = bz2.BZ2File(path, mode) + elif compression == 'zip': + import zipfile + zip_file = zipfile.ZipFile(path) + zip_names = zip_file.namelist() + + if len(zip_names) == 1: + file_name = zip_names.pop() + f = zip_file.open(file_name) + else: + raise ValueError('ZIP file contains multiple files {}', zip_file.filename) else: raise ValueError('Unrecognized compression type: %s' % compression) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 8df5390845c0f..ea93fbcccfeb1 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -732,10 +732,10 @@ def _make_engine(self, engine='c'): if engine == 'c': self._engine = CParserWrapper(self.f, **self.options) else: - if engine == 'python': - klass = PythonParser - elif engine == 'python-fwf': + if engine == 'python-fwf': klass = FixedWidthFieldParser + else: #default to engine == 'python': + klass = PythonParser self._engine = klass(self.f, **self.options) def _failover_to_python(self): @@ -1391,10 +1391,9 @@ def _wrap_compressed(f, compression, encoding=None): f = zip_file.open(file_name) return f - elif len(zip_names)>1: + else: raise ValueError('Multiple files found in compressed ' 'zip file %s', str(zip_names)) - return f else: raise ValueError('do not recognize compression method %s' diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index a333a0b10f383..1dcbf9d194d8b 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1,42 +1,37 @@ # -*- coding: utf-8 -*- # pylint: disable=E1101 -from datetime import datetime import csv import os -import sys -import re -import nose import platform - +import re +import sys +from datetime import datetime from multiprocessing.pool import ThreadPool -from numpy import nan +import nose import numpy as np -from pandas.io.common import DtypeWarning +import pandas.lib as lib +import pandas.parser +from numpy import nan +from numpy.testing.decorators import slow +from pandas.lib import Timestamp +import pandas as pd +import pandas.io.parsers as parsers +import pandas.tseries.tools as tools +import pandas.util.testing as tm from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex +from pandas import compat from pandas.compat import( StringIO, BytesIO, PY3, range, long, lrange, lmap, u ) +from pandas.compat import parse_date +from pandas.io.common import DtypeWarning from pandas.io.common import URLError -import pandas.io.parsers as parsers from pandas.io.parsers import (read_csv, read_table, read_fwf, TextFileReader, TextParser) - -import pandas.util.testing as tm -import pandas as pd - -from pandas.compat import parse_date -import pandas.lib as lib -from pandas import compat -from pandas.lib import Timestamp from pandas.tseries.index import date_range -import pandas.tseries.tools as tools - -from numpy.testing.decorators import slow - -import pandas.parser class ParserTests(object): @@ -3590,6 +3585,151 @@ def test_single_char_leading_whitespace(self): skipinitialspace=True) tm.assert_frame_equal(result, expected) + +class TestCompression(ParserTests, tm.TestCase): + + def read_csv(self, *args, **kwargs): + return read_csv(*args, **kwargs) + + def read_table(self, *args, **kwargs): + return read_csv(*args, **kwargs) + + def test_zip(self): + try: + import zipfile + except ImportError: + raise nose.SkipTest('need zipfile to run') + + data = open(self.csv1, 'rb').read() + expected = self.read_csv(self.csv1) + + with tm.ensure_clean() as path: + file_name = 'test_file' + tmp = zipfile.ZipFile(path, mode='w') + tmp.writestr(file_name, data) + tmp.close() + + result = self.read_csv(path, compression='zip', engine='c') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(path, compression='zip', engine='python') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(open(path, 'rb'), compression='zip', engine='c') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(open(path, 'rb'), compression='zip', engine='python') + tm.assert_frame_equal(result, expected) + + with tm.ensure_clean() as path: + file_names = ['test_file', 'second_file'] + tmp = zipfile.ZipFile(path, mode='w') + for file_name in file_names: + tmp.writestr(file_name, data) + tmp.close() + + self.assertRaises(ValueError, self.read_csv, + path, compression='zip', engine='c') + + self.assertRaises(ValueError, self.read_csv, + path, compression='zip', engine='python') + + def test_gzip(self): + try: + import gzip + except ImportError: + raise nose.SkipTest('need gzip to run') + + data = open(self.csv1, 'rb').read() + expected = self.read_csv(self.csv1) + with tm.ensure_clean() as path: + tmp = gzip.GzipFile(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, compression='gzip', engine='c') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(path, compression='gzip', engine='python') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(open(path, 'rb'), compression='gzip', engine='c') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(open(path, 'rb'), compression='gzip', engine='python') + tm.assert_frame_equal(result, expected) + + def test_bz2(self): + try: + import bz2 + except ImportError: + raise nose.SkipTest('need bz2 to run') + + data = open(self.csv1, 'rb').read() + expected = self.read_csv(self.csv1) + with tm.ensure_clean() as path: + tmp = bz2.BZ2File(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, compression='bz2', engine='c') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(path, compression='bz2', engine='python') + tm.assert_frame_equal(result, expected) + + self.assertRaises(ValueError, self.read_csv, + path, compression='bz3') + + with open(path, 'rb') as fin: + if compat.PY3: + result = self.read_csv(fin, compression='bz2', engine='c') + tm.assert_frame_equal(result, expected) + result = self.read_csv(fin, compression='bz2', engine='python') + tm.assert_frame_equal(result, expected) + else: + self.assertRaises(ValueError, self.read_csv, + fin, compression='bz2', engine='c') + + def test_decompression_regex_sep(self): + try: + import gzip + import bz2 + except ImportError: + raise nose.SkipTest('need gzip and bz2 to run') + + data = open(self.csv1, 'rb').read() + data = data.replace(b',', b'::') + expected = self.read_csv(self.csv1) + + with tm.ensure_clean() as path: + tmp = gzip.GzipFile(path, mode='wb') + tmp.write(data) + tmp.close() + + # GH 6607 + # Test currently only valid with the python engine because of + # regex sep. Temporarily copied to TestPythonParser. + # Here test for ValueError when passing regex sep: + + with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX + result = self.read_csv(path, sep='::', compression='gzip', engine='c') + tm.assert_frame_equal(result, expected) + + with tm.ensure_clean() as path: + tmp = bz2.BZ2File(path, mode='wb') + tmp.write(data) + tmp.close() + + # GH 6607 + with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX + result = self.read_csv(path, sep='::', compression='bz2', engine='c') + tm.assert_frame_equal(result, expected) + + self.assertRaises(ValueError, self.read_csv, + path, compression='bz3') + + class TestCParserLowMemory(ParserTests, tm.TestCase): def read_csv(self, *args, **kwds): @@ -3805,111 +3945,6 @@ def test_pure_python_failover(self): expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}) tm.assert_frame_equal(result, expected) - def test_decompression(self): - try: - import gzip - import bz2 - import zipfile - except ImportError: - raise nose.SkipTest('need zipfile, gzip and bz2 to run') - - data = open(self.csv1, 'rb').read() - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - file_name = 'test_file' - tmp = zipfile.ZipFile(path, mode='w') - tmp.writestr(file_name, data) - tmp.close() - - result = self.read_csv(path, compression='zip') - tm.assert_frame_equal(result, expected) - - result = self.read_csv(open(path, 'rb'), compression='zip') - tm.assert_frame_equal(result, expected) - - - with tm.ensure_clean() as path: - file_names = ['test_file', 'second_file'] - tmp = zipfile.ZipFile(path, mode='w') - for file_name in file_names: - tmp.writestr(file_name, data) - tmp.close() - - self.assertRaises(ValueError, self.read_csv, - path, compression='zip') - - with tm.ensure_clean() as path: - tmp = gzip.GzipFile(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, compression='gzip') - tm.assert_frame_equal(result, expected) - - result = self.read_csv(open(path, 'rb'), compression='gzip') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean() as path: - tmp = bz2.BZ2File(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, compression='bz2') - tm.assert_frame_equal(result, expected) - - # result = self.read_csv(open(path, 'rb'), compression='bz2') - # tm.assert_frame_equal(result, expected) - - self.assertRaises(ValueError, self.read_csv, - path, compression='bz3') - - with open(path, 'rb') as fin: - if compat.PY3: - result = self.read_csv(fin, compression='bz2') - tm.assert_frame_equal(result, expected) - else: - self.assertRaises(ValueError, self.read_csv, - fin, compression='bz2') - - def test_decompression_regex_sep(self): - try: - import gzip - import bz2 - except ImportError: - raise nose.SkipTest('need gzip and bz2 to run') - - data = open(self.csv1, 'rb').read() - data = data.replace(b',', b'::') - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - tmp = gzip.GzipFile(path, mode='wb') - tmp.write(data) - tmp.close() - - # GH 6607 - # Test currently only valid with the python engine because of - # regex sep. Temporarily copied to TestPythonParser. - # Here test for ValueError when passing regex sep: - - with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX - result = self.read_csv(path, sep='::', compression='gzip') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean() as path: - tmp = bz2.BZ2File(path, mode='wb') - tmp.write(data) - tmp.close() - - # GH 6607 - with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX - result = self.read_csv(path, sep='::', compression='bz2') - tm.assert_frame_equal(result, expected) - - self.assertRaises(ValueError, self.read_csv, - path, compression='bz3') - def test_memory_map(self): # it works! result = self.read_csv(self.csv1, memory_map=True) diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 8c3ddc4127b3b..cad389f9e2a09 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -572,7 +572,7 @@ cdef class TextReader: file_name = zip_names.pop() source = zip_file.open(file_name) - elif len(zip_names)>1: + else: raise ValueError('Multiple files found in compressed ' 'zip file %s', str(zip_names)) else: From 56e55a660b6af3f49f2d9a0a23b5d1158c6afe91 Mon Sep 17 00:00:00 2001 From: Mahmoud Lababidi Date: Wed, 20 Jan 2016 15:53:05 -0500 Subject: [PATCH 5/8] Add Zip file functionality. Fixes #11413 --- pandas/io/parsers.py | 21 +++++++++++++++++++-- pandas/io/tests/data/salary.table.zip | Bin 0 -> 445 bytes pandas/io/tests/test_parsers.py | 15 ++++++++++++++- pandas/parser.pyx | 12 ++++++++++++ 4 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 pandas/io/tests/data/salary.table.zip diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index dc6923b752ac7..06b4d80b844fe 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -61,9 +61,9 @@ class ParserWarning(Warning): dtype : Type name or dict of column -> type, default None Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} (Unsupported with engine='python') -compression : {'gzip', 'bz2', 'infer', None}, default 'infer' +compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use gzip or - bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2', + bz2 if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip', respectively, and no decompression otherwise. Set to None for no decompression. dialect : string or csv.Dialect instance, default None @@ -258,6 +258,8 @@ def _read(filepath_or_buffer, kwds): inferred_compression = 'gzip' elif filepath_or_buffer.endswith('.bz2'): inferred_compression = 'bz2' + elif filepath_or_buffer.endswith('.zip'): + inferred_compression = 'zip' else: inferred_compression = None else: @@ -1387,6 +1389,21 @@ def _wrap_compressed(f, compression, encoding=None): data = bz2.decompress(f.read()) f = StringIO(data) return f + elif compression == 'zip': + import zipfile + zip_file = zipfile.ZipFile(f) + zip_names = zip_file.namelist() + + if len(zip_names) == 1: + file_name = zip_names.pop() + f = zip_file.open(file_name) + return f + + elif len(zip_names)>1: + raise ValueError('Multiple files found in compressed ' + 'zip file %s', str(zip_names)) + return f + else: raise ValueError('do not recognize compression method %s' % compression) diff --git a/pandas/io/tests/data/salary.table.zip b/pandas/io/tests/data/salary.table.zip new file mode 100644 index 0000000000000000000000000000000000000000..97a74a99830826f8dac8935688f07fcfdbf8bb5f GIT binary patch literal 445 zcmWIWW@Zs#U|`^2_+4t^;a{n{o}ZC{;U^OV0}q1?LvdnGVo{}DNn%n?YG?>219Rq+ z`C$uzxU_`YXG;Y|R zfB*Sw&Hf38-~LUo*LP|5IPjj`jE{YfMQK+}^@@^`&-I^K9!^pEx~KlbO!m}VL&Jw! zcPfvZ;H-+9mfbFI?^L$u*T;Cq2|wPc@Ay9PwZFr$&gkc|1%=OCJszD)o-}v4e9pVh z>bU7s@6`%^es^AtKT`AKS?;*b)BRfK)9R!5{r>iGazm|=+I?1)!u#_LxZ>-l?x^~v z`qMUnBi70F3)>#sZVm1v{u!OlE1p!&QB~#X&Is1: + raise ValueError('Multiple files found in compressed ' + 'zip file %s', str(zip_names)) else: raise ValueError('Unrecognized compression type: %s' % self.compression) From 40fe2688e63f9d0f3f03885fff8ec4fe1b8e0acd Mon Sep 17 00:00:00 2001 From: Mahmoud Lababidi Date: Thu, 21 Jan 2016 10:35:24 -0500 Subject: [PATCH 6/8] Add test to ensure ValueError is thrown when ZIP file contains multiple files. --- pandas/io/tests/test_parsers.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 5214b8d5c72c5..0621a5a95aa3f 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -4004,6 +4004,17 @@ def test_decompression(self): result = self.read_csv(open(path, 'rb'), compression='zip') tm.assert_frame_equal(result, expected) + + with tm.ensure_clean() as path: + file_names = ['test_file', 'second_file'] + tmp = zipfile.ZipFile(path, mode='w') + for file_name in file_names: + tmp.writestr(file_name, data) + tmp.close() + + self.assertRaises(ValueError, self.read_csv, + path, compression='zip') + with tm.ensure_clean() as path: tmp = gzip.GzipFile(path, mode='wb') tmp.write(data) From ee336f1198a935505908c32761e60baa200ef328 Mon Sep 17 00:00:00 2001 From: Mahmoud Lababidi Date: Tue, 26 Jan 2016 14:42:13 -0500 Subject: [PATCH 7/8] Add parser description warning to handle ZIP files In the description for the parser, a warning/comment is made that a zip file may only contain one file that needs to be read in. If more than one file is compressed into the ZIP file, a ValueError is thrown. --- pandas/io/parsers.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 06b4d80b844fe..745ba54729077 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -62,10 +62,10 @@ class ParserWarning(Warning): Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32} (Unsupported with engine='python') compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer' - For on-the-fly decompression of on-disk data. If 'infer', then use gzip or - bz2 if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip', - respectively, and no decompression otherwise. Set to None for no - decompression. + For on-the-fly decompression of on-disk data. If 'infer', then use gzip, + bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip', + respectively, and no decompression otherwise. If using 'zip', the ZIP file must + contain only one data file to be read in. Set to None for no decompression. dialect : string or csv.Dialect instance, default None If None defaults to Excel dialect. Ignored if sep longer than 1 char See csv.Dialect documentation for more details From 1c3ecd73a538c1f9739249146cc0c7ca944bebf1 Mon Sep 17 00:00:00 2001 From: Mahmoud Lababidi Date: Tue, 26 Jan 2016 17:54:05 -0500 Subject: [PATCH 8/8] Create TestCompression Nose Test Class. Split test_compression into test_gzip, test_bz2, test_zip. Add tests for python and c engines. --- pandas/io/common.py | 10 ++ pandas/io/parsers.py | 9 +- pandas/io/tests/test_parsers.py | 284 ++++++++++++++++++-------------- pandas/parser.pyx | 2 +- 4 files changed, 175 insertions(+), 130 deletions(-) diff --git a/pandas/io/common.py b/pandas/io/common.py index 811d42b7b4b9e..11cf45aa47f8e 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -345,6 +345,16 @@ def _get_handle(path, mode, encoding=None, compression=None): elif compression == 'bz2': import bz2 f = bz2.BZ2File(path, mode) + elif compression == 'zip': + import zipfile + zip_file = zipfile.ZipFile(path) + zip_names = zip_file.namelist() + + if len(zip_names) == 1: + file_name = zip_names.pop() + f = zip_file.open(file_name) + else: + raise ValueError('ZIP file contains multiple files {}', zip_file.filename) else: raise ValueError('Unrecognized compression type: %s' % compression) diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 745ba54729077..7eb544ceddad3 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -740,10 +740,10 @@ def _make_engine(self, engine='c'): if engine == 'c': self._engine = CParserWrapper(self.f, **self.options) else: - if engine == 'python': - klass = PythonParser - elif engine == 'python-fwf': + if engine == 'python-fwf': klass = FixedWidthFieldParser + else: #default to engine == 'python': + klass = PythonParser self._engine = klass(self.f, **self.options) def _failover_to_python(self): @@ -1399,10 +1399,9 @@ def _wrap_compressed(f, compression, encoding=None): f = zip_file.open(file_name) return f - elif len(zip_names)>1: + else: raise ValueError('Multiple files found in compressed ' 'zip file %s', str(zip_names)) - return f else: raise ValueError('do not recognize compression method %s' diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index 0621a5a95aa3f..a17a7f2e6df6c 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -6,39 +6,35 @@ from datetime import datetime import csv import os -import sys -import re -import nose import platform - +import re +import sys +from datetime import datetime from multiprocessing.pool import ThreadPool -from numpy import nan +import nose import numpy as np -from pandas.io.common import DtypeWarning +import pandas.lib as lib +import pandas.parser +from numpy import nan +from numpy.testing.decorators import slow +from pandas.lib import Timestamp +import pandas as pd +import pandas.io.parsers as parsers +import pandas.tseries.tools as tools +import pandas.util.testing as tm from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex +from pandas import compat from pandas.compat import( StringIO, BytesIO, PY3, range, long, lrange, lmap, u ) +from pandas.compat import parse_date +from pandas.io.common import DtypeWarning from pandas.io.common import URLError -import pandas.io.parsers as parsers from pandas.io.parsers import (read_csv, read_table, read_fwf, TextFileReader, TextParser) - -import pandas.util.testing as tm -import pandas as pd - -from pandas.compat import parse_date -import pandas.lib as lib -from pandas import compat -from pandas.lib import Timestamp from pandas.tseries.index import date_range -import pandas.tseries.tools as tools - -from numpy.testing.decorators import slow - -import pandas.parser class ParserTests(object): @@ -3753,6 +3749,150 @@ def test_single_char_leading_whitespace(self): tm.assert_frame_equal(result, expected) +class TestCompression(ParserTests, tm.TestCase): + + def read_csv(self, *args, **kwargs): + return read_csv(*args, **kwargs) + + def read_table(self, *args, **kwargs): + return read_csv(*args, **kwargs) + + def test_zip(self): + try: + import zipfile + except ImportError: + raise nose.SkipTest('need zipfile to run') + + data = open(self.csv1, 'rb').read() + expected = self.read_csv(self.csv1) + + with tm.ensure_clean() as path: + file_name = 'test_file' + tmp = zipfile.ZipFile(path, mode='w') + tmp.writestr(file_name, data) + tmp.close() + + result = self.read_csv(path, compression='zip', engine='c') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(path, compression='zip', engine='python') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(open(path, 'rb'), compression='zip', engine='c') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(open(path, 'rb'), compression='zip', engine='python') + tm.assert_frame_equal(result, expected) + + with tm.ensure_clean() as path: + file_names = ['test_file', 'second_file'] + tmp = zipfile.ZipFile(path, mode='w') + for file_name in file_names: + tmp.writestr(file_name, data) + tmp.close() + + self.assertRaises(ValueError, self.read_csv, + path, compression='zip', engine='c') + + self.assertRaises(ValueError, self.read_csv, + path, compression='zip', engine='python') + + def test_gzip(self): + try: + import gzip + except ImportError: + raise nose.SkipTest('need gzip to run') + + data = open(self.csv1, 'rb').read() + expected = self.read_csv(self.csv1) + with tm.ensure_clean() as path: + tmp = gzip.GzipFile(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, compression='gzip', engine='c') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(path, compression='gzip', engine='python') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(open(path, 'rb'), compression='gzip', engine='c') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(open(path, 'rb'), compression='gzip', engine='python') + tm.assert_frame_equal(result, expected) + + def test_bz2(self): + try: + import bz2 + except ImportError: + raise nose.SkipTest('need bz2 to run') + + data = open(self.csv1, 'rb').read() + expected = self.read_csv(self.csv1) + with tm.ensure_clean() as path: + tmp = bz2.BZ2File(path, mode='wb') + tmp.write(data) + tmp.close() + + result = self.read_csv(path, compression='bz2', engine='c') + tm.assert_frame_equal(result, expected) + + result = self.read_csv(path, compression='bz2', engine='python') + tm.assert_frame_equal(result, expected) + + self.assertRaises(ValueError, self.read_csv, + path, compression='bz3') + + with open(path, 'rb') as fin: + if compat.PY3: + result = self.read_csv(fin, compression='bz2', engine='c') + tm.assert_frame_equal(result, expected) + result = self.read_csv(fin, compression='bz2', engine='python') + tm.assert_frame_equal(result, expected) + else: + self.assertRaises(ValueError, self.read_csv, + fin, compression='bz2', engine='c') + + def test_decompression_regex_sep(self): + try: + import gzip + import bz2 + except ImportError: + raise nose.SkipTest('need gzip and bz2 to run') + + data = open(self.csv1, 'rb').read() + data = data.replace(b',', b'::') + expected = self.read_csv(self.csv1) + + with tm.ensure_clean() as path: + tmp = gzip.GzipFile(path, mode='wb') + tmp.write(data) + tmp.close() + + # GH 6607 + # Test currently only valid with the python engine because of + # regex sep. Temporarily copied to TestPythonParser. + # Here test for ValueError when passing regex sep: + + with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX + result = self.read_csv(path, sep='::', compression='gzip', engine='c') + tm.assert_frame_equal(result, expected) + + with tm.ensure_clean() as path: + tmp = bz2.BZ2File(path, mode='wb') + tmp.write(data) + tmp.close() + + # GH 6607 + with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX + result = self.read_csv(path, sep='::', compression='bz2', engine='c') + tm.assert_frame_equal(result, expected) + + self.assertRaises(ValueError, self.read_csv, + path, compression='bz3') + + class TestCParserLowMemory(CParserTests, tm.TestCase): def read_csv(self, *args, **kwds): @@ -3981,110 +4121,6 @@ def test_pure_python_failover(self): expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]}) tm.assert_frame_equal(result, expected) - def test_decompression(self): - try: - import gzip - import bz2 - import zipfile - except ImportError: - raise nose.SkipTest('need zipfile, gzip and bz2 to run') - - data = open(self.csv1, 'rb').read() - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - file_name = 'test_file' - tmp = zipfile.ZipFile(path, mode='w') - tmp.writestr(file_name, data) - tmp.close() - - result = self.read_csv(path, compression='zip') - tm.assert_frame_equal(result, expected) - - result = self.read_csv(open(path, 'rb'), compression='zip') - tm.assert_frame_equal(result, expected) - - - with tm.ensure_clean() as path: - file_names = ['test_file', 'second_file'] - tmp = zipfile.ZipFile(path, mode='w') - for file_name in file_names: - tmp.writestr(file_name, data) - tmp.close() - - self.assertRaises(ValueError, self.read_csv, - path, compression='zip') - - with tm.ensure_clean() as path: - tmp = gzip.GzipFile(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, compression='gzip') - tm.assert_frame_equal(result, expected) - - result = self.read_csv(open(path, 'rb'), compression='gzip') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean() as path: - tmp = bz2.BZ2File(path, mode='wb') - tmp.write(data) - tmp.close() - - result = self.read_csv(path, compression='bz2') - tm.assert_frame_equal(result, expected) - - # result = self.read_csv(open(path, 'rb'), compression='bz2') - # tm.assert_frame_equal(result, expected) - - self.assertRaises(ValueError, self.read_csv, - path, compression='bz3') - - with open(path, 'rb') as fin: - if compat.PY3: - result = self.read_csv(fin, compression='bz2') - tm.assert_frame_equal(result, expected) - else: - self.assertRaises(ValueError, self.read_csv, - fin, compression='bz2') - - def test_decompression_regex_sep(self): - try: - import gzip - import bz2 - except ImportError: - raise nose.SkipTest('need gzip and bz2 to run') - - data = open(self.csv1, 'rb').read() - data = data.replace(b',', b'::') - expected = self.read_csv(self.csv1) - - with tm.ensure_clean() as path: - tmp = gzip.GzipFile(path, mode='wb') - tmp.write(data) - tmp.close() - - # GH 6607 - # Test currently only valid with the python engine because of - # regex sep. Temporarily copied to TestPythonParser. - # Here test for ValueError when passing regex sep: - - with tm.assertRaisesRegexp(ValueError, 'regex sep'): # XXX - result = self.read_csv(path, sep='::', compression='gzip') - tm.assert_frame_equal(result, expected) - - with tm.ensure_clean() as path: - tmp = bz2.BZ2File(path, mode='wb') - tmp.write(data) - tmp.close() - - # GH 6607 - with tm.assertRaisesRegexp(ValueError, 'regex sep'): # XXX - result = self.read_csv(path, sep='::', compression='bz2') - tm.assert_frame_equal(result, expected) - - self.assertRaises(ValueError, self.read_csv, - path, compression='bz3') def test_memory_map(self): # it works! diff --git a/pandas/parser.pyx b/pandas/parser.pyx index 8c3ddc4127b3b..cad389f9e2a09 100644 --- a/pandas/parser.pyx +++ b/pandas/parser.pyx @@ -572,7 +572,7 @@ cdef class TextReader: file_name = zip_names.pop() source = zip_file.open(file_name) - elif len(zip_names)>1: + else: raise ValueError('Multiple files found in compressed ' 'zip file %s', str(zip_names)) else: