From 68eb884e361cf8f1b3dbc0e628ecf7a7b6fc9418 Mon Sep 17 00:00:00 2001
From: Mahmoud Lababidi
Date: Fri, 29 Jan 2016 10:11:20 -0500
Subject: [PATCH] Add ZIP file decompression and TestCompression. Fix PEP8
 issues. Change Compression to be a Mixin. Add Compression Mixin correctly
 with current Tests. Add .format, Rename Compression, with-block, empty zip,
 bad-zip

---
 doc/source/whatsnew/v0.18.1.txt   |   1 +
 pandas/io/common.py               |  15 ++
 pandas/io/parsers.py              |  32 +++-
 pandas/io/tests/test_parsers.py   | 300 +++++++++++++++++++-----------
 pandas/parser.pyx                 |  15 ++
 pandas/tests/frame/test_to_csv.py |   3 +-
 6 files changed, 249 insertions(+), 117 deletions(-)

diff --git a/doc/source/whatsnew/v0.18.1.txt b/doc/source/whatsnew/v0.18.1.txt
index 1ced927001a94..ff8c3347c64ff 100644
--- a/doc/source/whatsnew/v0.18.1.txt
+++ b/doc/source/whatsnew/v0.18.1.txt
@@ -56,6 +56,7 @@ Partial string indexing now matches on ``DateTimeIndex`` when part of a ``MultiI
 Other Enhancements
 ^^^^^^^^^^^^^^^^^^

+- ``pd.read_csv()`` now supports opening ZIP files that contain a single CSV (:issue:`12175`)
 - ``pd.read_msgpack()`` now always gives writeable ndarrays even when compression is used (:issue:`12359`).

 .. _whatsnew_0181.api:
diff --git a/pandas/io/common.py b/pandas/io/common.py
index be8c3ccfe08e6..d44057178d27e 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -360,6 +360,21 @@ def _get_handle(path, mode, encoding=None, compression=None):
         elif compression == 'bz2':
             import bz2
             f = bz2.BZ2File(path, mode)
+        elif compression == 'zip':
+            import zipfile
+            zip_file = zipfile.ZipFile(path)
+            zip_names = zip_file.namelist()
+
+            if len(zip_names) == 1:
+                file_name = zip_names.pop()
+                f = zip_file.open(file_name)
+            elif len(zip_names) == 0:
+                raise ValueError('Zero files found in ZIP file {}'
+                                 .format(path))
+            else:
+                raise ValueError('Multiple files found in ZIP file.'
+                                 ' Only one file per ZIP: {}'
+                                 .format(zip_names))
         else:
             raise ValueError('Unrecognized compression type: %s' %
                              compression)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 36a9abdfbca60..49fbadadfb719 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -158,11 +158,12 @@ class ParserWarning(Warning):
     information `_ on ``iterator`` and ``chunksize``.
-compression : {'infer', 'gzip', 'bz2', None}, default 'infer'
-    For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
-    bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
-    respectively, and no decompression otherwise. Set to None for no
-    decompression.
+compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer'
+    For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
+    bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or
+    '.zip', respectively, and no decompression otherwise. New in 0.18.1: ZIP
+    compression. If using 'zip', the ZIP file must contain only one data file
+    to be read in. Set to None for no decompression.
 thousands : str, default None
     Thousands separator
 decimal : str, default '.'
@@ -273,6 +274,8 @@ def _read(filepath_or_buffer, kwds):
             inferred_compression = 'gzip'
         elif filepath_or_buffer.endswith('.bz2'):
             inferred_compression = 'bz2'
+        elif filepath_or_buffer.endswith('.zip'):
+            inferred_compression = 'zip'
         else:
             inferred_compression = None
     else:
@@ -1397,6 +1400,24 @@ def _wrap_compressed(f, compression, encoding=None):
         data = bz2.decompress(f.read())
         f = StringIO(data)
         return f
+    elif compression == 'zip':
+        import zipfile
+        zip_file = zipfile.ZipFile(f)
+        zip_names = zip_file.namelist()
+
+        if len(zip_names) == 1:
+            file_name = zip_names.pop()
+            f = zip_file.open(file_name)
+            return f
+
+        elif len(zip_names) == 0:
+            raise ValueError('Corrupted or zero files found in compressed '
+                             'zip file %s' % zip_file.filename)
+
+        else:
+            raise ValueError('Multiple files found in compressed '
+                             'zip file %s' % str(zip_names))
+
     else:
         raise ValueError('do not recognize compression method %s'
                          % compression)
diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py
index 9f53fc1ded882..7c7b40d77e821 100755
--- a/pandas/io/tests/test_parsers.py
+++ b/pandas/io/tests/test_parsers.py
@@ -3,44 +3,39 @@
 # flake8: noqa

-from datetime import datetime
 import csv
 import os
-import sys
-import re
-import nose
 import platform
 from distutils.version import LooseVersion
+import re
+import sys
+from datetime import datetime
 from multiprocessing.pool import ThreadPool

-from numpy import nan
+import nose
 import numpy as np
-from pandas.io.common import DtypeWarning
+import pandas.lib as lib
+import pandas.parser
+from numpy import nan
+from numpy.testing.decorators import slow
+from pandas.lib import Timestamp

+import pandas as pd
+import pandas.io.parsers as parsers
+import pandas.tseries.tools as tools
+import pandas.util.testing as tm
 from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
+from pandas import compat
 from pandas.compat import(
     StringIO, BytesIO, PY3, range, long, lrange, lmap, u
 )
-from pandas.io.common import URLError
-import pandas.io.parsers as parsers
+from pandas.compat import parse_date
+from pandas.core.common import AbstractMethodError
+from pandas.io.common import DtypeWarning, URLError
 from pandas.io.parsers import (read_csv, read_table, read_fwf,
                                TextFileReader, TextParser)
-
-import pandas.util.testing as tm
-import pandas as pd
-
-from pandas.core.common import AbstractMethodError
-from pandas.compat import parse_date
-import pandas.lib as lib
-from pandas import compat
-from pandas.lib import Timestamp
 from pandas.tseries.index import date_range
-import pandas.tseries.tools as tools
-
-from numpy.testing.decorators import slow
-
-import pandas.parser


 class ParserTests(object):
@@ -2696,7 +2691,166 @@ def test_uneven_lines_with_usecols(self):
         tm.assert_frame_equal(df, expected)


-class TestPythonParser(ParserTests, tm.TestCase):
+class CompressionTests(object):
+    def test_zip(self):
+        try:
+            import zipfile
+        except ImportError:
+            raise nose.SkipTest('need zipfile to run')
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean('test_file.zip') as path:
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.writestr('test_file', data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='zip')
+            tm.assert_frame_equal(result, expected)
+
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+            if self.engine != 'python':
+                with open(path, 'rb') as f:
+                    result = self.read_csv(f,
+                                           compression='zip')
+                    tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean('combined_zip.zip') as path:
+            inner_file_names = ['test_file', 'second_file']
+            tmp = zipfile.ZipFile(path, mode='w')
+            for file_name in inner_file_names:
+                tmp.writestr(file_name, data)
+            tmp.close()
+
+            self.assertRaisesRegexp(ValueError, 'Multiple files', self.read_csv,
+                                    path, compression='zip')
+
+            self.assertRaisesRegexp(ValueError, 'Multiple files', self.read_csv,
+                                    path, compression='infer')
+
+        with tm.ensure_clean() as path:
+            tmp = zipfile.ZipFile(path, mode='w')
+            tmp.close()
+
+            self.assertRaisesRegexp(ValueError, 'Zero files', self.read_csv,
+                                    path, compression='zip')
+
+        with tm.ensure_clean() as path:
+            with open(path, 'wb') as f:
+                self.assertRaises(zipfile.BadZipfile, self.read_csv, f, compression='zip')
+
+
+    def test_gzip(self):
+        try:
+            import gzip
+        except ImportError:
+            raise nose.SkipTest('need gzip to run')
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='gzip')
+            tm.assert_frame_equal(result, expected)
+
+            with open(path, 'rb') as f:
+                result = self.read_csv(f, compression='gzip')
+                tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean('test.gz') as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+    def test_bz2(self):
+        try:
+            import bz2
+        except ImportError:
+            raise nose.SkipTest('need bz2 to run')
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            result = self.read_csv(path, compression='bz2')
+            tm.assert_frame_equal(result, expected)
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+            with open(path, 'rb') as fin:
+                if compat.PY3:
+                    result = self.read_csv(fin, compression='bz2')
+                    tm.assert_frame_equal(result, expected)
+                elif self.engine != 'python':
+                    self.assertRaises(ValueError, self.read_csv,
+                                      fin, compression='bz2')
+
+        with tm.ensure_clean('test.bz2') as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+            result = self.read_csv(path, compression='infer')
+            tm.assert_frame_equal(result, expected)
+
+    def test_decompression_regex_sep(self):
+        try:
+            import gzip
+            import bz2
+        except ImportError:
+            raise nose.SkipTest('need gzip and bz2 to run')
+
+        with open(self.csv1, 'rb') as data_file:
+            data = data_file.read()
+            data = data.replace(b',', b'::')
+            expected = self.read_csv(self.csv1)
+
+        with tm.ensure_clean() as path:
+            tmp = gzip.GzipFile(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            # GH 6607
+            # Test currently only valid with the python engine because of
+            # regex sep. Temporarily copied to TestPythonParser.
+            # Here test for ValueError when passing regex sep:
+
+            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
+                result = self.read_csv(path, sep='::', compression='gzip', engine='c')
+                tm.assert_frame_equal(result, expected)
+
+        with tm.ensure_clean() as path:
+            tmp = bz2.BZ2File(path, mode='wb')
+            tmp.write(data)
+            tmp.close()
+
+            # GH 6607
+            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
+                result = self.read_csv(path, sep='::', compression='bz2', engine='c')
+                tm.assert_frame_equal(result, expected)
+
+            self.assertRaises(ValueError, self.read_csv,
+                              path, compression='bz3')
+
+
+class TestPythonParser(ParserTests, CompressionTests, tm.TestCase):
+
+    engine = 'python'

     def test_negative_skipfooter_raises(self):
         text = """#foo,a,b,c
@@ -2716,12 +2870,12 @@ def test_negative_skipfooter_raises(self):

     def read_csv(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'python'
+        kwds['engine'] = self.engine
         return read_csv(*args, **kwds)

     def read_table(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'python'
+        kwds['engine'] = self.engine
         return read_table(*args, **kwds)

     def float_precision_choices(self):
@@ -3521,17 +3675,19 @@ def test_buffer_rd_bytes(self):
         except Exception as e:
             pass

-class TestCParserHighMemory(CParserTests, tm.TestCase):
+
+class TestCParserHighMemory(CParserTests, CompressionTests, tm.TestCase):
+    engine = 'c'

     def read_csv(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'c'
+        kwds['engine'] = self.engine
         kwds['low_memory'] = False
         return read_csv(*args, **kwds)

     def read_table(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'c'
+        kwds['engine'] = self.engine
         kwds['low_memory'] = False
         return read_table(*args, **kwds)

@@ -3832,18 +3988,20 @@ def test_single_char_leading_whitespace(self):
         tm.assert_frame_equal(result, expected)


-class TestCParserLowMemory(CParserTests, tm.TestCase):
+class TestCParserLowMemory(CParserTests, CompressionTests, tm.TestCase):
+
+    engine = 'c'

     def read_csv(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'c'
+        kwds['engine'] = self.engine
         kwds['low_memory'] = True
         kwds['buffer_lines'] = 2
         return read_csv(*args, **kwds)

     def read_table(self, *args, **kwds):
         kwds = kwds.copy()
-        kwds['engine'] = 'c'
+        kwds['engine'] = self.engine
         kwds['low_memory'] = True
         kwds['buffer_lines'] = 2
         return read_table(*args, **kwds)
@@ -4060,86 +4218,6 @@ def test_pure_python_failover(self):
         expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
         tm.assert_frame_equal(result, expected)

-    def test_decompression(self):
-        try:
-            import gzip
-            import bz2
-        except ImportError:
-            raise nose.SkipTest('need gzip and bz2 to run')
-
-        data = open(self.csv1, 'rb').read()
-        expected = self.read_csv(self.csv1)
-
-        with tm.ensure_clean() as path:
-            tmp = gzip.GzipFile(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='gzip')
-            tm.assert_frame_equal(result, expected)
-
-            result = self.read_csv(open(path, 'rb'), compression='gzip')
-            tm.assert_frame_equal(result, expected)
-
-        with tm.ensure_clean() as path:
-            tmp = bz2.BZ2File(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            result = self.read_csv(path, compression='bz2')
-            tm.assert_frame_equal(result, expected)
-
-            # result = self.read_csv(open(path, 'rb'), compression='bz2')
-            # tm.assert_frame_equal(result, expected)
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='bz3')
-
-            with open(path, 'rb') as fin:
-                if compat.PY3:
-                    result = self.read_csv(fin, compression='bz2')
-                    tm.assert_frame_equal(result, expected)
-                else:
-                    self.assertRaises(ValueError, self.read_csv,
-                                      fin, compression='bz2')
-
-    def test_decompression_regex_sep(self):
-        try:
-            import gzip
-            import bz2
-        except ImportError:
-            raise nose.SkipTest('need gzip and bz2 to run')
-
-        data = open(self.csv1, 'rb').read()
-        data = data.replace(b',', b'::')
-        expected = self.read_csv(self.csv1)
-
-        with tm.ensure_clean() as path:
-            tmp = gzip.GzipFile(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            # GH 6607
-            # Test currently only valid with the python engine because of
-            # regex sep. Temporarily copied to TestPythonParser.
-            # Here test for ValueError when passing regex sep:
-
-            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
-                result = self.read_csv(path, sep='::', compression='gzip')
-                tm.assert_frame_equal(result, expected)
-
-        with tm.ensure_clean() as path:
-            tmp = bz2.BZ2File(path, mode='wb')
-            tmp.write(data)
-            tmp.close()
-
-            # GH 6607
-            with tm.assertRaisesRegexp(ValueError, 'regex sep'):  # XXX
-                result = self.read_csv(path, sep='::', compression='bz2')
-                tm.assert_frame_equal(result, expected)
-
-            self.assertRaises(ValueError, self.read_csv,
-                              path, compression='bz3')

     def test_memory_map(self):
         # it works!
diff --git a/pandas/parser.pyx b/pandas/parser.pyx
index e2ba8d9d07ae2..8bfc0ab8d6c56 100644
--- a/pandas/parser.pyx
+++ b/pandas/parser.pyx
@@ -567,6 +567,21 @@ cdef class TextReader:
                 else:
                     raise ValueError('Python 2 cannot read bz2 from open file '
                                      'handle')
+            elif self.compression == 'zip':
+                import zipfile
+                zip_file = zipfile.ZipFile(source)
+                zip_names = zip_file.namelist()
+
+                if len(zip_names) == 1:
+                    file_name = zip_names.pop()
+                    source = zip_file.open(file_name)
+
+                elif len(zip_names) == 0:
+                    raise ValueError('Zero files found in compressed '
+                                     'zip file %s' % source)
+                else:
+                    raise ValueError('Multiple files found in compressed '
+                                     'zip file %s' % str(zip_names))
             else:
                 raise ValueError('Unrecognized compression type: %s' %
                                  self.compression)
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index a5b86b35d330e..4faf67eda6c78 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -994,7 +994,8 @@ def test_to_csv_compression_value_error(self):

         with ensure_clean() as filename:
-            # zip compression is not supported and should raise ValueError
-            self.assertRaises(ValueError, df.to_csv,
+            # zip compression is not supported and raises BadZipfile
+            import zipfile
+            self.assertRaises(zipfile.BadZipfile, df.to_csv,
                               filename, compression="zip")

     def test_to_csv_date_format(self):
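
Usage note (not part of the commit): a minimal sketch of the behaviour this patch adds to pd.read_csv. The file names 'single.zip'/'multi.zip', the member names and the inline CSV bytes are made up for illustration; the error message matched below comes from the new branch in pandas/io/common.py.

    import zipfile

    import pandas as pd

    csv_bytes = b'a,b,c\n1,2,3\n4,5,6\n'

    # A ZIP archive containing exactly one data file can be read either with
    # compression='zip' or, because the path ends in '.zip', via the default
    # compression='infer'.
    with zipfile.ZipFile('single.zip', mode='w') as archive:
        archive.writestr('data.csv', csv_bytes)

    df = pd.read_csv('single.zip', compression='zip')
    df = pd.read_csv('single.zip')  # '.zip' suffix -> inferred

    # An archive with more than one member is rejected with a ValueError.
    with zipfile.ZipFile('multi.zip', mode='w') as archive:
        archive.writestr('part1.csv', csv_bytes)
        archive.writestr('part2.csv', csv_bytes)

    try:
        pd.read_csv('multi.zip', compression='zip')
    except ValueError as err:
        print(err)  # 'Multiple files found in ZIP file. Only one file per ZIP: ...'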