
Add Zip file functionality. Fixes #11413 #12103

Closed
wants to merge 11 commits into from
10 changes: 10 additions & 0 deletions pandas/io/common.py
@@ -345,6 +345,16 @@ def _get_handle(path, mode, encoding=None, compression=None):
elif compression == 'bz2':
import bz2
f = bz2.BZ2File(path, mode)
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(path)
zip_names = zip_file.namelist()

if len(zip_names) == 1:
file_name = zip_names.pop()
f = zip_file.open(file_name)
else:
raise ValueError('ZIP file contains multiple files {}'.format(zip_file.filename))
else:
raise ValueError('Unrecognized compression type: %s' %
compression)
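A minimal standalone sketch of the single-file rule the new branch enforces (the helper name and 'example.zip' path are hypothetical, for illustration only):

import zipfile

def open_single_member(path):
    # mirror the logic above: accept an archive only if it has exactly one member
    zip_file = zipfile.ZipFile(path)
    zip_names = zip_file.namelist()
    if len(zip_names) == 1:
        return zip_file.open(zip_names[0])
    raise ValueError('ZIP file contains multiple files {}'.format(path))

# f = open_single_member('example.zip')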
32 changes: 24 additions & 8 deletions pandas/io/parsers.py
@@ -61,11 +61,11 @@ class ParserWarning(Warning):
dtype : Type name or dict of column -> type, default None
Data type for data or columns. E.g. {'a': np.float64, 'b': np.int32}
(Unsupported with engine='python')
compression : {'gzip', 'bz2', 'infer', None}, default 'infer'
For on-the-fly decompression of on-disk data. If 'infer', then use gzip or
bz2 if filepath_or_buffer is a string ending in '.gz' or '.bz2',
respectively, and no decompression otherwise. Set to None for no
decompression.
compression : {'gzip', 'bz2', 'zip', 'infer', None}, default 'infer'
Contributor: can you add an explanation that .zip supports only a single file?

For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
bz2 or zip if filepath_or_buffer is a string ending in '.gz', '.bz2' or '.zip',
respectively, and no decompression otherwise. If using 'zip', the ZIP file must
contain only one data file to be read in. Set to None for no decompression.
dialect : string or csv.Dialect instance, default None
If None defaults to Excel dialect. Ignored if sep longer than 1 char
See csv.Dialect documentation for more details
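A short usage sketch of the behaviour described in the docstring above; 'data.csv.zip' is a hypothetical archive containing a single CSV member:

import pandas as pd

# explicit zip decompression of a single-member archive
df = pd.read_csv('data.csv.zip', compression='zip')

# the default 'infer' picks zip decompression from the '.zip' suffix
df = pd.read_csv('data.csv.zip', compression='infer')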
@@ -258,6 +258,8 @@ def _read(filepath_or_buffer, kwds):
inferred_compression = 'gzip'
elif filepath_or_buffer.endswith('.bz2'):
inferred_compression = 'bz2'
elif filepath_or_buffer.endswith('.zip'):
inferred_compression = 'zip'
else:
inferred_compression = None
else:
@@ -738,10 +740,10 @@ def _make_engine(self, engine='c'):
if engine == 'c':
self._engine = CParserWrapper(self.f, **self.options)
else:
if engine == 'python':
klass = PythonParser
elif engine == 'python-fwf':
if engine == 'python-fwf':
klass = FixedWidthFieldParser
else:  # default to engine == 'python'
klass = PythonParser
self._engine = klass(self.f, **self.options)

def _failover_to_python(self):
@@ -1387,6 +1389,20 @@ def _wrap_compressed(f, compression, encoding=None):
data = bz2.decompress(f.read())
f = StringIO(data)
return f
elif compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(f)
zip_names = zip_file.namelist()

if len(zip_names) == 1:
file_name = zip_names.pop()
f = zip_file.open(file_name)
return f

else:
raise ValueError('Multiple files found in compressed '
'zip file %s' % str(zip_names))

else:
raise ValueError('do not recognize compression method %s'
% compression)
Binary file added pandas/io/tests/data/salary.table.zip
Binary file not shown.
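The fixture itself is binary and not shown; a sketch of how such a single-member archive could be produced from the existing salary.table data file (paths assumed, for illustration only):

import zipfile

# write salary.table as the only member of salary.table.zip
with zipfile.ZipFile('pandas/io/tests/data/salary.table.zip', 'w') as zf:
    zf.write('pandas/io/tests/data/salary.table', arcname='salary.table')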
260 changes: 160 additions & 100 deletions pandas/io/tests/test_parsers.py
@@ -6,39 +6,35 @@
from datetime import datetime
import csv
import os
import sys
import re
import nose
import platform

import re
import sys
from datetime import datetime
from multiprocessing.pool import ThreadPool

from numpy import nan
import nose
import numpy as np
from pandas.io.common import DtypeWarning
import pandas.lib as lib
import pandas.parser
from numpy import nan
from numpy.testing.decorators import slow
from pandas.lib import Timestamp

import pandas as pd
import pandas.io.parsers as parsers
import pandas.tseries.tools as tools
import pandas.util.testing as tm
from pandas import DataFrame, Series, Index, MultiIndex, DatetimeIndex
from pandas import compat
from pandas.compat import(
StringIO, BytesIO, PY3, range, long, lrange, lmap, u
)
from pandas.compat import parse_date
from pandas.io.common import DtypeWarning
from pandas.io.common import URLError
import pandas.io.parsers as parsers
from pandas.io.parsers import (read_csv, read_table, read_fwf,
TextFileReader, TextParser)

import pandas.util.testing as tm
import pandas as pd

from pandas.compat import parse_date
import pandas.lib as lib
from pandas import compat
from pandas.lib import Timestamp
from pandas.tseries.index import date_range
import pandas.tseries.tools as tools

from numpy.testing.decorators import slow

import pandas.parser


class ParserTests(object):
@@ -3753,6 +3749,150 @@ def test_single_char_leading_whitespace(self):
tm.assert_frame_equal(result, expected)


class TestCompression(ParserTests, tm.TestCase):

def read_csv(self, *args, **kwargs):
return read_csv(*args, **kwargs)

def read_table(self, *args, **kwargs):
return read_table(*args, **kwargs)

def test_zip(self):
try:
import zipfile
except ImportError:
raise nose.SkipTest('need zipfile to run')

data = open(self.csv1, 'rb').read()
expected = self.read_csv(self.csv1)

with tm.ensure_clean() as path:
file_name = 'test_file'
tmp = zipfile.ZipFile(path, mode='w')
tmp.writestr(file_name, data)
tmp.close()

result = self.read_csv(path, compression='zip', engine='c')
tm.assert_frame_equal(result, expected)

result = self.read_csv(path, compression='zip', engine='python')
tm.assert_frame_equal(result, expected)

result = self.read_csv(open(path, 'rb'), compression='zip', engine='c')
tm.assert_frame_equal(result, expected)

result = self.read_csv(open(path, 'rb'), compression='zip', engine='python')
tm.assert_frame_equal(result, expected)

with tm.ensure_clean() as path:
file_names = ['test_file', 'second_file']
tmp = zipfile.ZipFile(path, mode='w')
for file_name in file_names:
tmp.writestr(file_name, data)
tmp.close()

self.assertRaises(ValueError, self.read_csv,
path, compression='zip', engine='c')

self.assertRaises(ValueError, self.read_csv,
path, compression='zip', engine='python')

def test_gzip(self):
try:
import gzip
except ImportError:
raise nose.SkipTest('need gzip to run')

data = open(self.csv1, 'rb').read()
expected = self.read_csv(self.csv1)
with tm.ensure_clean() as path:
tmp = gzip.GzipFile(path, mode='wb')
tmp.write(data)
tmp.close()

result = self.read_csv(path, compression='gzip', engine='c')
tm.assert_frame_equal(result, expected)

result = self.read_csv(path, compression='gzip', engine='python')
tm.assert_frame_equal(result, expected)

result = self.read_csv(open(path, 'rb'), compression='gzip', engine='c')
tm.assert_frame_equal(result, expected)

result = self.read_csv(open(path, 'rb'), compression='gzip', engine='python')
tm.assert_frame_equal(result, expected)

def test_bz2(self):
try:
import bz2
except ImportError:
raise nose.SkipTest('need bz2 to run')

data = open(self.csv1, 'rb').read()
expected = self.read_csv(self.csv1)
with tm.ensure_clean() as path:
tmp = bz2.BZ2File(path, mode='wb')
tmp.write(data)
tmp.close()

result = self.read_csv(path, compression='bz2', engine='c')
tm.assert_frame_equal(result, expected)

result = self.read_csv(path, compression='bz2', engine='python')
tm.assert_frame_equal(result, expected)

self.assertRaises(ValueError, self.read_csv,
path, compression='bz3')

with open(path, 'rb') as fin:
if compat.PY3:
result = self.read_csv(fin, compression='bz2', engine='c')
tm.assert_frame_equal(result, expected)
result = self.read_csv(fin, compression='bz2', engine='python')
tm.assert_frame_equal(result, expected)
else:
self.assertRaises(ValueError, self.read_csv,
fin, compression='bz2', engine='c')

def test_decompression_regex_sep(self):
try:
import gzip
import bz2
except ImportError:
raise nose.SkipTest('need gzip and bz2 to run')

data = open(self.csv1, 'rb').read()
data = data.replace(b',', b'::')
expected = self.read_csv(self.csv1)

with tm.ensure_clean() as path:
tmp = gzip.GzipFile(path, mode='wb')
tmp.write(data)
tmp.close()

# GH 6607
# Test currently only valid with the python engine because of
# regex sep. Temporarily copied to TestPythonParser.
# Here test for ValueError when passing regex sep:

with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX
result = self.read_csv(path, sep='::', compression='gzip', engine='c')
tm.assert_frame_equal(result, expected)

with tm.ensure_clean() as path:
tmp = bz2.BZ2File(path, mode='wb')
tmp.write(data)
tmp.close()

# GH 6607
with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX
result = self.read_csv(path, sep='::', compression='bz2', engine='c')
tm.assert_frame_equal(result, expected)

self.assertRaises(ValueError, self.read_csv,
path, compression='bz3')


class TestCParserLowMemory(CParserTests, tm.TestCase):

def read_csv(self, *args, **kwds):
@@ -3981,86 +4121,6 @@ def test_pure_python_failover(self):
expected = DataFrame({'a': [1, 4], 'b': [2, 5], 'c': [3, 6]})
tm.assert_frame_equal(result, expected)

def test_decompression(self):
try:
import gzip
import bz2
except ImportError:
raise nose.SkipTest('need gzip and bz2 to run')

data = open(self.csv1, 'rb').read()
expected = self.read_csv(self.csv1)

with tm.ensure_clean() as path:
tmp = gzip.GzipFile(path, mode='wb')
tmp.write(data)
tmp.close()

result = self.read_csv(path, compression='gzip')
tm.assert_frame_equal(result, expected)

result = self.read_csv(open(path, 'rb'), compression='gzip')
tm.assert_frame_equal(result, expected)

with tm.ensure_clean() as path:
tmp = bz2.BZ2File(path, mode='wb')
tmp.write(data)
tmp.close()

result = self.read_csv(path, compression='bz2')
tm.assert_frame_equal(result, expected)

# result = self.read_csv(open(path, 'rb'), compression='bz2')
# tm.assert_frame_equal(result, expected)

self.assertRaises(ValueError, self.read_csv,
path, compression='bz3')

with open(path, 'rb') as fin:
if compat.PY3:
result = self.read_csv(fin, compression='bz2')
tm.assert_frame_equal(result, expected)
else:
self.assertRaises(ValueError, self.read_csv,
fin, compression='bz2')

def test_decompression_regex_sep(self):
try:
import gzip
import bz2
except ImportError:
raise nose.SkipTest('need gzip and bz2 to run')

data = open(self.csv1, 'rb').read()
data = data.replace(b',', b'::')
expected = self.read_csv(self.csv1)

with tm.ensure_clean() as path:
tmp = gzip.GzipFile(path, mode='wb')
tmp.write(data)
tmp.close()

# GH 6607
# Test currently only valid with the python engine because of
# regex sep. Temporarily copied to TestPythonParser.
# Here test for ValueError when passing regex sep:

with tm.assertRaisesRegexp(ValueError, 'regex sep'): # XXX
result = self.read_csv(path, sep='::', compression='gzip')
tm.assert_frame_equal(result, expected)

with tm.ensure_clean() as path:
tmp = bz2.BZ2File(path, mode='wb')
tmp.write(data)
tmp.close()

# GH 6607
with tm.assertRaisesRegexp(ValueError, 'regex sep'): # XXX
result = self.read_csv(path, sep='::', compression='bz2')
tm.assert_frame_equal(result, expected)

self.assertRaises(ValueError, self.read_csv,
path, compression='bz3')

def test_memory_map(self):
# it works!
12 changes: 12 additions & 0 deletions pandas/parser.pyx
@@ -563,6 +563,18 @@ cdef class TextReader:
else:
raise ValueError('Python 2 cannot read bz2 from open file '
'handle')
elif self.compression == 'zip':
import zipfile
zip_file = zipfile.ZipFile(source)
zip_names = zip_file.namelist()

if len(zip_names) == 1:
file_name = zip_names.pop()
source = zip_file.open(file_name)

else:
raise ValueError('Multiple files found in compressed '
'zip file %s' % str(zip_names))

Contributor: maybe just do else here (e.g. you can have 0 files in an archive?)

else:
raise ValueError('Unrecognized compression type: %s' %
self.compression)
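To illustrate the error path discussed in the review comment above, a small sketch that builds a two-member archive and shows the ValueError surfacing through read_csv (file names are hypothetical; if the check were relaxed to a bare else as suggested, an empty archive would raise the same error):

import zipfile
import pandas as pd

# build an archive with two members; the zip reader expects exactly one
with zipfile.ZipFile('multi.zip', 'w') as zf:
    zf.writestr('first.csv', 'a,b\n1,2\n')
    zf.writestr('second.csv', 'a,b\n3,4\n')

try:
    pd.read_csv('multi.zip', compression='zip')
except ValueError as err:
    print(err)  # reports that multiple files were found in the archive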