Skip to content

Commit 1cb810b

Browse files
committed
add zip decompression support. refactor using lambda.
1 parent b8c4175 commit 1cb810b

File tree

5 files changed

+30
-41
lines changed

5 files changed

+30
-41
lines changed

pandas/io/common.py

+4-10
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,6 @@
1414

1515
try:
1616
from s3fs import S3File
17-
1817
need_text_wrapping = (BytesIO, S3File)
1918
except ImportError:
2019
need_text_wrapping = (BytesIO,)
@@ -29,21 +28,20 @@
2928

3029
try:
3130
import pathlib
32-
3331
_PATHLIB_INSTALLED = True
3432
except ImportError:
3533
_PATHLIB_INSTALLED = False
3634

35+
3736
try:
3837
from py.path import local as LocalPath
39-
4038
_PY_PATH_INSTALLED = True
4139
except:
4240
_PY_PATH_INSTALLED = False
4341

42+
4443
if compat.PY3:
4544
from urllib.request import urlopen, pathname2url
46-
4745
_urlopen = urlopen
4846
from urllib.parse import urlparse as parse_url
4947
from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -60,13 +58,13 @@
6058
from contextlib import contextmanager, closing # noqa
6159
from functools import wraps # noqa
6260

63-
6461
# @wraps(_urlopen)
6562
@contextmanager
6663
def urlopen(*args, **kwargs):
6764
with closing(_urlopen(*args, **kwargs)) as f:
6865
yield f
6966

67+
7068
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
7169
_VALID_URLS.discard('')
7270

@@ -77,7 +75,6 @@ class ParserError(ValueError):
7775
"""
7876
pass
7977

80-
8178
# gh-12665: Alias for now and remove later.
8279
CParserError = ParserError
8380

@@ -112,14 +109,12 @@ class BaseIterator(object):
112109
"""Subclass this and provide a "__next__()" method to obtain an iterator.
113110
Useful only when the object being iterated is non-reusable (e.g. OK for a
114111
parser, not for an in-memory table, yes for its iterator)."""
115-
116112
def __iter__(self):
117113
return self
118114

119115
def __next__(self):
120116
raise AbstractMethodError(self)
121117

122-
123118
if not compat.PY3:
124119
BaseIterator.next = lambda self: self.__next__()
125120

@@ -465,6 +460,7 @@ def __next__(self):
465460

466461

467462
class UTF8Recoder(BaseIterator):
463+
468464
"""
469465
Iterator that reads an encoded stream and reencodes the input to UTF-8
470466
"""
@@ -487,7 +483,6 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
487483
# ignore encoding
488484
return csv.reader(f, dialect=dialect, **kwds)
489485

490-
491486
def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
492487
return csv.writer(f, dialect=dialect, **kwds)
493488
else:
@@ -509,7 +504,6 @@ def __next__(self):
509504
row = next(self.reader)
510505
return [compat.text_type(s, "utf-8") for s in row]
511506

512-
513507
class UnicodeWriter:
514508

515509
"""

pandas/io/pickle.py

+19-27
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,10 @@ def read_pickle(path, compression='infer'):
4444
----------
4545
path : string
4646
File path
47-
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
47+
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
4848
For on-the-fly decompression of on-disk data. If 'infer', then use
49-
gzip, bz2 or xz if path is a string ending in '.gz', '.bz2', or 'xz',
50-
respectively, and no decompression otherwise.
49+
gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', '.xz',
50+
or '.zip' respectively, and no decompression otherwise.
5151
Set to None for no decompression.
5252
5353
.. versionadded:: 0.20.0
@@ -59,6 +59,17 @@ def read_pickle(path, compression='infer'):
5959

6060
inferred_compression = _infer_compression(path, compression)
6161

62+
def read_wrapper(func):
63+
# open the file handle, apply func to it, then close every opened handle
64+
f, fh = _get_handle(path, 'rb',
65+
compression=inferred_compression,
66+
is_text=False)
67+
try:
68+
return func(f)
69+
finally:
70+
for _f in fh:
71+
_f.close()
72+
6273
def try_read(path, encoding=None):
6374
# try with cPickle
6475
# try with current pickle, if we have a Type Error then
@@ -69,35 +80,16 @@ def try_read(path, encoding=None):
6980
# cpickle
7081
# GH 6899
7182
try:
72-
f, fh = _get_handle(path, 'rb',
73-
compression=inferred_compression,
74-
is_text=False)
75-
try:
76-
return pkl.load(f)
77-
finally:
78-
for _f in fh:
79-
_f.close()
83+
return read_wrapper(lambda f: pkl.load(f))
8084
except Exception:
8185
# reg/patched pickle
8286
try:
83-
f, fh = _get_handle(path, 'rb',
84-
compression=inferred_compression,
85-
is_text=False)
86-
try:
87-
return pc.load(f, encoding=encoding, compat=False)
88-
finally:
89-
for _f in fh:
90-
_f.close()
87+
return read_wrapper(
88+
lambda f: pc.load(f, encoding=encoding, compat=False))
9189
# compat pickle
9290
except:
93-
f, fh = _get_handle(path, 'rb',
94-
compression=inferred_compression,
95-
is_text=False)
96-
try:
97-
return pc.load(f, encoding=encoding, compat=True)
98-
finally:
99-
for _f in fh:
100-
_f.close()
91+
return read_wrapper(
92+
lambda f: pc.load(f, encoding=encoding, compat=True))
10193
try:
10294
return try_read(path)
10395
except:
Binary file not shown.

pandas/io/tests/test_pickle.py

+6-4
Original file line numberDiff line numberDiff line change
@@ -324,7 +324,9 @@ def test_compression_infer(self):
324324
for ext in extensions:
325325
yield self.compression_infer, ext
326326

327-
def compression_prepared_data(self, ext):
327+
def decompression_prepared_data(self, ext):
328+
if ext == '.xz':
329+
tm._skip_if_no_lzma()
328330
pickle_path = os.path.join(tm.get_data_path(),
329331
'pickle_compression',
330332
'data.pickle')
@@ -333,10 +335,10 @@ def compression_prepared_data(self, ext):
333335
data2 = pd.read_pickle(compressed_path)
334336
tm.assert_frame_equal(data1, data2)
335337

336-
def test_compression_prepared_data(self):
337-
extensions = ['.gz', '.bz2', '.xz']
338+
def test_decompression_prepared_data(self):
339+
extensions = ['.gz', '.bz2', '.xz', '.zip']
338340
for ext in extensions:
339-
yield self.compression_prepared_data, ext
341+
yield self.decompression_prepared_data, ext
340342

341343

342344
if __name__ == '__main__':

setup.py

+1
Original file line numberDiff line numberDiff line change
@@ -660,6 +660,7 @@ def pxd(name):
660660
package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5',
661661
'tests/data/legacy_pickle/*/*.pickle',
662662
'tests/data/legacy_msgpack/*/*.msgpack',
663+
'tests/data/pickle_compression/*',
663664
'tests/data/*.csv*',
664665
'tests/data/*.dta',
665666
'tests/data/*.pickle',

0 commit comments

Comments
 (0)