Skip to content

Commit 6df6611

Browse files
committed
pickle compression code update
1 parent 81d55a0 commit 6df6611

File tree

4 files changed

+65
-24
lines changed

4 files changed

+65
-24
lines changed

pandas/core/generic.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -1265,7 +1265,9 @@ def to_pickle(self, path, compression='infer'):
12651265
path : string
12661266
File path
12671267
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
1268-
.. versionadded:: 0.19.2
1268+
A string representing the compression to use in the output file.
1269+
1270+
.. versionadded:: 0.20.0
12691271
"""
12701272
from pandas.io.pickle import to_pickle
12711273
return to_pickle(self, path, compression=compression)

pandas/io/common.py

+14-6
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
try:
1616
from s3fs import S3File
17+
1718
need_text_wrapping = (BytesIO, S3File)
1819
except ImportError:
1920
need_text_wrapping = (BytesIO,)
@@ -28,20 +29,21 @@
2829

2930
try:
3031
import pathlib
32+
3133
_PATHLIB_INSTALLED = True
3234
except ImportError:
3335
_PATHLIB_INSTALLED = False
3436

35-
3637
try:
3738
from py.path import local as LocalPath
39+
3840
_PY_PATH_INSTALLED = True
3941
except:
4042
_PY_PATH_INSTALLED = False
4143

42-
4344
if compat.PY3:
4445
from urllib.request import urlopen, pathname2url
46+
4547
_urlopen = urlopen
4648
from urllib.parse import urlparse as parse_url
4749
from urllib.parse import (uses_relative, uses_netloc, uses_params,
@@ -58,13 +60,13 @@
5860
from contextlib import contextmanager, closing # noqa
5961
from functools import wraps # noqa
6062

63+
6164
# @wraps(_urlopen)
6265
@contextmanager
6366
def urlopen(*args, **kwargs):
6467
with closing(_urlopen(*args, **kwargs)) as f:
6568
yield f
6669

67-
6870
_VALID_URLS = set(uses_relative + uses_netloc + uses_params)
6971
_VALID_URLS.discard('')
7072

@@ -75,6 +77,7 @@ class ParserError(ValueError):
7577
"""
7678
pass
7779

80+
7881
# gh-12665: Alias for now and remove later.
7982
CParserError = ParserError
8083

@@ -109,12 +112,14 @@ class BaseIterator(object):
109112
"""Subclass this and provide a "__next__()" method to obtain an iterator.
110113
Useful only when the object being iterated is non-reusable (e.g. OK for a
111114
parser, not for an in-memory table, yes for its iterator)."""
115+
112116
def __iter__(self):
113117
return self
114118

115119
def __next__(self):
116120
raise AbstractMethodError(self)
117121

122+
118123
if not compat.PY3:
119124
BaseIterator.next = lambda self: self.__next__()
120125

@@ -318,7 +323,8 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
318323
memory_map : boolean, default False
319324
See parsers._parser_params for more information.
320325
is_text : boolean, default True
321-
whether file/buffer is in text format (csv, json, etc.), or in binary mode (pickle, etc.)
326+
whether file/buffer is in text format (csv, json, etc.), or in binary
327+
mode (pickle, etc.)
322328
Returns
323329
-------
324330
f : file-like
@@ -401,7 +407,8 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
401407
handles.append(f)
402408

403409
# in Python 3, convert BytesIO or fileobjects passed with an encoding
404-
if compat.PY3 and is_text and (compression or isinstance(f, need_text_wrapping)):
410+
if compat.PY3 and is_text and\
411+
(compression or isinstance(f, need_text_wrapping)):
405412
from io import TextIOWrapper
406413
f = TextIOWrapper(f, encoding=encoding)
407414
handles.append(f)
@@ -458,7 +465,6 @@ def __next__(self):
458465

459466

460467
class UTF8Recoder(BaseIterator):
461-
462468
"""
463469
Iterator that reads an encoded stream and reencodes the input to UTF-8
464470
"""
@@ -481,6 +487,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds):
481487
# ignore encoding
482488
return csv.reader(f, dialect=dialect, **kwds)
483489

490+
484491
def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds):
485492
return csv.writer(f, dialect=dialect, **kwds)
486493
else:
@@ -502,6 +509,7 @@ def __next__(self):
502509
row = next(self.reader)
503510
return [compat.text_type(s, "utf-8") for s in row]
504511

512+
505513
class UnicodeWriter:
506514

507515
"""

pandas/io/pickle.py

+37-12
Original file line numberDiff line numberDiff line change
@@ -17,12 +17,19 @@ def to_pickle(obj, path, compression='infer'):
1717
path : string
1818
File path
1919
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
20-
.. versionadded:: 0.19.2
20+
A string representing the compression to use in the output file.
21+
22+
.. versionadded:: 0.20.0
2123
"""
2224
inferred_compression = _infer_compression(path, compression)
23-
f, fh = _get_handle(path, 'wb', compression=inferred_compression, is_text=False)
24-
with f:
25+
f, fh = _get_handle(path, 'wb',
26+
compression=inferred_compression,
27+
is_text=False)
28+
try:
2529
pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
30+
finally:
31+
for _f in fh:
32+
_f.close()
2633

2734

2835
def read_pickle(path, compression='infer'):
@@ -38,7 +45,12 @@ def read_pickle(path, compression='infer'):
3845
path : string
3946
File path
4047
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer'
41-
.. versionadded:: 0.19.2
48+
For on-the-fly decompression of on-disk data. If 'infer', then use
49+
gzip, bz2 or xz if path is a string ending in '.gz', '.bz2', or '.xz',
50+
respectively, and no decompression otherwise.
51+
Set to None for no decompression.
52+
53+
.. versionadded:: 0.20.0
4254
4355
Returns
4456
-------
@@ -57,22 +69,35 @@ def try_read(path, encoding=None):
5769
# cpickle
5870
# GH 6899
5971
try:
60-
f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False)
61-
with f:
72+
f, fh = _get_handle(path, 'rb',
73+
compression=inferred_compression,
74+
is_text=False)
75+
try:
6276
return pkl.load(f)
77+
finally:
78+
for _f in fh:
79+
_f.close()
6380
except Exception:
6481
# reg/patched pickle
6582
try:
66-
f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False)
67-
with f:
83+
f, fh = _get_handle(path, 'rb',
84+
compression=inferred_compression,
85+
is_text=False)
86+
try:
6887
return pc.load(f, encoding=encoding, compat=False)
69-
88+
finally:
89+
for _f in fh:
90+
_f.close()
7091
# compat pickle
7192
except:
72-
f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False)
73-
with f:
93+
f, fh = _get_handle(path, 'rb',
94+
compression=inferred_compression,
95+
is_text=False)
96+
try:
7497
return pc.load(f, encoding=encoding, compat=True)
75-
98+
finally:
99+
for _f in fh:
100+
_f.close()
76101
try:
77102
return try_read(path)
78103
except:

pandas/io/tests/test_pickle.py

+11-5
Original file line numberDiff line numberDiff line change
@@ -286,18 +286,22 @@ def test_pickle_v0_15_2(self):
286286

287287
def compression_explicit(self, compression):
288288
# issue 11666
289+
if compression == 'xz':
290+
tm._skip_if_no_lzma()
289291
with tm.ensure_clean(self.path) as path:
290292
df = tm.makeDataFrame()
291293
df.to_pickle(path, compression=compression)
292-
tm.assert_frame_equal(df, pandas.read_pickle(path, compression=compression))
294+
df2 = pd.read_pickle(path, compression=compression)
295+
tm.assert_frame_equal(df, df2)
293296

294297
def test_compression_explicit(self):
295298
compressions = [None, 'gzip', 'bz2', 'xz']
296299
for c in compressions:
297300
yield self.compression_explicit, c
298301

299302
def compression_explicit_bad(self, compression):
300-
with tm.assertRaisesRegexp(ValueError, "Unrecognized compression type"):
303+
with tm.assertRaisesRegexp(ValueError,
304+
"Unrecognized compression type"):
301305
with tm.ensure_clean(self.path) as path:
302306
df = tm.makeDataFrame()
303307
df.to_pickle(path, compression=compression)
@@ -308,10 +312,12 @@ def test_compression_explicit_bad(self):
308312
yield self.compression_explicit_bad, c
309313

310314
def compression_infer(self, ext):
311-
with tm.ensure_clean(self.path + ext) as p:
315+
if ext == '.xz':
316+
tm._skip_if_no_lzma()
317+
with tm.ensure_clean(self.path + ext) as path:
312318
df = tm.makeDataFrame()
313-
df.to_pickle(p)
314-
tm.assert_frame_equal(df, pandas.read_pickle(p))
319+
df.to_pickle(path)
320+
tm.assert_frame_equal(df, pd.read_pickle(path))
315321

316322
def test_compression_infer(self):
317323
extensions = ['', '.gz', '.bz2', '.xz', '.who_am_i']

0 commit comments

Comments
 (0)