From 025a0cd3421a4afbaa9959005f1322ebde521748 Mon Sep 17 00:00:00 2001 From: goldenbull Date: Sat, 8 Oct 2016 08:53:53 +0800 Subject: [PATCH 1/9] add compression support for pickle --- pandas/core/generic.py | 6 ++++-- pandas/io/common.py | 12 ++++++++---- pandas/io/pickle.py | 31 ++++++++++++++++++++++--------- pandas/io/tests/test_pickle.py | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 68 insertions(+), 15 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 48d799811aa94..c2004c5ee190a 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1256,7 +1256,7 @@ def to_sql(self, name, con, flavor=None, schema=None, if_exists='fail', if_exists=if_exists, index=index, index_label=index_label, chunksize=chunksize, dtype=dtype) - def to_pickle(self, path): + def to_pickle(self, path, compression='infer'): """ Pickle (serialize) object to input file path. @@ -1264,9 +1264,11 @@ def to_pickle(self, path): ---------- path : string File path + compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + .. versionadded:: 0.19.2 """ from pandas.io.pickle import to_pickle - return to_pickle(self, path) + return to_pickle(self, path, compression=compression) def to_clipboard(self, excel=None, sep=None, **kwargs): """ diff --git a/pandas/io/common.py b/pandas/io/common.py index fa1022b882124..56db0139f8048 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -296,7 +296,7 @@ def _infer_compression(filepath_or_buffer, compression): def _get_handle(path_or_buf, mode, encoding=None, compression=None, - memory_map=False): + memory_map=False, is_text=True): """ Get file handle for given path/buffer and mode. @@ -311,7 +311,8 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, Supported compression protocols are gzip, bz2, zip, and xz memory_map : boolean, default False See parsers._parser_params for more information. 
- + is_text : boolean, default True + whether file/buffer is in text format (csv, json, etc.), or in binary mode (pickle, etc.) Returns ------- f : file-like @@ -385,13 +386,16 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, elif encoding: # Python 3 and encoding f = open(path_or_buf, mode, encoding=encoding) - else: + elif is_text: # Python 3 and no explicit encoding f = open(path_or_buf, mode, errors='replace') + else: + # Python 3 and binary mode + f = open(path_or_buf, mode) handles.append(f) # in Python 3, convert BytesIO or fileobjects passed with an encoding - if compat.PY3 and (compression or isinstance(f, compat.BytesIO)): + if compat.PY3 and is_text and (compression or isinstance(f, compat.BytesIO)): from io import TextIOWrapper f = TextIOWrapper(f, encoding=encoding) handles.append(f) diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 2358c296f782e..b9e593f4b7332 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -4,9 +4,10 @@ from numpy.lib.format import read_array, write_array from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3 from pandas.types.common import is_datetime64_dtype, _NS_DTYPE +from pandas.io.common import _get_handle, _infer_compression -def to_pickle(obj, path): +def to_pickle(obj, path, compression='infer'): """ Pickle (serialize) object to input file path @@ -15,12 +16,16 @@ def to_pickle(obj, path): obj : any object path : string File path + compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + .. 
versionadded:: 0.19.2 """ - with open(path, 'wb') as f: + inferred_compression = _infer_compression(path, compression) + f, fh = _get_handle(path, 'wb', compression=inferred_compression, is_text=False) + with f: pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL) -def read_pickle(path): +def read_pickle(path, compression='infer'): """ Load pickled pandas object (or any other pickled object) from the specified file path @@ -32,12 +37,16 @@ def read_pickle(path): ---------- path : string File path + compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + .. versionadded:: 0.19.2 Returns ------- unpickled : type of object stored in file """ + inferred_compression = _infer_compression(path, compression) + def try_read(path, encoding=None): # try with cPickle # try with current pickle, if we have a Type Error then @@ -48,18 +57,21 @@ def try_read(path, encoding=None): # cpickle # GH 6899 try: - with open(path, 'rb') as fh: - return pkl.load(fh) + f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False) + with f: + return pkl.load(f) except Exception: # reg/patched pickle try: - with open(path, 'rb') as fh: - return pc.load(fh, encoding=encoding, compat=False) + f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False) + with f: + return pc.load(f, encoding=encoding, compat=False) # compat pickle except: - with open(path, 'rb') as fh: - return pc.load(fh, encoding=encoding, compat=True) + f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False) + with f: + return pc.load(f, encoding=encoding, compat=True) try: return try_read(path) @@ -68,6 +80,7 @@ def try_read(path, encoding=None): return try_read(path, encoding='latin1') raise + # compat with sparse pickle / unpickle diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index a49f50b1bcb9f..30041f1741184 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -284,6 +284,40 @@ def 
test_pickle_v0_15_2(self): # tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + def compression_explicit(self, compression): + # issue 11666 + with tm.ensure_clean(self.path) as path: + df = tm.makeDataFrame() + df.to_pickle(path, compression=compression) + tm.assert_frame_equal(df, pandas.read_pickle(path, compression=compression)) + + def test_compression_explicit(self): + compressions = [None, 'gzip', 'bz2', 'xz'] + for c in compressions: + yield self.compression_explicit, c + + def compression_explicit_bad(self, compression): + with tm.assertRaisesRegexp(ValueError, "Unrecognized compression type"): + with tm.ensure_clean(self.path) as path: + df = tm.makeDataFrame() + df.to_pickle(path, compression=compression) + + def test_compression_explicit_bad(self): + compressions = ['', 'None', 'bad', '7z'] + for c in compressions: + yield self.compression_explicit_bad, c + + def compression_infer(self, ext): + with tm.ensure_clean(self.path + ext) as p: + df = tm.makeDataFrame() + df.to_pickle(p) + tm.assert_frame_equal(df, pandas.read_pickle(p)) + + def test_compression_infer(self): + extensions = ['', '.gz', '.bz2', '.xz', '.who_am_i'] + for ext in extensions: + yield self.compression_infer, ext + if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], From 6df661101f050a60b00dd963a4d08ed138267c71 Mon Sep 17 00:00:00 2001 From: goldenbull Date: Thu, 29 Dec 2016 10:52:46 +0800 Subject: [PATCH 2/9] pickle compression code update --- pandas/core/generic.py | 4 ++- pandas/io/common.py | 20 +++++++++----- pandas/io/pickle.py | 49 +++++++++++++++++++++++++--------- pandas/io/tests/test_pickle.py | 16 +++++++---- 4 files changed, 65 insertions(+), 24 deletions(-) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 0f55ade8c3bd7..dcf732da03eca 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -1265,7 +1265,9 @@ def to_pickle(self, path, compression='infer'): path : string File path 
compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' - .. versionadded:: 0.19.2 + a string representing the compression to use in the output file + + .. versionadded:: 0.20.0 """ from pandas.io.pickle import to_pickle return to_pickle(self, path, compression=compression) diff --git a/pandas/io/common.py b/pandas/io/common.py index 2ed49c1d7f120..80f983406cb5d 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -14,6 +14,7 @@ try: from s3fs import S3File + need_text_wrapping = (BytesIO, S3File) except ImportError: need_text_wrapping = (BytesIO,) @@ -28,20 +29,21 @@ try: import pathlib + _PATHLIB_INSTALLED = True except ImportError: _PATHLIB_INSTALLED = False - try: from py.path import local as LocalPath + _PY_PATH_INSTALLED = True except: _PY_PATH_INSTALLED = False - if compat.PY3: from urllib.request import urlopen, pathname2url + _urlopen = urlopen from urllib.parse import urlparse as parse_url from urllib.parse import (uses_relative, uses_netloc, uses_params, @@ -58,13 +60,13 @@ from contextlib import contextmanager, closing # noqa from functools import wraps # noqa + # @wraps(_urlopen) @contextmanager def urlopen(*args, **kwargs): with closing(_urlopen(*args, **kwargs)) as f: yield f - _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard('') @@ -75,6 +77,7 @@ class ParserError(ValueError): """ pass + # gh-12665: Alias for now and remove later. CParserError = ParserError @@ -109,12 +112,14 @@ class BaseIterator(object): """Subclass this and provide a "__next__()" method to obtain an iterator. Useful only when the object being iterated is non-reusable (e.g. 
OK for a parser, not for an in-memory table, yes for its iterator).""" + def __iter__(self): return self def __next__(self): raise AbstractMethodError(self) + if not compat.PY3: BaseIterator.next = lambda self: self.__next__() @@ -318,7 +323,8 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, memory_map : boolean, default False See parsers._parser_params for more information. is_text : boolean, default True - whether file/buffer is in text format (csv, json, etc.), or in binary mode (pickle, etc.) + whether file/buffer is in text format (csv, json, etc.), or in binary + mode (pickle, etc.) Returns ------- f : file-like @@ -401,7 +407,8 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None, handles.append(f) # in Python 3, convert BytesIO or fileobjects passed with an encoding - if compat.PY3 and is_text and (compression or isinstance(f, need_text_wrapping)): + if compat.PY3 and is_text and\ + (compression or isinstance(f, need_text_wrapping)): from io import TextIOWrapper f = TextIOWrapper(f, encoding=encoding) handles.append(f) @@ -458,7 +465,6 @@ def __next__(self): class UTF8Recoder(BaseIterator): - """ Iterator that reads an encoded stream and reencodes the input to UTF-8 """ @@ -481,6 +487,7 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds): # ignore encoding return csv.reader(f, dialect=dialect, **kwds) + def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds): return csv.writer(f, dialect=dialect, **kwds) else: @@ -502,6 +509,7 @@ def __next__(self): row = next(self.reader) return [compat.text_type(s, "utf-8") for s in row] + class UnicodeWriter: """ diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index b9e593f4b7332..ce693088224bd 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -17,12 +17,19 @@ def to_pickle(obj, path, compression='infer'): path : string File path compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' - .. 
versionadded:: 0.19.2 + a string representing the compression to use in the output file + + .. versionadded:: 0.20.0 """ inferred_compression = _infer_compression(path, compression) - f, fh = _get_handle(path, 'wb', compression=inferred_compression, is_text=False) - with f: + f, fh = _get_handle(path, 'wb', + compression=inferred_compression, + is_text=False) + try: pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL) + finally: + for _f in fh: + _f.close() def read_pickle(path, compression='infer'): @@ -38,7 +45,12 @@ def read_pickle(path, compression='infer'): path : string File path compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' - .. versionadded:: 0.19.2 + For on-the-fly decompression of on-disk data. If 'infer', then use + gzip, bz2 or xz if path is a string ending in '.gz', '.bz2', or 'xz', + respectively, and no decompression otherwise. + Set to None for no decompression. + + .. versionadded:: 0.20.0 Returns ------- @@ -57,22 +69,35 @@ def try_read(path, encoding=None): # cpickle # GH 6899 try: - f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False) - with f: + f, fh = _get_handle(path, 'rb', + compression=inferred_compression, + is_text=False) + try: return pkl.load(f) + finally: + for _f in fh: + _f.close() except Exception: # reg/patched pickle try: - f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False) - with f: + f, fh = _get_handle(path, 'rb', + compression=inferred_compression, + is_text=False) + try: return pc.load(f, encoding=encoding, compat=False) - + finally: + for _f in fh: + _f.close() # compat pickle except: - f, fh = _get_handle(path, 'rb', compression=inferred_compression, is_text=False) - with f: + f, fh = _get_handle(path, 'rb', + compression=inferred_compression, + is_text=False) + try: return pc.load(f, encoding=encoding, compat=True) - + finally: + for _f in fh: + _f.close() try: return try_read(path) except: diff --git a/pandas/io/tests/test_pickle.py 
b/pandas/io/tests/test_pickle.py index 30041f1741184..934e5599f841c 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -286,10 +286,13 @@ def test_pickle_v0_15_2(self): def compression_explicit(self, compression): # issue 11666 + if compression == 'xz': + tm._skip_if_no_lzma() with tm.ensure_clean(self.path) as path: df = tm.makeDataFrame() df.to_pickle(path, compression=compression) - tm.assert_frame_equal(df, pandas.read_pickle(path, compression=compression)) + df2 = pd.read_pickle(path, compression=compression) + tm.assert_frame_equal(df, df2) def test_compression_explicit(self): compressions = [None, 'gzip', 'bz2', 'xz'] @@ -297,7 +300,8 @@ def test_compression_explicit(self): yield self.compression_explicit, c def compression_explicit_bad(self, compression): - with tm.assertRaisesRegexp(ValueError, "Unrecognized compression type"): + with tm.assertRaisesRegexp(ValueError, + "Unrecognized compression type"): with tm.ensure_clean(self.path) as path: df = tm.makeDataFrame() df.to_pickle(path, compression=compression) @@ -308,10 +312,12 @@ def test_compression_explicit_bad(self): yield self.compression_explicit_bad, c def compression_infer(self, ext): - with tm.ensure_clean(self.path + ext) as p: + if ext == '.xz': + tm._skip_if_no_lzma() + with tm.ensure_clean(self.path + ext) as path: df = tm.makeDataFrame() - df.to_pickle(p) - tm.assert_frame_equal(df, pandas.read_pickle(p)) + df.to_pickle(path) + tm.assert_frame_equal(df, pd.read_pickle(path)) def test_compression_infer(self): extensions = ['', '.gz', '.bz2', '.xz', '.who_am_i'] From b8c4175d4c680601a976448cbd6297a71dcdfc2e Mon Sep 17 00:00:00 2001 From: goldenbull Date: Thu, 29 Dec 2016 12:58:58 +0800 Subject: [PATCH 3/9] add compressed pickle data file to io/tests --- .../io/tests/data/pickle_compression/data.pickle | Bin 0 -> 2826 bytes .../data/pickle_compression/data.pickle.bz2 | Bin 0 -> 2335 bytes .../tests/data/pickle_compression/data.pickle.gz | Bin 0 -> 2205 bytes 
.../tests/data/pickle_compression/data.pickle.xz | Bin 0 -> 2076 bytes pandas/io/tests/test_pickle.py | 14 ++++++++++++++ 5 files changed, 14 insertions(+) create mode 100644 pandas/io/tests/data/pickle_compression/data.pickle create mode 100644 pandas/io/tests/data/pickle_compression/data.pickle.bz2 create mode 100644 pandas/io/tests/data/pickle_compression/data.pickle.gz create mode 100644 pandas/io/tests/data/pickle_compression/data.pickle.xz diff --git a/pandas/io/tests/data/pickle_compression/data.pickle b/pandas/io/tests/data/pickle_compression/data.pickle new file mode 100644 index 0000000000000000000000000000000000000000..4e6eb94212745bfb8fb34605025e14a044f0add1 GIT binary patch literal 2826 zcmZuz`BxL!5(Wf>;O>kwZj8%wjH4o9M`vu=ml!YsLV{xKBn=%1A(e#n=~Gcqam9TB zQ4tkEK@<>0`kbo&;Qd4IRbTX+H|MM8Cdo=6JH>rgaWcZY{>VwdYUbAKyFk*28d9c;Va4DIC>^dc9WlW2|GVP*WWpIpG&*A0yKrvs1ln4HT}$D% zwD}`GITVt+46cykX$^!Up*By%07*-$47@Wj3oFb~y|B{b^0aytPuQRY^nn3ZIii`^ z4XYjT{PfEaU%&)w`e3aM)?I@2Hu%YCjyA{k!q49cy^*dU+ih^f@cPOe_{AJ$j=luH z+F+y6yiBm2hD{X}73KfopZGIN5!md$QV&~i|0{+WKIp4gR4nS2+#4jH*FRQpHw&S2@1jHZbb(CHKM}bt8ND!{{(@Ton$F-F?ht zZ8X7N0TjE7(}JdMcZ~)1381nAx*JPvmeOk2FMyg{)!FIGZFZ(YiU4w~N?P&J{LCgR z91wtPt}Lx=wE3GeAXNbQCbhJ&Dk~%81Cs#Km2@yS7i2>kqzS;274A~fo%U2Oqzj-l zJKP~i=^05wg{7Mr=Fp|lhZ3m}lutOSYzIR!S51W@TNbOf^Hv`{)6 z5kO12Dd4PXtuxm`o&Yjyx~w&k^y1Ve$QM8@cse>-Dy!RrP#}PiFD=8D>CDcrg+c-3 zge`VYvDXyzL6HFL5w+1Bsget1C>B6xuCt`Y?r1)i2PFb%&ka;}9&IeC%7RkXPnla4 zvMINLS*eGkKLlgZ1yf;zN+D=-avvO1msH%41J)%K+iXzvL&bkW_1{$-6@eP}6?K*4 z-+U8P;kc9D;9;OIa?>kTmJU$J;^jKwbBIjtobXB z&NDhdAMh)W9*{<_rB(C{ZyxVHL6fMR#V0gOm*^=i(v-A1gS1UC#Wcet_qTp@&SM z(KQ@Y`0qY_;&q?#3e8YoE#2XOuoo8-kJeiC;YS?D3t6jo(=!Q2@x@`9tEcO@AlW&< zaazFV^o&NBU=Y6^z%e?<@ja4g@c@3~aL0Lin4LX~$9rp6(meH`wvoQlt8I9hu2a8c z=9kCRzlko=yi|)k+~GV-qo{dts?AFG@w&cuGv34j_CDW9y^I#(62e3+E4(1?dfk_;?)-bMh`y58ah;$h(~Ga_x14 zwf~|)_u3@VqzZbx-+&LQX-4X#ck7PebR3SMmbjU&+v(d+G%Wcd-a6={mlD@}A*Q`6 zfs4yQ&aB^&gyW*R^Q6^MpJ_956s-%MwHfKMWC`J09QoxKJ>@{22Jn2NA0N?2iTZGm 
z26;G&6ZBQuhY!<^X2w!4YN_Q5MW_GkgC)wlUA)9c={`M;(UqYK0%wBQkMsCg+KIPtx;(j5uQ6RdUUZyGJ3)7(wM~s@ zV^+{-HXl~#tt*T-bp4((;>QXas-%0Uxf1AVfP?O(S&2!%@!=ecisKDyP3;@YXgHp( zuyr3Rc!HkeFbDWbiaW9c2a;G~4|wVwEgq%|o1$21G+%kEm937tk%P(b#t1{tgSR z40^T;tMKD)Qzl;u`Z@dBP*r51L8+=Gv?~uM>7Iml@WnszrW;2@c`q^hOdeje>Gfn4 z-LujFR7G>TU#h>;t73&m8eQ<>bfO7w(u8ECIeKPnIzS7O^8~%$o5V?EnMRCF)N_Qd zmFH3p&N!KU3Wu-{HTWZZhK5eFB0VbI^h&Jhc?n;TCa>I2H|T+c{Q(a2N%{d!ou26@ zbf0eRXLF5ai}ps*OcQHEYw@Ju>|Wl%sI)=CHzi&gV^=iNOp{(t$5%d+d?^f3PXf*6 z>zhyE5Y92x0G-#Rn@3t4AN7UZI@xKQ+CjH>@(f>KH*m-)<8z;$j(s!`;j80aA=g-| zzz4N7w9L*fafm7J(BRJp=&`iHPIo!e@7W*c4D2fRJ+(G)M8N5m>!EI`={WvBO~(oS z8&YqBli!;nmuNap8P%<7vih5p?vOz9p<(_O~fvXi_U33Eac9G=^NfDh$mYm&@Je<3=ywav8#YpNIP|KYy@d^~S5k z27xwNv`jMhme8ukw}m4RT+Ii7aI86 Au>b%7 literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/pickle_compression/data.pickle.bz2 b/pandas/io/tests/data/pickle_compression/data.pickle.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..993027413de7f6fc419ba8da803d0699ca88b748 GIT binary patch literal 2335 zcmV+)3E=iZT4*^jL0KkKS?Au`0RRJEfBygf|NsC0|NsC0|NsC0|NsC0|NsC0|Noc( z5C8xG;0Av_9n0&jb=~u=+Q+~j8RyH2srpP*^*xB8`6L;snM{U{Q1votq2)9Oq79@p z8hV4!&;SEK00xGd05m;A)Y0kzp`gj?0ib0!s(NYaG9f)pnj-?9r>J^QO+(c3G|D|k zsCqRvr=t@RJW1s}s(K@8dQBdu>S}4|O&+F?Q1pg@>KX&o(?AEP(?ijy^#ZAZBN~QH zCfZX*p3_u*qIonu3V9x;r>T?mPf&VCspSt)4^VAG$_J^D>M|J_pa1{>000000Td>g z1|Vd~GgHthvnqKflxX!dc#-OQXpd3qG#RLA>UxK$X^0I2)E=M!(9kjlfHVQ7ntD%B zfdGnaG$Tq)rjg`NQR%7ZH2~0ho}+32&Wx~p zDWuN{9u7w?=|VLj*q0a`z`TkgaBZ^H^xBD{3=fFl*QHY;+l2ftPG`% z>AR*e0eH+_p3v4mZQ9%=s0XKSpWMa*mYSZ|?Iju(ifDBlO0E6a1 zqrQ}@xy$O0kAiLiI<>9t$s-t?FZpW)BHqUE-7ct#+-CP zJrfjmN|jzXRl02qrq`fh((L&cCLK|isU4{lBFxZyjz|0QC>Z;!nNDY`}aSTGre90%>Jmq`Z<1Co1l# z&wI^U1B+tcdh~Z0W}ym!)xn>O@r>AxG-HEB%xra~(Y{NKmTXhbAO^YS=>(?l4ODJ2 zL}R+2Y0B*@N6)1qoJ&t|XYD zCgjn-sVshh0pneqrw)<`B`PNg=1{C8i?N0-1f{~R!{92xjB*7jn~&g0-e&NHUvhv|SYg?3&Pk 
zC-!nGm8xZ0$fL995NmTe5e~jgb~Xkq3M3JT76ED)RYp9E?lgj+*z1b6Wb%nn0ZL$sAOe5^`QsKc#yP=FKobSVuR-&n?i~I2@aq#K0`)L}tRZ#hg65(C zIttFb5kjLdCS=Qw{r#o-$IiSCnhWMje?{8~Y$BTT7Zr}$)3l}F(quVhmYhkFki^<0 zx3LB53`OA!(^!ZFs3D1%F9Ac>G|8JW>>+xZWRN~}8jO*i(RFSIh>2?od!^HsRNy@O zF5RJ0r!m$tTmV@miMUOG?e~*W&XB$W!Xb_UJO~3d5>+&ICfVgn9V=BBP%>ynVhgSt zx~jy0GZkBbe-V!^N>&l=ZTsV&pddscEZFxJ5PxPIdGSTrj!6lW2m>u59V|jY14h0p zwQhf=YD~%5vYrgQ$kj|F0(s2$S2}VW_8yy?O8sl3eTF1JJkpTWGN_PCr)_8wLW?s( z%|Wz`h=wh;PP((xJ#<+|E3SMxEo|v-#flGvlT`$^LNhiUgi-*23x}n$94A%WtXeF! z$=7}jB{{K-uP9&$MOhp;u_Y1?AB>nGqGtZ=#E~(gYQ&>#A{7*Av5+){{4&9){FRWK z&Ue#T1uYm7wjTLI3nIME6K!zipH*Ef1#dziz(#czq=KF0>Z##apOP= z3Mz}{;aqZ2O)5Tz@KvIUX%_kDqO?>65f7t?G!+CDim{Mb#fYjZqAZHB7@~qO7@~@z ztVN2Vs3@W_6%<4jR8k0tAc!b}sH{a~Scs~kD;5aFQAAY~Vu-8}RTLN~!9+n2Re~`^ zB8*ld#8pLNtW{AJj1-Fn1(1qGB9MA8?v~tsV7QU!e$@5ag3X$z^U#8pTcbcI1Vnzk3C4p})8?{amUyj7x$siK-nP%cK~z)<<3&OZ)je%%zlSNi=^`f9nl%^cMmT#r6h#yTJxx~%FSjf2+ zleY{czx+HO(vz9O>Va%qXC@K%WQ25s)X$QPoyN@lE!Uw_Cw#yI@&E@oyOJrwgoi%% F&iKV{2?>0Bu%zR})th2NZsYMDVI>n%*bNHlYN0Ex? zV8j1PtZl|#fjGTwJ$u)j>s-FZdVg!!cYUKb(=fmeR3R zZ5O^UMzq71E{ChZExCd^$;S&m-bPy_8+O81witeT#gNBmfUi4XlLa>4f-M&KMsJKX zu5X8L*9^Ktt$xaj>o8GuivvwJ_mz0zg|HVJ;&nSdohx1Mq z?9^-fS+GlcWu+UqUG z=IX*iIH>`?)H=zR=gY{kfDlfhGuP%z7n1`ia7qK}Qw%PfSwXIDaQIO{=^)s^oy_&t!P0oIUQ;|!IFIU?kT zL(jAq)LU(JXR@F`1DY~@3K zuEw3NG^@)woWvW)zB)i70*>R0leAbx_i;tAa)6Vxf}=D-V{BjmmyhBEUE}zk3AB0? 
zKXbT~ygkg$Mq=>MrY~uU+ECd>%k*kD-lqH1DH!>spE|eGO}KyvEz~Ypu}v`LQMVdUU&+2o16_}zYuJA5awH2aQ6KvoE5`|);Q%fP zF|D|CyvR>)IK=+VILyhrNo~|8;Gla0^>OW0g-yTHfOAtkDMAVLAJO5Ha+()f=-uX1 zI2(-8 z|Efh#w*^xG-{RPJXK088hJ1Lv#*5ErPM{7PpaEWv;}k6mhw*9h>9qCKj!F`JveMH% zRrFY;6g{VfolW1<0IQF3t)^)XS2*|{uJ>vDgDXGnrA3zP$MG0iY<5R+DMp+$7)LX1 zdWed!XA92eQ5P)=MqJ8R?`@)4R2uOW^?rSMH@#N5V&9)v^qN~#Dcs3?={)OGHSW>G9?y|x z8W;5IVlH4epnABMWhd}8-rld4a(XAecAIo+wsXsH?tmeUF9o&FmaEJ1Of(>r)d%)x z;WRxGuoqwahFwk^)5?2`tz{!lz8k!c#z*HYUlzE(zs44k*K^%)$*4piXh;1wFY%#3u&)a4d% zf_npBkRpD0gdWfn0Xuyh=<}4LG^%C#IX$L_M_65Za=|b+br`=2l-hpc=c3Gkd9L zAMfx5_5cU%B9400bR4Gc5MLeda=FGz2|lT$!F5)4iGysimj=E)O8vrCEA?@v-?P6S zKA%i8nAeSVx4 zaEVvIt<6L;t-0=`C38=1rwR&S7F8iOJ5ho>k9Pip|SxV7O? fn{FYXy*uiqkf=?R!72zfIuri}HoNmp3JU-L{=he| literal 0 HcmV?d00001 diff --git a/pandas/io/tests/data/pickle_compression/data.pickle.xz b/pandas/io/tests/data/pickle_compression/data.pickle.xz new file mode 100644 index 0000000000000000000000000000000000000000..3e6dacf0f8432de70939e055f0596f860f1a547a GIT binary patch literal 2076 zcmV+%2;=wtH+ooF0004LBHlIv03iVw0000-SUN-C3ke6`T>wA;hy%+I>67OR zHPj)wP~TxY+*N)wtLstG{$1^gEAj_qDGka-tK88&Jko=Z))U`;Ph1dC)m9kA1ju`V zh(m$C{#)JvzR3QUVGQG`DswaJ`daFjUjNyka}U7lq?52^h`Tw!*Tw<_M2>^dN zy9W*Nbym%EQ~$p}@u<$Z#hZK*Mp*O?7qjC>!VX@p6fD|AhG~NVSb)AWhrhVZiFPSjMQP63 z!=_$ssADR?56f<61nxQ@B30`&j6;8vm=Hqk_Ije_6TbK9VrUs<&!QuH8Z^@A|kvSyRjpD>`#VO zqhcn1mxf?Y9!Sz1`*KQB8Jcf&ZYjo-E162=Eqo(AU|^l>u^eQqQ)Y-C-G>lg@JDNt zMtqr8haq5^LY)KZ1v52qhKo=)p!nd#e(^$N# zz_4^o7gTlLHo%cYzojV-ZXuS$4Qx*E-=a@`-O!qfH4LyE<-ufI$znfa^~LsdLrkcj z36(mr1TtHAfUnvhOzt)%yMHj;)IPzI&uf0BQi&uSfWF!>UG6YAwy6~I3ir`G#sx9q zz_Z{o!`%^T?Hip;1(n{rnJ{o5Wj%wp{=EnR)t1 zW(GLUmf>qPX?|BBjL&qL__@gDW&RO%OyOL1KSkyx64dJ^BVxh-l5q+ywJVCq za;=!!2D@|j)whQgxk=?h?M?pb8q`ut8jTv)*k+DVq-adZQgh@^qx+uscDmb`%ChWWbyzlOG1MUJp0DKj`KF`-O7>eb&bn(N01lzIxCWA~tKlt&Cm4 zr^`;?P^pE4jsF$IPZKXj1R=gEh|~gA&WMV8=M;)7Gjh-m-iq<@aYW*7=?F~v>`E!CE z;hysUl<*M&D_*$U;nIBJ{YB?#+D70rR8A+85L>cD|Jf)P0xz`>p_`F%PDc7;*a_mM 
z-5)z2Z}ai*xH)4EXZegJfI{q5nuFdE52Rg~>Z~}NHlvw1uq}Azwi|W5toUwB0*(GB z|1Ku9g7CCbS2}V5eJfAZ)fUj6s)YXf@?_})i;_?y$WIH?@fyD+kOA$ z)+tG0W2#b~P}^{Yox%Tahp*VCMWssWFO^buuFc z2R%13TDqXRU8`EbpIOb2b_I1Y3e&CT#m&H=694i}mR!^9^Tbm&e!j5auJzkU^*>}^ z#|CG1*SCzHwx@yLgX3Wx7|UtrAcCY=s;M}*Lkbd0OJ0{D{JI|e^@pwk0Tt+9n$7%o zcHKFYhfRV^hWD{^m|l9!n#m)VQYN)Kyftv}qX-ff=T=psW+6%Kz}zrjcVjc)#Oj$o zA*Lw(hf-<3M6+z>7|hYvHduo`+Qo9mZ;okx>5~`PwDt+K%_!vN`8UJcxw>-`>7>4; zi4d#uKKK+}-si)<Z*>qf^zY zOqdtRNa9`{?;mixG!8sqBOKO`;{Px0z6~xa^7H=F_9+rR2nJbfrIjO?H<$ar_h(|p z9urd(H1LL{_lR5JFlCr7gTi&R9!oZobd|x=hnbxO4^*5kc)immi0LhA8Y$&rbrM?Z z%Xt`Yz$AP%0`A)`b#ga5Yo_yTjvM8+H~;`PyYo!|0rwAz761S+ Date: Fri, 30 Dec 2016 15:07:06 +0800 Subject: [PATCH 4/9] add zip decompression support. refactor using lambda. --- pandas/io/common.py | 14 ++---- pandas/io/pickle.py | 46 ++++++++---------- .../data/pickle_compression/data.pickle.zip | Bin 0 -> 2331 bytes pandas/io/tests/test_pickle.py | 10 ++-- setup.py | 1 + 5 files changed, 30 insertions(+), 41 deletions(-) create mode 100644 pandas/io/tests/data/pickle_compression/data.pickle.zip diff --git a/pandas/io/common.py b/pandas/io/common.py index 80f983406cb5d..3e2bb68f609d1 100644 --- a/pandas/io/common.py +++ b/pandas/io/common.py @@ -14,7 +14,6 @@ try: from s3fs import S3File - need_text_wrapping = (BytesIO, S3File) except ImportError: need_text_wrapping = (BytesIO,) @@ -29,21 +28,20 @@ try: import pathlib - _PATHLIB_INSTALLED = True except ImportError: _PATHLIB_INSTALLED = False + try: from py.path import local as LocalPath - _PY_PATH_INSTALLED = True except: _PY_PATH_INSTALLED = False + if compat.PY3: from urllib.request import urlopen, pathname2url - _urlopen = urlopen from urllib.parse import urlparse as parse_url from urllib.parse import (uses_relative, uses_netloc, uses_params, @@ -60,13 +58,13 @@ from contextlib import contextmanager, closing # noqa from functools import wraps # noqa - # @wraps(_urlopen) @contextmanager def urlopen(*args, **kwargs): with 
closing(_urlopen(*args, **kwargs)) as f: yield f + _VALID_URLS = set(uses_relative + uses_netloc + uses_params) _VALID_URLS.discard('') @@ -77,7 +75,6 @@ class ParserError(ValueError): """ pass - # gh-12665: Alias for now and remove later. CParserError = ParserError @@ -112,14 +109,12 @@ class BaseIterator(object): """Subclass this and provide a "__next__()" method to obtain an iterator. Useful only when the object being iterated is non-reusable (e.g. OK for a parser, not for an in-memory table, yes for its iterator).""" - def __iter__(self): return self def __next__(self): raise AbstractMethodError(self) - if not compat.PY3: BaseIterator.next = lambda self: self.__next__() @@ -465,6 +460,7 @@ def __next__(self): class UTF8Recoder(BaseIterator): + """ Iterator that reads an encoded stream and reencodes the input to UTF-8 """ @@ -487,7 +483,6 @@ def UnicodeReader(f, dialect=csv.excel, encoding="utf-8", **kwds): # ignore encoding return csv.reader(f, dialect=dialect, **kwds) - def UnicodeWriter(f, dialect=csv.excel, encoding="utf-8", **kwds): return csv.writer(f, dialect=dialect, **kwds) else: @@ -509,7 +504,6 @@ def __next__(self): row = next(self.reader) return [compat.text_type(s, "utf-8") for s in row] - class UnicodeWriter: """ diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index ce693088224bd..969a2a51cb15d 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -44,10 +44,10 @@ def read_pickle(path, compression='infer'): ---------- path : string File path - compression : {'infer', 'gzip', 'bz2', 'xz', None}, default 'infer' + compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer' For on-the-fly decompression of on-disk data. If 'infer', then use - gzip, bz2 or xz if path is a string ending in '.gz', '.bz2', or 'xz', - respectively, and no decompression otherwise. + gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', 'xz', + or 'zip' respectively, and no decompression otherwise. Set to None for no decompression. 
.. versionadded:: 0.20.0 @@ -59,6 +59,17 @@ def read_pickle(path, compression='infer'): inferred_compression = _infer_compression(path, compression) + def read_wrapper(func): + # wrapper file handle open/close operation + f, fh = _get_handle(path, 'rb', + compression=inferred_compression, + is_text=False) + try: + return func(f) + finally: + for _f in fh: + _f.close() + def try_read(path, encoding=None): # try with cPickle # try with current pickle, if we have a Type Error then @@ -69,35 +80,16 @@ def try_read(path, encoding=None): # cpickle # GH 6899 try: - f, fh = _get_handle(path, 'rb', - compression=inferred_compression, - is_text=False) - try: - return pkl.load(f) - finally: - for _f in fh: - _f.close() + return read_wrapper(lambda f: pkl.load(f)) except Exception: # reg/patched pickle try: - f, fh = _get_handle(path, 'rb', - compression=inferred_compression, - is_text=False) - try: - return pc.load(f, encoding=encoding, compat=False) - finally: - for _f in fh: - _f.close() + return read_wrapper( + lambda f: pc.load(f, encoding=encoding, compat=False)) # compat pickle except: - f, fh = _get_handle(path, 'rb', - compression=inferred_compression, - is_text=False) - try: - return pc.load(f, encoding=encoding, compat=True) - finally: - for _f in fh: - _f.close() + return read_wrapper( + lambda f: pc.load(f, encoding=encoding, compat=True)) try: return try_read(path) except: diff --git a/pandas/io/tests/data/pickle_compression/data.pickle.zip b/pandas/io/tests/data/pickle_compression/data.pickle.zip new file mode 100644 index 0000000000000000000000000000000000000000..8b1e43cef1cb7035a07cf69575093c573a113ca3 GIT binary patch literal 2331 zcmV+$3FP)rO9KQH0000803c_bNjAIlO@9ag0168L01E&B0AyiwVJ>iKV{2?>ZB}_# z6IT=m6ok;)s;#XXb*VL0MZzMWtt0yq113O7RE(2kU;-iJCSkK$6$KR+WN|}Q5fwq9 zARvOwIrn~pexdfxgr3u$)BKTn^LuxD_x|pido9Z47kzFqnCNNUiXkyy zTS3IZ>kxSbqV%@th=>S>n|EN{e^flakQDHV-k>hq+u*7*i9WGL3g}=xYewsk&5VG!~aUGZN^@KIK6E>d)J)nT)xJ7e{0xxeWN$z5d#6S 
zRp$sut_EK)6lipXbPykw(y>)-7rrn?w8NJ!hpWLYxq>>$#|u8*Mq4BscEVS-7=C)i zkjH0$uRCCq1vcM;Ef)AjZ;UjqZ-;N!47x+De%9M+i(>iG4EW9%VT`;5-&7hd6k1G$<#0p;Dl%nzizl+kYSdR^G|1|)tcd`28hPe z!qOUxw=NZuG$7j`7uJ*=OAUCypaCgT3K;8h(jgg=HNbT&*ea#itx0Z3(SXAA;3a=$ zX?{ilq-ucqOt3n|kdf&rg)|MwDsL78{*pSA2aah#u`gI$lU`IBYJhYNs0~yYo2<@i z8$gBzfU7R2$=YDD=0axJMp8%=-5&SZA~+uQk|P%Q13|O77*1$Fi{DaUlFn8*o8T7> zFyx&{YYr5pr5D4m;e6EE>n+CS>cT=esR6#!I?0#k%gC{S5Kf^p*XBzXlLIMmN(1Us z3_g2V!&ze`WNAQJMXR|Yl#-uR3)vb_39jbm`qJ_yKjdgYz>}QnNwcSCS3+($>p_#% zmG3tAJ&>mX){tD|43&vFBIJid&$JiRTWxh`vY&MIs?pM7F#S(_797Hf%5;dI3ffU&O7o(+n;eap z;5F=|KKw)-^a-a>*-9UI=RLm1CAx_Ub=J|8-k7c$$A{jgO9vBZQfQzlYWw=vXu8g5 zH~oRjym~?ky%sjo2zH&jbe^VBxr)zem~PP!trJ9^9ES}J^l_tV4yPj1cd^N~b*FGhWUC)d_=cXM!jA9Z zfW&`~=>yAs#5*)k9hKC}0bx6?#+|M-tIIi@#2d%HIzS@=j^m4yv{*&=aYe9lfRnU> zqclQeY+wMFkKzPf82P22 zI=9nJS`sRemwW9eX&e?64lfi%f;m& z=eO*Q$4Rxi9D`Cx9flojBX6_+szpz?1ycau;@Ed*Xov-de0aUai_d6Ipbi|M0bY*d z6fFyf@oDnuwDr`EN)mmt($hUv^jM`7J*S18P2bZ1tB-Q6rfCjWIQSl}_i6ltD?ja} zMV9Qx@fcccc1Lk3Mw~PlM>B4Eh>Eah3(n?I7cB}#T*_GQZK7FJ8u1nNetmg2y;iwm z=B5}>`OlTfIDo4(kx45wB(T?oG|P$I>p0KjOqk(1nq!|mf*f@bXU^|eyTyCXyJJHf zDQu??Z{aG74MnNUPzizae(c00>=*XoL!2#6D5P<9=Um=7?Bav>3Y%(cu10-9A6b1^ zqPLD9K2Z64QIB&aG+0WHP;tc4T^|SCP74AX{mjid5fRM?R%)BJ713}E-C^wx>g7%w z#bFNcl@NVuFLuW>#h&oiJ6b(SH?~JG)oA8IA$CX?)EvZC;WNCBw~w(rO*vG&c+$Ai>3rKEz*d-_9(3g_Ve`qP&_A*X&TbkQrjuMRz`&koVT;} z863n8RN&{}6&k$EjC9G=RpMraEMPZE^HO>O@W&x*cFY{(X^Y>vCKWmm%<>m#nM8ydiX>R;ToIj zrt2zovq*{NR$n=w8oP`$d#Pt1@9+io00-?Nj(XH|9H#CNUmfpqxyDKfKB=U^byjwX zgKV;w2EIK?{lZo&^>L=(v%elbpH=3$r_utp5ZIk!6`TzZ9q0ZvbevbeAypQ*ur?Gq zw4vjoUfv}q$Un&H_#&=-ew-F?iC4g_%~eT>hSWp@NLQd*uK`YVQVc>(706e>6ULfE zZ=)21TDjB~tA0&u0uGi6LY>Rua5j3pJeoNix}ewN;`z(VAFTE2C#&89zD7|SnZy=W zZ;)degCX#Trzi(cYXG*mwc$~lZXuw(JL;v7s7;i?DhM??6aNNKO928u02BZK00;me zXPrqlyYo$d2mk;I3jhEM03-ka0000003ZMW000000AyiwVJ>iKV{2?>3IHGg00000 z0RR{P`NQG+k73dQ`NQG+k73dQ`NQG+k73dQP)h{{000000RRC2T>t<8s0aW60023q BQ;z@u literal 0 HcmV?d00001 diff --git a/pandas/io/tests/test_pickle.py 
b/pandas/io/tests/test_pickle.py index 795666d74357f..0d4f848f0ea77 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -324,7 +324,9 @@ def test_compression_infer(self): for ext in extensions: yield self.compression_infer, ext - def compression_prepared_data(self, ext): + def decompression_prepared_data(self, ext): + if ext == '.xz': + tm._skip_if_no_lzma() pickle_path = os.path.join(tm.get_data_path(), 'pickle_compression', 'data.pickle') @@ -333,10 +335,10 @@ def compression_prepared_data(self, ext): data2 = pd.read_pickle(compressed_path) tm.assert_frame_equal(data1, data2) - def test_compression_prepared_data(self): - extensions = ['.gz', '.bz2', '.xz'] + def test_decompression_prepared_data(self): + extensions = ['.gz', '.bz2', '.xz', '.zip'] for ext in extensions: - yield self.compression_prepared_data, ext + yield self.decompression_prepared_data, ext if __name__ == '__main__': diff --git a/setup.py b/setup.py index 0a84cf527bfb1..09c83e060dc4d 100755 --- a/setup.py +++ b/setup.py @@ -660,6 +660,7 @@ def pxd(name): package_data={'pandas.io': ['tests/data/legacy_hdf/*.h5', 'tests/data/legacy_pickle/*/*.pickle', 'tests/data/legacy_msgpack/*/*.msgpack', + 'tests/data/pickle_compression/*', 'tests/data/*.csv*', 'tests/data/*.dta', 'tests/data/*.pickle', From 9a07250eb935ca7dbeb6ecba82c94daee195fb97 Mon Sep 17 00:00:00 2001 From: goldenbull Date: Tue, 3 Jan 2017 12:56:52 +0800 Subject: [PATCH 5/9] Remove prepared compressed data. 
_get_handle will take care of compressed I/O --- .../tests/data/pickle_compression/data.pickle | Bin 2826 -> 0 bytes .../data/pickle_compression/data.pickle.bz2 | Bin 2335 -> 0 bytes .../data/pickle_compression/data.pickle.gz | Bin 2205 -> 0 bytes .../data/pickle_compression/data.pickle.xz | Bin 2076 -> 0 bytes .../data/pickle_compression/data.pickle.zip | Bin 2331 -> 0 bytes pandas/io/tests/test_pickle.py | 17 ----------------- setup.py | 1 - 7 files changed, 18 deletions(-) delete mode 100644 pandas/io/tests/data/pickle_compression/data.pickle delete mode 100644 pandas/io/tests/data/pickle_compression/data.pickle.bz2 delete mode 100644 pandas/io/tests/data/pickle_compression/data.pickle.gz delete mode 100644 pandas/io/tests/data/pickle_compression/data.pickle.xz delete mode 100644 pandas/io/tests/data/pickle_compression/data.pickle.zip diff --git a/pandas/io/tests/data/pickle_compression/data.pickle b/pandas/io/tests/data/pickle_compression/data.pickle deleted file mode 100644 index 4e6eb94212745bfb8fb34605025e14a044f0add1..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2826 zcmZuz`BxL!5(Wf>;O>kwZj8%wjH4o9M`vu=ml!YsLV{xKBn=%1A(e#n=~Gcqam9TB zQ4tkEK@<>0`kbo&;Qd4IRbTX+H|MM8Cdo=6JH>rgaWcZY{>VwdYUbAKyFk*28d9c;Va4DIC>^dc9WlW2|GVP*WWpIpG&*A0yKrvs1ln4HT}$D% zwD}`GITVt+46cykX$^!Up*By%07*-$47@Wj3oFb~y|B{b^0aytPuQRY^nn3ZIii`^ z4XYjT{PfEaU%&)w`e3aM)?I@2Hu%YCjyA{k!q49cy^*dU+ih^f@cPOe_{AJ$j=luH z+F+y6yiBm2hD{X}73KfopZGIN5!md$QV&~i|0{+WKIp4gR4nS2+#4jH*FRQpHw&S2@1jHZbb(CHKM}bt8ND!{{(@Ton$F-F?ht zZ8X7N0TjE7(}JdMcZ~)1381nAx*JPvmeOk2FMyg{)!FIGZFZ(YiU4w~N?P&J{LCgR z91wtPt}Lx=wE3GeAXNbQCbhJ&Dk~%81Cs#Km2@yS7i2>kqzS;274A~fo%U2Oqzj-l zJKP~i=^05wg{7Mr=Fp|lhZ3m}lutOSYzIR!S51W@TNbOf^Hv`{)6 z5kO12Dd4PXtuxm`o&Yjyx~w&k^y1Ve$QM8@cse>-Dy!RrP#}PiFD=8D>CDcrg+c-3 zge`VYvDXyzL6HFL5w+1Bsget1C>B6xuCt`Y?r1)i2PFb%&ka;}9&IeC%7RkXPnla4 zvMINLS*eGkKLlgZ1yf;zN+D=-avvO1msH%41J)%K+iXzvL&bkW_1{$-6@eP}6?K*4 z-+U8P;kc9D;9;OIa?>kTmJU$J;^jKwbBIjtobXB 
z&NDhdAMh)W9*{<_rB(C{ZyxVHL6fMR#V0gOm*^=i(v-A1gS1UC#Wcet_qTp@&SM z(KQ@Y`0qY_;&q?#3e8YoE#2XOuoo8-kJeiC;YS?D3t6jo(=!Q2@x@`9tEcO@AlW&< zaazFV^o&NBU=Y6^z%e?<@ja4g@c@3~aL0Lin4LX~$9rp6(meH`wvoQlt8I9hu2a8c z=9kCRzlko=yi|)k+~GV-qo{dts?AFG@w&cuGv34j_CDW9y^I#(62e3+E4(1?dfk_;?)-bMh`y58ah;$h(~Ga_x14 zwf~|)_u3@VqzZbx-+&LQX-4X#ck7PebR3SMmbjU&+v(d+G%Wcd-a6={mlD@}A*Q`6 zfs4yQ&aB^&gyW*R^Q6^MpJ_956s-%MwHfKMWC`J09QoxKJ>@{22Jn2NA0N?2iTZGm z26;G&6ZBQuhY!<^X2w!4YN_Q5MW_GkgC)wlUA)9c={`M;(UqYK0%wBQkMsCg+KIPtx;(j5uQ6RdUUZyGJ3)7(wM~s@ zV^+{-HXl~#tt*T-bp4((;>QXas-%0Uxf1AVfP?O(S&2!%@!=ecisKDyP3;@YXgHp( zuyr3Rc!HkeFbDWbiaW9c2a;G~4|wVwEgq%|o1$21G+%kEm937tk%P(b#t1{tgSR z40^T;tMKD)Qzl;u`Z@dBP*r51L8+=Gv?~uM>7Iml@WnszrW;2@c`q^hOdeje>Gfn4 z-LujFR7G>TU#h>;t73&m8eQ<>bfO7w(u8ECIeKPnIzS7O^8~%$o5V?EnMRCF)N_Qd zmFH3p&N!KU3Wu-{HTWZZhK5eFB0VbI^h&Jhc?n;TCa>I2H|T+c{Q(a2N%{d!ou26@ zbf0eRXLF5ai}ps*OcQHEYw@Ju>|Wl%sI)=CHzi&gV^=iNOp{(t$5%d+d?^f3PXf*6 z>zhyE5Y92x0G-#Rn@3t4AN7UZI@xKQ+CjH>@(f>KH*m-)<8z;$j(s!`;j80aA=g-| zzz4N7w9L*fafm7J(BRJp=&`iHPIo!e@7W*c4D2fRJ+(G)M8N5m>!EI`={WvBO~(oS z8&YqBli!;nmuNap8P%<7vih5p?vOz9p<(_O~fvXi_U33Eac9G=^NfDh$mYm&@Je<3=ywav8#YpNIP|KYy@d^~S5k z27xwNv`jMhme8ukw}m4RT+Ii7aI86 Au>b%7 diff --git a/pandas/io/tests/data/pickle_compression/data.pickle.bz2 b/pandas/io/tests/data/pickle_compression/data.pickle.bz2 deleted file mode 100644 index 993027413de7f6fc419ba8da803d0699ca88b748..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2335 zcmV+)3E=iZT4*^jL0KkKS?Au`0RRJEfBygf|NsC0|NsC0|NsC0|NsC0|NsC0|Noc( z5C8xG;0Av_9n0&jb=~u=+Q+~j8RyH2srpP*^*xB8`6L;snM{U{Q1votq2)9Oq79@p z8hV4!&;SEK00xGd05m;A)Y0kzp`gj?0ib0!s(NYaG9f)pnj-?9r>J^QO+(c3G|D|k zsCqRvr=t@RJW1s}s(K@8dQBdu>S}4|O&+F?Q1pg@>KX&o(?AEP(?ijy^#ZAZBN~QH zCfZX*p3_u*qIonu3V9x;r>T?mPf&VCspSt)4^VAG$_J^D>M|J_pa1{>000000Td>g z1|Vd~GgHthvnqKflxX!dc#-OQXpd3qG#RLA>UxK$X^0I2)E=M!(9kjlfHVQ7ntD%B zfdGnaG$Tq)rjg`NQR%7ZH2~0ho}+32&Wx~p zDWuN{9u7w?=|VLj*q0a`z`TkgaBZ^H^xBD{3=fFl*QHY;+l2ftPG`% 
z>AR*e0eH+_p3v4mZQ9%=s0XKSpWMa*mYSZ|?Iju(ifDBlO0E6a1 zqrQ}@xy$O0kAiLiI<>9t$s-t?FZpW)BHqUE-7ct#+-CP zJrfjmN|jzXRl02qrq`fh((L&cCLK|isU4{lBFxZyjz|0QC>Z;!nNDY`}aSTGre90%>Jmq`Z<1Co1l# z&wI^U1B+tcdh~Z0W}ym!)xn>O@r>AxG-HEB%xra~(Y{NKmTXhbAO^YS=>(?l4ODJ2 zL}R+2Y0B*@N6)1qoJ&t|XYD zCgjn-sVshh0pneqrw)<`B`PNg=1{C8i?N0-1f{~R!{92xjB*7jn~&g0-e&NHUvhv|SYg?3&Pk zC-!nGm8xZ0$fL995NmTe5e~jgb~Xkq3M3JT76ED)RYp9E?lgj+*z1b6Wb%nn0ZL$sAOe5^`QsKc#yP=FKobSVuR-&n?i~I2@aq#K0`)L}tRZ#hg65(C zIttFb5kjLdCS=Qw{r#o-$IiSCnhWMje?{8~Y$BTT7Zr}$)3l}F(quVhmYhkFki^<0 zx3LB53`OA!(^!ZFs3D1%F9Ac>G|8JW>>+xZWRN~}8jO*i(RFSIh>2?od!^HsRNy@O zF5RJ0r!m$tTmV@miMUOG?e~*W&XB$W!Xb_UJO~3d5>+&ICfVgn9V=BBP%>ynVhgSt zx~jy0GZkBbe-V!^N>&l=ZTsV&pddscEZFxJ5PxPIdGSTrj!6lW2m>u59V|jY14h0p zwQhf=YD~%5vYrgQ$kj|F0(s2$S2}VW_8yy?O8sl3eTF1JJkpTWGN_PCr)_8wLW?s( z%|Wz`h=wh;PP((xJ#<+|E3SMxEo|v-#flGvlT`$^LNhiUgi-*23x}n$94A%WtXeF! z$=7}jB{{K-uP9&$MOhp;u_Y1?AB>nGqGtZ=#E~(gYQ&>#A{7*Av5+){{4&9){FRWK z&Ue#T1uYm7wjTLI3nIME6K!zipH*Ef1#dziz(#czq=KF0>Z##apOP= z3Mz}{;aqZ2O)5Tz@KvIUX%_kDqO?>65f7t?G!+CDim{Mb#fYjZqAZHB7@~qO7@~@z ztVN2Vs3@W_6%<4jR8k0tAc!b}sH{a~Scs~kD;5aFQAAY~Vu-8}RTLN~!9+n2Re~`^ zB8*ld#8pLNtW{AJj1-Fn1(1qGB9MA8?v~tsV7QU!e$@5ag3X$z^U#8pTcbcI1Vnzk3C4p})8?{amUyj7x$siK-nP%cK~z)<<3&OZ)je%%zlSNi=^`f9nl%^cMmT#r6h#yTJxx~%FSjf2+ zleY{czx+HO(vz9O>Va%qXC@K%WQ25s)X$QPoyN@lE!Uw_Cw#yI@&E@oyOJrwgoi%% F&iKV{2?>0Bu%zR})th2NZsYMDVI>n%*bNHlYN0Ex? zV8j1PtZl|#fjGTwJ$u)j>s-FZdVg!!cYUKb(=fmeR3R zZ5O^UMzq71E{ChZExCd^$;S&m-bPy_8+O81witeT#gNBmfUi4XlLa>4f-M&KMsJKX zu5X8L*9^Ktt$xaj>o8GuivvwJ_mz0zg|HVJ;&nSdohx1Mq z?9^-fS+GlcWu+UqUG z=IX*iIH>`?)H=zR=gY{kfDlfhGuP%z7n1`ia7qK}Qw%PfSwXIDaQIO{=^)s^oy_&t!P0oIUQ;|!IFIU?kT zL(jAq)LU(JXR@F`1DY~@3K zuEw3NG^@)woWvW)zB)i70*>R0leAbx_i;tAa)6Vxf}=D-V{BjmmyhBEUE}zk3AB0? 
zKXbT~ygkg$Mq=>MrY~uU+ECd>%k*kD-lqH1DH!>spE|eGO}KyvEz~Ypu}v`LQMVdUU&+2o16_}zYuJA5awH2aQ6KvoE5`|);Q%fP zF|D|CyvR>)IK=+VILyhrNo~|8;Gla0^>OW0g-yTHfOAtkDMAVLAJO5Ha+()f=-uX1 zI2(-8 z|Efh#w*^xG-{RPJXK088hJ1Lv#*5ErPM{7PpaEWv;}k6mhw*9h>9qCKj!F`JveMH% zRrFY;6g{VfolW1<0IQF3t)^)XS2*|{uJ>vDgDXGnrA3zP$MG0iY<5R+DMp+$7)LX1 zdWed!XA92eQ5P)=MqJ8R?`@)4R2uOW^?rSMH@#N5V&9)v^qN~#Dcs3?={)OGHSW>G9?y|x z8W;5IVlH4epnABMWhd}8-rld4a(XAecAIo+wsXsH?tmeUF9o&FmaEJ1Of(>r)d%)x z;WRxGuoqwahFwk^)5?2`tz{!lz8k!c#z*HYUlzE(zs44k*K^%)$*4piXh;1wFY%#3u&)a4d% zf_npBkRpD0gdWfn0Xuyh=<}4LG^%C#IX$L_M_65Za=|b+br`=2l-hpc=c3Gkd9L zAMfx5_5cU%B9400bR4Gc5MLeda=FGz2|lT$!F5)4iGysimj=E)O8vrCEA?@v-?P6S zKA%i8nAeSVx4 zaEVvIt<6L;t-0=`C38=1rwR&S7F8iOJ5ho>k9Pip|SxV7O? fn{FYXy*uiqkf=?R!72zfIuri}HoNmp3JU-L{=he| diff --git a/pandas/io/tests/data/pickle_compression/data.pickle.xz b/pandas/io/tests/data/pickle_compression/data.pickle.xz deleted file mode 100644 index 3e6dacf0f8432de70939e055f0596f860f1a547a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2076 zcmV+%2;=wtH+ooF0004LBHlIv03iVw0000-SUN-C3ke6`T>wA;hy%+I>67OR zHPj)wP~TxY+*N)wtLstG{$1^gEAj_qDGka-tK88&Jko=Z))U`;Ph1dC)m9kA1ju`V zh(m$C{#)JvzR3QUVGQG`DswaJ`daFjUjNyka}U7lq?52^h`Tw!*Tw<_M2>^dN zy9W*Nbym%EQ~$p}@u<$Z#hZK*Mp*O?7qjC>!VX@p6fD|AhG~NVSb)AWhrhVZiFPSjMQP63 z!=_$ssADR?56f<61nxQ@B30`&j6;8vm=Hqk_Ije_6TbK9VrUs<&!QuH8Z^@A|kvSyRjpD>`#VO zqhcn1mxf?Y9!Sz1`*KQB8Jcf&ZYjo-E162=Eqo(AU|^l>u^eQqQ)Y-C-G>lg@JDNt zMtqr8haq5^LY)KZ1v52qhKo=)p!nd#e(^$N# zz_4^o7gTlLHo%cYzojV-ZXuS$4Qx*E-=a@`-O!qfH4LyE<-ufI$znfa^~LsdLrkcj z36(mr1TtHAfUnvhOzt)%yMHj;)IPzI&uf0BQi&uSfWF!>UG6YAwy6~I3ir`G#sx9q zz_Z{o!`%^T?Hip;1(n{rnJ{o5Wj%wp{=EnR)t1 zW(GLUmf>qPX?|BBjL&qL__@gDW&RO%OyOL1KSkyx64dJ^BVxh-l5q+ywJVCq za;=!!2D@|j)whQgxk=?h?M?pb8q`ut8jTv)*k+DVq-adZQgh@^qx+uscDmb`%ChWWbyzlOG1MUJp0DKj`KF`-O7>eb&bn(N01lzIxCWA~tKlt&Cm4 zr^`;?P^pE4jsF$IPZKXj1R=gEh|~gA&WMV8=M;)7Gjh-m-iq<@aYW*7=?F~v>`E!CE z;hysUl<*M&D_*$U;nIBJ{YB?#+D70rR8A+85L>cD|Jf)P0xz`>p_`F%PDc7;*a_mM 
z-5)z2Z}ai*xH)4EXZegJfI{q5nuFdE52Rg~>Z~}NHlvw1uq}Azwi|W5toUwB0*(GB z|1Ku9g7CCbS2}V5eJfAZ)fUj6s)YXf@?_})i;_?y$WIH?@fyD+kOA$ z)+tG0W2#b~P}^{Yox%Tahp*VCMWssWFO^buuFc z2R%13TDqXRU8`EbpIOb2b_I1Y3e&CT#m&H=694i}mR!^9^Tbm&e!j5auJzkU^*>}^ z#|CG1*SCzHwx@yLgX3Wx7|UtrAcCY=s;M}*Lkbd0OJ0{D{JI|e^@pwk0Tt+9n$7%o zcHKFYhfRV^hWD{^m|l9!n#m)VQYN)Kyftv}qX-ff=T=psW+6%Kz}zrjcVjc)#Oj$o zA*Lw(hf-<3M6+z>7|hYvHduo`+Qo9mZ;okx>5~`PwDt+K%_!vN`8UJcxw>-`>7>4; zi4d#uKKK+}-si)<Z*>qf^zY zOqdtRNa9`{?;mixG!8sqBOKO`;{Px0z6~xa^7H=F_9+rR2nJbfrIjO?H<$ar_h(|p z9urd(H1LL{_lR5JFlCr7gTi&R9!oZobd|x=hnbxO4^*5kc)immi0LhA8Y$&rbrM?Z z%Xt`Yz$AP%0`A)`b#ga5Yo_yTjvM8+H~;`PyYo!|0rwAz761S+iKV{2?>ZB}_# z6IT=m6ok;)s;#XXb*VL0MZzMWtt0yq113O7RE(2kU;-iJCSkK$6$KR+WN|}Q5fwq9 zARvOwIrn~pexdfxgr3u$)BKTn^LuxD_x|pido9Z47kzFqnCNNUiXkyy zTS3IZ>kxSbqV%@th=>S>n|EN{e^flakQDHV-k>hq+u*7*i9WGL3g}=xYewsk&5VG!~aUGZN^@KIK6E>d)J)nT)xJ7e{0xxeWN$z5d#6S zRp$sut_EK)6lipXbPykw(y>)-7rrn?w8NJ!hpWLYxq>>$#|u8*Mq4BscEVS-7=C)i zkjH0$uRCCq1vcM;Ef)AjZ;UjqZ-;N!47x+De%9M+i(>iG4EW9%VT`;5-&7hd6k1G$<#0p;Dl%nzizl+kYSdR^G|1|)tcd`28hPe z!qOUxw=NZuG$7j`7uJ*=OAUCypaCgT3K;8h(jgg=HNbT&*ea#itx0Z3(SXAA;3a=$ zX?{ilq-ucqOt3n|kdf&rg)|MwDsL78{*pSA2aah#u`gI$lU`IBYJhYNs0~yYo2<@i z8$gBzfU7R2$=YDD=0axJMp8%=-5&SZA~+uQk|P%Q13|O77*1$Fi{DaUlFn8*o8T7> zFyx&{YYr5pr5D4m;e6EE>n+CS>cT=esR6#!I?0#k%gC{S5Kf^p*XBzXlLIMmN(1Us z3_g2V!&ze`WNAQJMXR|Yl#-uR3)vb_39jbm`qJ_yKjdgYz>}QnNwcSCS3+($>p_#% zmG3tAJ&>mX){tD|43&vFBIJid&$JiRTWxh`vY&MIs?pM7F#S(_797Hf%5;dI3ffU&O7o(+n;eap z;5F=|KKw)-^a-a>*-9UI=RLm1CAx_Ub=J|8-k7c$$A{jgO9vBZQfQzlYWw=vXu8g5 zH~oRjym~?ky%sjo2zH&jbe^VBxr)zem~PP!trJ9^9ES}J^l_tV4yPj1cd^N~b*FGhWUC)d_=cXM!jA9Z zfW&`~=>yAs#5*)k9hKC}0bx6?#+|M-tIIi@#2d%HIzS@=j^m4yv{*&=aYe9lfRnU> zqclQeY+wMFkKzPf82P22 zI=9nJS`sRemwW9eX&e?64lfi%f;m& z=eO*Q$4Rxi9D`Cx9flojBX6_+szpz?1ycau;@Ed*Xov-de0aUai_d6Ipbi|M0bY*d z6fFyf@oDnuwDr`EN)mmt($hUv^jM`7J*S18P2bZ1tB-Q6rfCjWIQSl}_i6ltD?ja} zMV9Qx@fcccc1Lk3Mw~PlM>B4Eh>Eah3(n?I7cB}#T*_GQZK7FJ8u1nNetmg2y;iwm 
z=B5}>`OlTfIDo4(kx45wB(T?oG|P$I>p0KjOqk(1nq!|mf*f@bXU^|eyTyCXyJJHf zDQu??Z{aG74MnNUPzizae(c00>=*XoL!2#6D5P<9=Um=7?Bav>3Y%(cu10-9A6b1^ zqPLD9K2Z64QIB&aG+0WHP;tc4T^|SCP74AX{mjid5fRM?R%)BJ713}E-C^wx>g7%w z#bFNcl@NVuFLuW>#h&oiJ6b(SH?~JG)oA8IA$CX?)EvZC;WNCBw~w(rO*vG&c+$Ai>3rKEz*d-_9(3g_Ve`qP&_A*X&TbkQrjuMRz`&koVT;} z863n8RN&{}6&k$EjC9G=RpMraEMPZE^HO>O@W&x*cFY{(X^Y>vCKWmm%<>m#nM8ydiX>R;ToIj zrt2zovq*{NR$n=w8oP`$d#Pt1@9+io00-?Nj(XH|9H#CNUmfpqxyDKfKB=U^byjwX zgKV;w2EIK?{lZo&^>L=(v%elbpH=3$r_utp5ZIk!6`TzZ9q0ZvbevbeAypQ*ur?Gq zw4vjoUfv}q$Un&H_#&=-ew-F?iC4g_%~eT>hSWp@NLQd*uK`YVQVc>(706e>6ULfE zZ=)21TDjB~tA0&u0uGi6LY>Rua5j3pJeoNix}ewN;`z(VAFTE2C#&89zD7|SnZy=W zZ;)degCX#Trzi(cYXG*mwc$~lZXuw(JL;v7s7;i?DhM??6aNNKO928u02BZK00;me zXPrqlyYo$d2mk;I3jhEM03-ka0000003ZMW000000AyiwVJ>iKV{2?>3IHGg00000 z0RR{P`NQG+k73dQ`NQG+k73dQ`NQG+k73dQP)h{{000000RRC2T>t<8s0aW60023q BQ;z@u diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 0d4f848f0ea77..d667c2e50bd74 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -324,23 +324,6 @@ def test_compression_infer(self): for ext in extensions: yield self.compression_infer, ext - def decompression_prepared_data(self, ext): - if ext == '.xz': - tm._skip_if_no_lzma() - pickle_path = os.path.join(tm.get_data_path(), - 'pickle_compression', - 'data.pickle') - compressed_path = pickle_path + ext - data1 = pd.read_pickle(pickle_path) - data2 = pd.read_pickle(compressed_path) - tm.assert_frame_equal(data1, data2) - - def test_decompression_prepared_data(self): - extensions = ['.gz', '.bz2', '.xz', '.zip'] - for ext in extensions: - yield self.decompression_prepared_data, ext - - if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], # '--with-coverage', '--cover-package=pandas.core'], diff --git a/setup.py b/setup.py index 09c83e060dc4d..0a84cf527bfb1 100755 --- a/setup.py +++ b/setup.py @@ -660,7 +660,6 @@ def pxd(name): package_data={'pandas.io': 
['tests/data/legacy_hdf/*.h5', 'tests/data/legacy_pickle/*/*.pickle', 'tests/data/legacy_msgpack/*/*.msgpack', - 'tests/data/pickle_compression/*', 'tests/data/*.csv*', 'tests/data/*.dta', 'tests/data/*.pickle', From ccbeaa9caa12dc5b1d4061e041ab369d590f5e40 Mon Sep 17 00:00:00 2001 From: goldenbull Date: Wed, 4 Jan 2017 13:25:29 +0800 Subject: [PATCH 6/9] move pickle compression tests into a new class --- pandas/io/tests/test_pickle.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index d667c2e50bd74..5951a1e85fb8e 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -284,6 +284,12 @@ def test_pickle_v0_15_2(self): # tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) + +class TestPickleCompression: + + def setUp(self): + self.path = u('__%s__.pickle' % tm.rands(10)) + def compression_explicit(self, compression): # issue 11666 if compression == 'xz': From 86afd256bfbc42defbaf706833d8df33382133f1 Mon Sep 17 00:00:00 2001 From: goldenbull Date: Mon, 6 Mar 2017 13:00:15 +0800 Subject: [PATCH 7/9] change test to new pytest parameterized style --- pandas/tests/io/test_pickle.py | 74 +++++++++++++++------------------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 009a283280260..d67ca7ce8960b 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -304,47 +304,39 @@ def test_pickle_v0_15_2(): tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path)) -class TestPickleCompression(object): - - def setup_class(self): - self.path = u'__%s__.pickle' % tm.rands(10) - - def compression_explicit(self, compression): - # issue 11666 - if compression == 'xz': - tm._skip_if_no_lzma() - with tm.ensure_clean(self.path) as path: +# --------------------- +# test pickle compression +# --------------------- +def get_random_path(): + return u'__%s__.pickle' % 
tm.rands(10) + + +@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz']) +def test_compression_explicit(compression): + # issue 11666 + if compression == 'xz': + tm._skip_if_no_lzma() + with tm.ensure_clean(get_random_path()) as path: + df = tm.makeDataFrame() + df.to_pickle(path, compression=compression) + df2 = pd.read_pickle(path, compression=compression) + tm.assert_frame_equal(df, df2) + + +@pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z']) +def test_compression_explicit_bad(compression): + with tm.assertRaisesRegexp(ValueError, + "Unrecognized compression type"): + with tm.ensure_clean(get_random_path()) as path: df = tm.makeDataFrame() df.to_pickle(path, compression=compression) - df2 = pd.read_pickle(path, compression=compression) - tm.assert_frame_equal(df, df2) - - def test_compression_explicit(self): - compressions = [None, 'gzip', 'bz2', 'xz'] - for c in compressions: - yield self.compression_explicit, c - - def compression_explicit_bad(self, compression): - with tm.assertRaisesRegexp(ValueError, - "Unrecognized compression type"): - with tm.ensure_clean(self.path) as path: - df = tm.makeDataFrame() - df.to_pickle(path, compression=compression) - - def test_compression_explicit_bad(self): - compressions = ['', 'None', 'bad', '7z'] - for c in compressions: - yield self.compression_explicit_bad, c - - def compression_infer(self, ext): - if ext == '.xz': - tm._skip_if_no_lzma() - with tm.ensure_clean(self.path + ext) as path: - df = tm.makeDataFrame() - df.to_pickle(path) - tm.assert_frame_equal(df, pd.read_pickle(path)) - def test_compression_infer(self): - extensions = ['', '.gz', '.bz2', '.xz', '.no_compress'] - for ext in extensions: - yield self.compression_infer, ext + +@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress']) +def test_compression_infer(ext): + if ext == '.xz': + tm._skip_if_no_lzma() + with tm.ensure_clean(get_random_path() + ext) as path: + df = tm.makeDataFrame() + 
df.to_pickle(path) + tm.assert_frame_equal(df, pd.read_pickle(path)) From d50e430b0b13b0d7a67ee819cf62049435456f02 Mon Sep 17 00:00:00 2001 From: goldenbull Date: Wed, 8 Mar 2017 17:27:13 +0800 Subject: [PATCH 8/9] update docs. re-write all tests to avoid round-trip read/write comparison. --- doc/source/io.rst | 32 ++++++ doc/source/whatsnew/v0.20.0.txt | 28 +++++ pandas/tests/io/test_pickle.py | 194 ++++++++++++++++++++++++++++++-- 3 files changed, 243 insertions(+), 11 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index b36ae8c2ed450..d54ff5c114bb1 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2908,6 +2908,38 @@ any pickled pandas object (or any other pickled object) from file: import os os.remove('foo.pkl') +The ``to_pickle`` and ``read_pickle`` methods can read and write compressed pickle files. +For ``read_pickle`` method, ``compression`` parameter can be one of +{``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``. +If 'infer', then use gzip, bz2, zip, or xz if filename ends in '.gz', '.bz2', '.zip', or +'.xz', respectively. If using 'zip', the ZIP file must contain only one data file to be +read in. Set to ``None`` for no decompression. +``to_pickle`` works in a similar way, except that 'zip' format is not supported. If the +filename ends with '.zip', an exception will be raised. + + .. versionadded:: 0.20.0 + +.. ipython:: python + + df = pd.DataFrame({ + 'A': np.random.randn(1000), + 'B': np.random.randn(1000), + 'C': np.random.randn(1000)}) + df.to_pickle("data.pkl.xz") + df.to_pickle("data.pkl.compress", compression="gzip") + df["A"].to_pickle("s1.pkl.bz2") + + df = pd.read_pickle("data.pkl.xz") + df = pd.read_pickle("data.pkl.compress", compression="gzip") + s = pd.read_pickle("s1.pkl.bz2") + +.. ipython:: python + :suppress: + import os + os.remove("data.pkl.xz") + os.remove("data.pkl.compress") + os.remove("s1.pkl.bz2") + .. 
warning:: Loading pickled data received from untrusted sources can be unsafe. diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 54df7514a882d..05e9b77798896 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -99,6 +99,34 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). .. _whatsnew_0200.enhancements.uint64_support: +Pickle file I/O now supports compression +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +``read_pickle`` and ``to_pickle`` can now read from and write to compressed +pickle files. Compression methods can be explicit parameter or be inferred +from file extension. + +.. ipython:: python + + df = pd.DataFrame({ + 'A': np.random.randn(1000), + 'B': np.random.randn(1000), + 'C': np.random.randn(1000)}) + df.to_pickle("data.pkl.xz") + df.to_pickle("data.pkl.compress", compression="gzip") + df["A"].to_pickle("s1.pkl.bz2") + + df = pd.read_pickle("data.pkl.xz") + df = pd.read_pickle("data.pkl.compress", compression="gzip") + s = pd.read_pickle("s1.pkl.bz2") + +.. 
ipython:: python + :suppress: + import os + os.remove("data.pkl.xz") + os.remove("data.pkl.compress") + os.remove("s1.pkl.bz2") + UInt64 Support Improved ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index d67ca7ce8960b..8e7ac8cb02274 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -15,15 +15,15 @@ import pytest import os - from distutils.version import LooseVersion - import pandas as pd +import numpy as np from pandas import Index from pandas.compat import is_platform_little_endian import pandas import pandas.util.testing as tm from pandas.tseries.offsets import Day, MonthEnd +import shutil @pytest.fixture(scope='module') @@ -307,24 +307,101 @@ def test_pickle_v0_15_2(): # --------------------- # test pickle compression # --------------------- +_compression_to_extension = { + None: ".none", + 'gzip': '.gz', + 'bz2': '.bz2', + 'zip': '.zip', + 'xz': '.xz', +} + + def get_random_path(): return u'__%s__.pickle' % tm.rands(10) +def compress_file(src_path, dest_path, compression): + if compression is None: + shutil.copyfile(src_path, dest_path) + return + + if compression == 'gzip': + import gzip + f = gzip.open(dest_path, "w") + elif compression == 'bz2': + import bz2 + f = bz2.BZ2File(dest_path, "w") + elif compression == 'zip': + import zipfile + zip_file = zipfile.ZipFile(dest_path, "w", + compression=zipfile.ZIP_DEFLATED) + zip_file.write(src_path, os.path.basename(src_path)) + elif compression == 'xz': + lzma = pandas.compat.import_lzma() + f = lzma.LZMAFile(dest_path, "w") + else: + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) + + if compression != "zip": + f.write(open(src_path, "rb").read()) + f.close() + + +def decompress_file(src_path, dest_path, compression): + if compression is None: + shutil.copyfile(src_path, dest_path) + return + + if compression == 'gzip': + import gzip + f = gzip.open(src_path, "r") + elif compression == 
'bz2': + import bz2 + f = bz2.BZ2File(src_path, "r") + elif compression == 'zip': + import zipfile + zip_file = zipfile.ZipFile(src_path) + zip_names = zip_file.namelist() + if len(zip_names) == 1: + f = zip_file.open(zip_names.pop()) + else: + raise ValueError('ZIP file {} error. Only one file per ZIP.' + .format(src_path)) + elif compression == 'xz': + lzma = pandas.compat.import_lzma() + f = lzma.LZMAFile(src_path, "r") + else: + msg = 'Unrecognized compression type: {}'.format(compression) + raise ValueError(msg) + + open(dest_path, "wb").write(f.read()) + f.close() + + @pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz']) -def test_compression_explicit(compression): +def test_write_explicit(compression): # issue 11666 if compression == 'xz': tm._skip_if_no_lzma() - with tm.ensure_clean(get_random_path()) as path: + + base = get_random_path() + path1 = base + ".compressed" + path2 = base + ".raw" + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() - df.to_pickle(path, compression=compression) - df2 = pd.read_pickle(path, compression=compression) + # write to compressed file + df.to_pickle(p1, compression=compression) + # decompress + decompress_file(p1, p2, compression=compression) + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) tm.assert_frame_equal(df, df2) @pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z']) -def test_compression_explicit_bad(compression): +def test_write_explicit_bad(compression): with tm.assertRaisesRegexp(ValueError, "Unrecognized compression type"): with tm.ensure_clean(get_random_path()) as path: @@ -333,10 +410,105 @@ def test_compression_explicit_bad(compression): @pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress']) -def test_compression_infer(ext): +def test_write_infer(ext): if ext == '.xz': tm._skip_if_no_lzma() - with tm.ensure_clean(get_random_path() + ext) as path: + + base = get_random_path() + path1 = base + ext 
+ path2 = base + ".raw" + compression = None + for c in _compression_to_extension: + if _compression_to_extension[c] == ext: + compression = c + break + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() - df.to_pickle(path) - tm.assert_frame_equal(df, pd.read_pickle(path)) + # write to compressed file by inferred compression method + df.to_pickle(p1) + # decompress + decompress_file(p1, p2, compression=compression) + # read decompressed file + df2 = pd.read_pickle(p2, compression=None) + tm.assert_frame_equal(df, df2) + + +@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz', "zip"]) +def test_read_explicit(compression): + # issue 11666 + if compression == 'xz': + tm._skip_if_no_lzma() + + base = get_random_path() + path1 = base + ".raw" + path2 = base + ".compressed" + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + # write to uncompressed file + df.to_pickle(p1, compression=None) + # compress + compress_file(p1, p2, compression=compression) + # read compressed file + df2 = pd.read_pickle(p2, compression=compression) + tm.assert_frame_equal(df, df2) + + +@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.zip', + '.no_compress']) +def test_read_infer(ext): + if ext == '.xz': + tm._skip_if_no_lzma() + + base = get_random_path() + path1 = base + ".raw" + path2 = base + ext + compression = None + for c in _compression_to_extension: + if _compression_to_extension[c] == ext: + compression = c + break + + with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: + df = tm.makeDataFrame() + # write to uncompressed file + df.to_pickle(p1, compression=None) + # compress + compress_file(p1, p2, compression=compression) + # read compressed file by inferred compression method + df2 = pd.read_pickle(p2) + tm.assert_frame_equal(df, df2) + + + + + + + + + + + + + + + + +def notest_zip(): + df = pd.DataFrame({ + 'A': np.random.randn(100).repeat(10), + 'B': 
np.random.randn(100).repeat(10), + 'C': np.random.randn(100).repeat(10)}) + os.chdir("d:\\test") + + df.to_pickle("data.raw") + compress_file("data.raw", "data.zip", "zip") + compress_file("data.raw", "data.xz", "xz") + compress_file("data.raw", "data.bz2", "bz2") + compress_file("data.raw", "data.gz", "gzip") + + decompress_file("data.zip", "data.zip.raw", "zip") + decompress_file("data.xz", "data.xz.raw", "xz") + decompress_file("data.bz2", "data.bz2.raw", "bz2") + decompress_file("data.gz", "data.gz.raw", "gzip") From e9c5fd251331107f61d8e53119874151dd28f8f2 Mon Sep 17 00:00:00 2001 From: goldenbull Date: Thu, 9 Mar 2017 11:34:24 +0800 Subject: [PATCH 9/9] docs update --- doc/source/io.rst | 69 ++++++++++++++++++--------------- doc/source/whatsnew/v0.20.0.txt | 22 +++++++---- pandas/tests/io/test_pickle.py | 50 ++++++++---------------- 3 files changed, 68 insertions(+), 73 deletions(-) diff --git a/doc/source/io.rst b/doc/source/io.rst index d54ff5c114bb1..1b19599177c9a 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -2908,16 +2908,38 @@ any pickled pandas object (or any other pickled object) from file: import os os.remove('foo.pkl') -The ``to_pickle`` and ``read_pickle`` methods can read and write compressed pickle files. -For ``read_pickle`` method, ``compression`` parameter can be one of -{``'infer'``, ``'gzip'``, ``'bz2'``, ``'zip'``, ``'xz'``, ``None``}, default ``'infer'``. -If 'infer', then use gzip, bz2, zip, or xz if filename ends in '.gz', '.bz2', '.zip', or -'.xz', respectively. If using 'zip', the ZIP file must contain only one data file to be -read in. Set to ``None`` for no decompression. -``to_pickle`` works in a similar way, except that 'zip' format is not supported. If the -filename ends with '.zip', an exception will be raised. +.. warning:: + + Loading pickled data received from untrusted sources can be unsafe. + + See: http://docs.python.org/2.7/library/pickle.html + +.. 
warning:: + + Several internal refactorings, 0.13 (:ref:`Series Refactoring `), and 0.15 (:ref:`Index Refactoring `), + preserve compatibility with pickles created prior to these versions. However, these must + be read with ``pd.read_pickle``, rather than the default python ``pickle.load``. + See `this question `__ + for a detailed explanation. + +.. note:: + + These methods were previously ``pd.save`` and ``pd.load``, prior to 0.12.0, and are now deprecated. + +.. _io.pickle.compression: + +Read/Write compressed pickle files +'''''''''''''''''''''''''''''''''' + +.. versionadded:: 0.20.0 - .. versionadded:: 0.20.0 +:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` can read +and write compressed pickle files. Compression types of ``gzip``, ``bz2``, ``xz`` support +both reading and writing. The ``zip`` file format only supports reading and must contain only one data file +to be read in. +The compression type can be an explicit parameter or be inferred from the file extension. +If 'infer', then use ``gzip``, ``bz2``, ``zip``, or ``xz`` if filename ends in ``'.gz'``, ``'.bz2'``, ``'.zip'``, or +``'.xz'``, respectively. .. ipython:: python @@ -2925,39 +2947,24 @@ filename ends with '.zip', an exception will be raised. 'A': np.random.randn(1000), 'B': np.random.randn(1000), 'C': np.random.randn(1000)}) - df.to_pickle("data.pkl.xz") - df.to_pickle("data.pkl.compress", compression="gzip") + df.to_pickle("data.pkl.compress", compression="gzip") # explicit compression type + df.to_pickle("data.pkl.xz", compression="infer") # infer compression type from extension + df.to_pickle("data.pkl.gz") # default, using "infer" df["A"].to_pickle("s1.pkl.bz2") - df = pd.read_pickle("data.pkl.xz") df = pd.read_pickle("data.pkl.compress", compression="gzip") + df = pd.read_pickle("data.pkl.xz", compression="infer") + df = pd.read_pickle("data.pkl.gz") s = pd.read_pickle("s1.pkl.bz2") .. 
ipython:: python :suppress: import os - os.remove("data.pkl.xz") os.remove("data.pkl.compress") + os.remove("data.pkl.xz") + os.remove("data.pkl.gz") os.remove("s1.pkl.bz2") -.. warning:: - - Loading pickled data received from untrusted sources can be unsafe. - - See: http://docs.python.org/2.7/library/pickle.html - -.. warning:: - - Several internal refactorings, 0.13 (:ref:`Series Refactoring `), and 0.15 (:ref:`Index Refactoring `), - preserve compatibility with pickles created prior to these versions. However, these must - be read with ``pd.read_pickle``, rather than the default python ``pickle.load``. - See `this question `__ - for a detailed explanation. - -.. note:: - - These methods were previously ``pd.save`` and ``pd.load``, prior to 0.12.0, and are now deprecated. - .. _io.msgpack: msgpack diff --git a/doc/source/whatsnew/v0.20.0.txt b/doc/source/whatsnew/v0.20.0.txt index 05e9b77798896..d5c438e8c08d1 100644 --- a/doc/source/whatsnew/v0.20.0.txt +++ b/doc/source/whatsnew/v0.20.0.txt @@ -97,14 +97,15 @@ support for bz2 compression in the python 2 c-engine improved (:issue:`14874`). df = pd.read_table(url, compression='bz2') # explicitly specify compression df.head(2) -.. _whatsnew_0200.enhancements.uint64_support: +.. _whatsnew_0200.enhancements.pickle_compression: Pickle file I/O now supports compression ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -``read_pickle`` and ``to_pickle`` can now read from and write to compressed -pickle files. Compression methods can be explicit parameter or be inferred -from file extension. +:func:`read_pickle`, :meth:`DataFrame.to_pickle` and :meth:`Series.to_pickle` +can now read from and write to compressed pickle files. Compression methods +can be an explicit parameter or be inferred from the file extension. +See :ref:`Read/Write compressed pickle files <io.pickle.compression>`. .. ipython:: python @@ -112,21 +113,26 @@ from file extension. 
'A': np.random.randn(1000), 'B': np.random.randn(1000), 'C': np.random.randn(1000)}) - df.to_pickle("data.pkl.xz") - df.to_pickle("data.pkl.compress", compression="gzip") + df.to_pickle("data.pkl.compress", compression="gzip") # explicit compression type + df.to_pickle("data.pkl.xz", compression="infer") # infer compression type from extension + df.to_pickle("data.pkl.gz") # default, using "infer" df["A"].to_pickle("s1.pkl.bz2") - df = pd.read_pickle("data.pkl.xz") df = pd.read_pickle("data.pkl.compress", compression="gzip") + df = pd.read_pickle("data.pkl.xz", compression="infer") + df = pd.read_pickle("data.pkl.gz") s = pd.read_pickle("s1.pkl.bz2") .. ipython:: python :suppress: import os - os.remove("data.pkl.xz") os.remove("data.pkl.compress") + os.remove("data.pkl.xz") + os.remove("data.pkl.gz") os.remove("s1.pkl.bz2") +.. _whatsnew_0200.enhancements.uint64_support: + UInt64 Support Improved ^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/tests/io/test_pickle.py b/pandas/tests/io/test_pickle.py index 8e7ac8cb02274..2fffc3c39ec26 100644 --- a/pandas/tests/io/test_pickle.py +++ b/pandas/tests/io/test_pickle.py @@ -17,7 +17,6 @@ import os from distutils.version import LooseVersion import pandas as pd -import numpy as np from pandas import Index from pandas.compat import is_platform_little_endian import pandas @@ -391,12 +390,16 @@ def test_write_explicit(compression): with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() + # write to compressed file df.to_pickle(p1, compression=compression) + # decompress decompress_file(p1, p2, compression=compression) + # read decompressed file df2 = pd.read_pickle(p2, compression=None) + tm.assert_frame_equal(df, df2) @@ -425,12 +428,16 @@ def test_write_infer(ext): with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() + # write to compressed file by inferred compression method df.to_pickle(p1) + # decompress decompress_file(p1, p2, compression=compression) + # 
read decompressed file df2 = pd.read_pickle(p2, compression=None) + tm.assert_frame_equal(df, df2) @@ -446,12 +453,16 @@ def test_read_explicit(compression): with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() + # write to uncompressed file df.to_pickle(p1, compression=None) + # compress compress_file(p1, p2, compression=compression) + # read compressed file df2 = pd.read_pickle(p2, compression=compression) + tm.assert_frame_equal(df, df2) @@ -472,43 +483,14 @@ def test_read_infer(ext): with tm.ensure_clean(path1) as p1, tm.ensure_clean(path2) as p2: df = tm.makeDataFrame() + # write to uncompressed file df.to_pickle(p1, compression=None) + # compress compress_file(p1, p2, compression=compression) + # read compressed file by inferred compression method df2 = pd.read_pickle(p2) - tm.assert_frame_equal(df, df2) - - - - - - - - - - - - - - - -def notest_zip(): - df = pd.DataFrame({ - 'A': np.random.randn(100).repeat(10), - 'B': np.random.randn(100).repeat(10), - 'C': np.random.randn(100).repeat(10)}) - os.chdir("d:\\test") - - df.to_pickle("data.raw") - compress_file("data.raw", "data.zip", "zip") - compress_file("data.raw", "data.xz", "xz") - compress_file("data.raw", "data.bz2", "bz2") - compress_file("data.raw", "data.gz", "gzip") - - decompress_file("data.zip", "data.zip.raw", "zip") - decompress_file("data.xz", "data.xz.raw", "xz") - decompress_file("data.bz2", "data.bz2.raw", "bz2") - decompress_file("data.gz", "data.gz.raw", "gzip") + tm.assert_frame_equal(df, df2)