add compression support for 'read_pickle' and 'to_pickle' #13317
```diff
@@ -304,47 +304,39 @@ def test_pickle_v0_15_2():
     tm.assert_categorical_equal(cat, pd.read_pickle(pickle_path))
 
 
-class TestPickleCompression(object):
-
-    def setup_class(self):
-        self.path = u'__%s__.pickle' % tm.rands(10)
-
-    def compression_explicit(self, compression):
-        # issue 11666
-        if compression == 'xz':
-            tm._skip_if_no_lzma()
-        with tm.ensure_clean(self.path) as path:
-            df = tm.makeDataFrame()
-            df.to_pickle(path, compression=compression)
-            df2 = pd.read_pickle(path, compression=compression)
-            tm.assert_frame_equal(df, df2)
-
-    def test_compression_explicit(self):
-        compressions = [None, 'gzip', 'bz2', 'xz']
-        for c in compressions:
-            yield self.compression_explicit, c
-
-    def compression_explicit_bad(self, compression):
-        with tm.assertRaisesRegexp(ValueError,
-                                   "Unrecognized compression type"):
-            with tm.ensure_clean(self.path) as path:
-                df = tm.makeDataFrame()
-                df.to_pickle(path, compression=compression)
-
-    def test_compression_explicit_bad(self):
-        compressions = ['', 'None', 'bad', '7z']
-        for c in compressions:
-            yield self.compression_explicit_bad, c
-
-    def compression_infer(self, ext):
-        if ext == '.xz':
-            tm._skip_if_no_lzma()
-        with tm.ensure_clean(self.path + ext) as path:
-            df = tm.makeDataFrame()
-            df.to_pickle(path)
-            tm.assert_frame_equal(df, pd.read_pickle(path))
-
-    def test_compression_infer(self):
-        extensions = ['', '.gz', '.bz2', '.xz', '.no_compress']
-        for ext in extensions:
-            yield self.compression_infer, ext
+# ---------------------
+# test pickle compression
+# ---------------------
+def get_random_path():
+    return u'__%s__.pickle' % tm.rands(10)
+
+
+@pytest.mark.parametrize('compression', [None, 'gzip', 'bz2', 'xz'])
+def test_compression_explicit(compression):
+    # issue 11666
+    if compression == 'xz':
+        tm._skip_if_no_lzma()
+    with tm.ensure_clean(get_random_path()) as path:
+        df = tm.makeDataFrame()
+        df.to_pickle(path, compression=compression)
+        df2 = pd.read_pickle(path, compression=compression)
+        tm.assert_frame_equal(df, df2)
+
+
+@pytest.mark.parametrize('compression', ['', 'None', 'bad', '7z'])
+def test_compression_explicit_bad(compression):
+    with tm.assertRaisesRegexp(ValueError,
+                               "Unrecognized compression type"):
+        with tm.ensure_clean(get_random_path()) as path:
+            df = tm.makeDataFrame()
+            df.to_pickle(path, compression=compression)
+
+
+@pytest.mark.parametrize('ext', ['', '.gz', '.bz2', '.xz', '.no_compress'])
+def test_compression_infer(ext):
+    if ext == '.xz':
+        tm._skip_if_no_lzma()
+    with tm.ensure_clean(get_random_path() + ext) as path:
+        df = tm.makeDataFrame()
+        df.to_pickle(path)
+        tm.assert_frame_equal(df, pd.read_pickle(path))
```
These tests only check that the round trip works, not that inference works. Here's some pseudo-code to test inference more explicitly:

```python
with tm.ensure_clean(get_random_path() + ext) as infer_path, \
        tm.ensure_clean(get_random_path() + ext) as explicit_path:
    df = tm.makeDataFrame()
    df.to_pickle(infer_path)
    df.to_pickle(explicit_path, compression=compression)
    # Implement: assert the two files are equal
    tm.assert_frame_equal(df, pd.read_pickle(explicit_path))
```

maybe should add a

Yes, actually, all write-then-read operations should split into three steps, and then the content read from file2 should be compared with what was written to file1. Seems I need to re-write all the tests.
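A rough, runnable sketch of what those two suggestions amount to, assuming gzip for concreteness; the test name and structure here are illustrative, not the code that ended up in the PR:

```python
import gzip

import pandas as pd
import pandas.util.testing as tm


def test_gzip_inference_roundtrip_sketch():
    # Illustrative sketch only: write once via inference and once explicitly,
    # then verify the inferred file really is gzip by decompressing it with
    # the standard library rather than with pandas.
    df = tm.makeDataFrame()
    with tm.ensure_clean('__infer__.pkl.gz') as infer_path, \
            tm.ensure_clean('__explicit__.pkl.gz') as explicit_path:
        # 1. write: once relying on extension inference, once with explicit gzip
        df.to_pickle(infer_path)
        df.to_pickle(explicit_path, compression='gzip')

        # 2. decompress the inferred file with the stdlib, bypassing pandas
        with gzip.open(infer_path, 'rb') as fh:
            raw = fh.read()

        # 3. the decompressed payload should be a plain pickle of the same
        #    frame, and the explicitly compressed file should round-trip too
        tm.assert_frame_equal(df, pd.read_pickle(explicit_path))
        with tm.ensure_clean('__plain__.pkl') as plain_path:
            with open(plain_path, 'wb') as fh:
                fh.write(raw)
            tm.assert_frame_equal(df, pd.read_pickle(plain_path))
```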
Would it make sense to test a .pkl extension, as this will be the most common (and shouldn't be compressed)?

Is there anything official that says .pkl is a pickle extension? (Though by definition .pkl would NOT be compressed, while .pkl.gz would be, for example.) Also, let's have .gzip treated the same as .gz. I think this is common.
The extension for pickled files is not standardized. The common ones are .pkl, .p, and .pickle. Currently, we don't infer the .gzip extension as gzip compression. I think that's out of scope for this PR. Something to consider for the future. Perhaps we could outsource inference to mimetypes.guess_type's encoding detection.

@dhimmel good point. Can you create an issue?
@jreback, I'm thinking this isn't something that I would advocate for. Here is what mimetypes does:
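A minimal illustration of the behavior being described, assuming the standard library's `mimetypes.guess_type`; exact MIME strings can vary with the local type registry, but the encoding element of the returned tuple is the relevant part:

```python
import mimetypes

# guess_type returns (mime_type, encoding); compression shows up as the encoding
print(mimetypes.guess_type('df.pkl.gz'))    # (None, 'gzip')      -- '.gz' is a known encoding
print(mimetypes.guess_type('df.csv.gz'))    # ('text/csv', 'gzip')
print(mimetypes.guess_type('df.pkl.gzip'))  # (None, None)        -- '.gzip' is not recognized
```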
So if we use mimetypes, the .gzip extension won't be recognized. We could code our compression inference to recognize .gzip and .gz, but I don't think that would be the right decision, since you could then write gzip-compressed files that end in .gzip using inference. I'd rather stick with the extensions recognized by mimetypes. If you'd still like to discuss this further via a dedicated issue, just let me know and I'll post it... with the caveat that I'm not advocating for it 😼.
Sure, specific extensions are prob fine.