diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
index 5ccf829fd5a42..bd9707381190a 100644
--- a/doc/source/whatsnew/v0.18.0.txt
+++ b/doc/source/whatsnew/v0.18.0.txt
@@ -31,7 +31,7 @@ New features
 
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
-
+- ``read_pickle`` can now unpickle from compressed files (:issue:`11666`).
 
 
 
diff --git a/pandas/io/common.py b/pandas/io/common.py
index e46f609077810..bf5c76950fe22 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -237,6 +237,38 @@ def _stringify_path(filepath_or_buffer):
     return filepath_or_buffer
 
 
+def get_compression_type(filepath_or_buffer, compression_kwd):
+    """
+    Determine the compression type of a file or buffer.
+
+    Parameters
+    ----------
+    filepath_or_buffer : string or file-like
+        File path or buffer
+    compression_kwd : {'gzip', 'bz2', 'infer', None}
+        Requested compression. An explicit value is returned unchanged;
+        'infer' looks for the file extensions .gz and .bz2, using gzip
+        and bz2 to decompress respectively.
+
+    Returns
+    -------
+    compression : the explicit `compression_kwd`, or 'gzip' / 'bz2' / None
+        None when 'infer' is requested and the input is not a path
+        ending in a recognized extension.
+    """
+    # An explicit request always wins; there is nothing to infer.
+    if compression_kwd != 'infer':
+        return compression_kwd
+
+    # Only a path string carries an extension to inspect; buffers don't.
+    if isinstance(filepath_or_buffer, compat.string_types):
+        if filepath_or_buffer.endswith('.gz'):
+            return 'gzip'
+        elif filepath_or_buffer.endswith('.bz2'):
+            return 'bz2'
+    return None
+
+
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                            compression=None):
     """
@@ -467,4 +499,4 @@ def _check_as_is(x):
             # write to the target stream
             self.stream.write(data)
             # empty queue
-            self.queue.truncate(0)
\ No newline at end of file
+            self.queue.truncate(0)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 9d25eaecc6620..354b6d59dc719 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -17,7 +17,8 @@
 from pandas.core.common import AbstractMethodError
 from pandas.core.config import get_option
 from pandas.io.date_converters import generic_parser
-from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
-                              _get_handle, UnicodeReader, UTF8Recoder)
+from pandas.io.common import (get_filepath_or_buffer, get_compression_type,
+                              _validate_header_arg, _get_handle,
+                              UnicodeReader, UTF8Recoder)
 from pandas.tseries import tools
 
@@ -242,25 +243,17 @@ def _read(filepath_or_buffer, kwds):
     if skipfooter is not None:
         kwds['skip_footer'] = skipfooter
 
-    # If the input could be a filename, check for a recognizable compression extension.
-    # If we're reading from a URL, the `get_filepath_or_buffer` will use header info
-    # to determine compression, so use what it finds in that case.
-    inferred_compression = kwds.get('compression')
-    if inferred_compression == 'infer':
-        if isinstance(filepath_or_buffer, compat.string_types):
-            if filepath_or_buffer.endswith('.gz'):
-                inferred_compression = 'gzip'
-            elif filepath_or_buffer.endswith('.bz2'):
-                inferred_compression = 'bz2'
-            else:
-                inferred_compression = None
-        else:
-            inferred_compression = None
+    # Infer from the caller's argument before it is reassigned below.
+    # If we're reading from a URL, get_filepath_or_buffer will use header
+    # info to determine compression, so use what it finds in that case
+    # and fall back to the extension-based guess only for 'infer'.
+    inferred_compression = get_compression_type(filepath_or_buffer,
+                                                kwds.get('compression'))
 
     filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer,
                                                                 encoding,
                                                                 compression=kwds.get('compression', None))
     kwds['compression'] = inferred_compression if compression == 'infer' else compression
 
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index 52a9ef0370e9e..e444ed30ecd81 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -1,4 +1,5 @@
 from pandas.compat import cPickle as pkl, pickle_compat as pc, PY3
+from pandas.io.common import get_compression_type
 
 def to_pickle(obj, path):
     """
@@ -14,7 +15,7 @@ def to_pickle(obj, path):
         pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
 
 
-def read_pickle(path):
+def read_pickle(path, compression='infer'):
     """
     Load pickled pandas object (or any other pickled object) from the specified
     file path
@@ -26,6 +27,9 @@ def read_pickle(path):
     ----------
     path : string
         File path
+    compression : {'gzip', 'bz2', 'infer', None}, default 'infer'
+        Compression type ('infer' looks for the file extensions .gz and
+        .bz2, using gzip and bz2 to decompress respectively).
 
     Returns
     -------
@@ -41,19 +45,34 @@ def try_read(path, encoding=None):
 
         # cpickle
         # GH 6899
+        # A pickle is a byte stream, so open the file in binary mode with
+        # the matching opener; `encoding` only applies to unpickling.
+        _compression = get_compression_type(path, compression)
+        if _compression is None:
+            opener = open
+        elif _compression == 'gzip':
+            import gzip
+            opener = gzip.GzipFile
+        elif _compression == 'bz2':
+            import bz2
+            opener = bz2.BZ2File
+        else:
+            raise ValueError('Unrecognized compression type: %s' %
+                             _compression)
+
         try:
-            with open(path, 'rb') as fh:
+            with opener(path, 'rb') as fh:
                 return pkl.load(fh)
         except (Exception) as e:
 
             # reg/patched pickle
             try:
-                with open(path, 'rb') as fh:
+                with opener(path, 'rb') as fh:
                     return pc.load(fh, encoding=encoding, compat=False)
 
             # compat pickle
             except:
-                with open(path, 'rb') as fh:
+                with opener(path, 'rb') as fh:
                     return pc.load(fh, encoding=encoding, compat=True)
 
     try: