From 3785c7ee73c1f2666ff2f61f15c9f4b67ea29cfe Mon Sep 17 00:00:00 2001
From: khs26
Date: Sun, 22 Nov 2015 23:20:11 +0000
Subject: [PATCH 1/5] ENH: Allow compressed (gzip/bz2) files to be unpickled.

---
 pandas/io/parsers.py | 40 ++++++++++++++++++++++++++++++----------
 pandas/io/pickle.py  | 14 ++++++++++----
 2 files changed, 40 insertions(+), 14 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 9d25eaecc6620..de77884964c4e 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -234,18 +234,28 @@ class ParserWarning(Warning):
 fields if it is not spaces (e.g., '~').
 """ % (_parser_params % _fwf_widths)
 
+def get_compression(filepath_or_buffer, encoding, compression_kwd):
+    """
+    Determine the compression type of a file or buffer.
 
-def _read(filepath_or_buffer, kwds):
-    "Generic reader of line files."
-    encoding = kwds.get('encoding', None)
-    skipfooter = kwds.pop('skipfooter', None)
-    if skipfooter is not None:
-        kwds['skip_footer'] = skipfooter
-
+    Parameters
+    ----------
+    filepath_or_buffer : string
+        File path
+    encoding: string
+        Encoding type
+    compression_kwd: {'gzip', 'bz2', 'infer', None}
+        Compression type ('infer' looks for the file extensions .gz and .bz2, using gzip and bz2 to decompress
+        respectively).
+
+    Returns
+    -------
+    compression : {'gzip', 'bz2', None} depending on result
+    """
     # If the input could be a filename, check for a recognizable compression extension.
     # If we're reading from a URL, the `get_filepath_or_buffer` will use header info
     # to determine compression, so use what it finds in that case.
-    inferred_compression = kwds.get('compression')
+    inferred_compression = compression_kwd
     if inferred_compression == 'infer':
         if isinstance(filepath_or_buffer, compat.string_types):
             if filepath_or_buffer.endswith('.gz'):
@@ -259,8 +269,18 @@ def _read(filepath_or_buffer, kwds):
 
     filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer,
                                                                 encoding,
-                                                                compression=kwds.get('compression', None))
-    kwds['compression'] = inferred_compression if compression == 'infer' else compression
+                                                                compression=compression_kwd)
+    return inferred_compression if compression == 'infer' else compression
+
+
+def _read(filepath_or_buffer, kwds):
+    "Generic reader of line files."
+    encoding = kwds.get('encoding', None)
+    skipfooter = kwds.pop('skipfooter', None)
+    if skipfooter is not None:
+        kwds['skip_footer'] = skipfooter
+
+    kwds['compression'] = get_compression(filepath_or_buffer, encoding, kwds['compression'])
 
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):
diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index 52a9ef0370e9e..d40486d1fd77f 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -1,4 +1,6 @@
 from pandas.compat import cPickle as pkl, pickle_compat as pc, PY3
+from pandas.io.common import _get_handle
+from pandas.io.parsers import get_compression
 
 def to_pickle(obj, path):
     """
@@ -14,7 +16,7 @@ def to_pickle(obj, path):
         pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
 
 
-def read_pickle(path):
+def read_pickle(path, compression_arg='infer'):
     """
     Load pickled pandas object (or any other pickled object) from the specified
     file path
@@ -26,6 +28,9 @@ def read_pickle(path):
     ----------
     path : string
         File path
+    compression_arg: {'gzip', 'bz2', 'infer', None}, default 'infer'
+        Compression type, ('infer' looks for the file extensions .gz and .bz2, using gzip and bz2 to decompress
+        respectively).
 
     Returns
     -------
@@ -41,19 +46,20 @@ def try_read(path, encoding=None):
 
         # cpickle
         # GH 6899
+        compression = get_compression(path, encoding, compression_arg)
         try:
-            with open(path, 'rb') as fh:
+            with _get_handle(path, 'rb', encoding, compression) as fh:
                 return pkl.load(fh)
         except (Exception) as e:
 
             # reg/patched pickle
             try:
-                with open(path, 'rb') as fh:
+                with _get_handle(path, 'rb', encoding, compression) as fh:
                     return pc.load(fh, encoding=encoding, compat=False)
 
             # compat pickle
             except:
-                with open(path, 'rb') as fh:
+                with _get_handle(path, 'rb', encoding, compression) as fh:
                     return pc.load(fh, encoding=encoding, compat=True)
 
     try:

From c33bc1d1039cf8d2414d95b9b6fd9a50d483e603 Mon Sep 17 00:00:00 2001
From: khs26
Date: Sun, 22 Nov 2015 23:32:26 +0000
Subject: [PATCH 2/5] Updated whatsnew.

---
 doc/source/whatsnew/v0.18.0.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
index 5ccf829fd5a42..27a5ee1a3e649 100644
--- a/doc/source/whatsnew/v0.18.0.txt
+++ b/doc/source/whatsnew/v0.18.0.txt
@@ -31,7 +31,7 @@ New features
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
-
+- `read_pickle` can now unpickle from compressed files (:issue:``).
 
 
 

From 5743f795edd80d6c53e236b8bde95f2404a12242 Mon Sep 17 00:00:00 2001
From: khs26
Date: Thu, 3 Dec 2015 01:02:29 +0000
Subject: [PATCH 3/5] Moved function to common and made it lighter.

---
 pandas/io/common.py  | 29 ++++++++++++++++++++++++++++-
 pandas/io/parsers.py | 42 ++----------------------------------------
 2 files changed, 30 insertions(+), 41 deletions(-)

diff --git a/pandas/io/common.py b/pandas/io/common.py
index e46f609077810..bf5c76950fe22 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -237,6 +237,33 @@ def _stringify_path(filepath_or_buffer):
     return filepath_or_buffer
 
 
+def get_compression_type(filepath_or_buffer, compression_kwd):
+    """
+    Determine the compression type of a file or buffer.
+
+    Parameters
+    ----------
+    filepath_or_buffer : string
+        File path
+    compression_kwd: {'gzip', 'bz2', 'infer', None}
+        Compression type ('infer' looks for the file extensions .gz and .bz2, using gzip and bz2 to decompress
+        respectively).
+
+    Returns
+    -------
+    compression : {'gzip', 'bz2', None} depending on result
+    """
+    # If the input could be a filename, check for a recognizable compression extension.
+    # If we're reading from a URL, the `get_filepath_or_buffer` will use header info
+    # to determine compression, so use what it finds in that case.
+    inferred_compression = None
+    if compression_kwd == 'infer' and isinstance(filepath_or_buffer, compat.string_types):
+        if filepath_or_buffer.endswith('.gz'):
+            inferred_compression = 'gzip'
+        elif filepath_or_buffer.endswith('.bz2'):
+            inferred_compression = 'bz2'
+    return inferred_compression
+
 def get_filepath_or_buffer(filepath_or_buffer, encoding=None,
                            compression=None):
     """
@@ -467,4 +494,4 @@ def _check_as_is(x):
         # write to the target stream
         self.stream.write(data)
         # empty queue
-        self.queue.truncate(0)
\ No newline at end of file
+        self.queue.truncate(0)
diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index de77884964c4e..354b6d59dc719 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -17,7 +17,7 @@
 from pandas.core.common import AbstractMethodError
 from pandas.core.config import get_option
 from pandas.io.date_converters import generic_parser
-from pandas.io.common import (get_filepath_or_buffer, _validate_header_arg,
+from pandas.io.common import (get_filepath_or_buffer, get_compression_type, _validate_header_arg,
                               _get_handle, UnicodeReader, UTF8Recoder)
 from pandas.tseries import tools
 
@@ -234,44 +234,6 @@ class ParserWarning(Warning):
 fields if it is not spaces (e.g., '~').
 """ % (_parser_params % _fwf_widths)
 
-def get_compression(filepath_or_buffer, encoding, compression_kwd):
-    """
-    Determine the compression type of a file or buffer.
-
-    Parameters
-    ----------
-    filepath_or_buffer : string
-        File path
-    encoding: string
-        Encoding type
-    compression_kwd: {'gzip', 'bz2', 'infer', None}
-        Compression type ('infer' looks for the file extensions .gz and .bz2, using gzip and bz2 to decompress
-        respectively).
-
-    Returns
-    -------
-    compression : {'gzip', 'bz2', None} depending on result
-    """
-    # If the input could be a filename, check for a recognizable compression extension.
-    # If we're reading from a URL, the `get_filepath_or_buffer` will use header info
-    # to determine compression, so use what it finds in that case.
-    inferred_compression = compression_kwd
-    if inferred_compression == 'infer':
-        if isinstance(filepath_or_buffer, compat.string_types):
-            if filepath_or_buffer.endswith('.gz'):
-                inferred_compression = 'gzip'
-            elif filepath_or_buffer.endswith('.bz2'):
-                inferred_compression = 'bz2'
-            else:
-                inferred_compression = None
-        else:
-            inferred_compression = None
-
-    filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer,
-                                                                encoding,
-                                                                compression=compression_kwd)
-    return inferred_compression if compression == 'infer' else compression
-
 
 def _read(filepath_or_buffer, kwds):
     "Generic reader of line files."
@@ -280,7 +242,7 @@ def _read(filepath_or_buffer, kwds):
     if skipfooter is not None:
         kwds['skip_footer'] = skipfooter
 
-    kwds['compression'] = get_compression(filepath_or_buffer, encoding, kwds['compression'])
+    kwds['compression'] = get_compression_type(filepath_or_buffer, encoding, kwds['compression'])
 
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):

From f409cb677adae4e4e03e94c19b3207f8effe1b5a Mon Sep 17 00:00:00 2001
From: khs26
Date: Sun, 6 Dec 2015 23:26:04 +0000
Subject: [PATCH 4/5] Updated whatsnew to address issue number.

---
 doc/source/whatsnew/v0.18.0.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v0.18.0.txt b/doc/source/whatsnew/v0.18.0.txt
index 27a5ee1a3e649..bd9707381190a 100644
--- a/doc/source/whatsnew/v0.18.0.txt
+++ b/doc/source/whatsnew/v0.18.0.txt
@@ -31,7 +31,7 @@ New features
 Other enhancements
 ^^^^^^^^^^^^^^^^^^
 
-- `read_pickle` can now unpickle from compressed files (:issue:``).
+- `read_pickle` can now unpickle from compressed files (:issue:`11666`).
 
 
 

From 0ecdc931d33bba058fccd9d7de3676d14750738f Mon Sep 17 00:00:00 2001
From: khs26
Date: Sun, 6 Dec 2015 23:46:22 +0000
Subject: [PATCH 5/5] Renamed argument and fixed an import.

---
 pandas/io/pickle.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py
index d40486d1fd77f..e444ed30ecd81 100644
--- a/pandas/io/pickle.py
+++ b/pandas/io/pickle.py
@@ -1,6 +1,5 @@
 from pandas.compat import cPickle as pkl, pickle_compat as pc, PY3
-from pandas.io.common import _get_handle
-from pandas.io.parsers import get_compression
+from pandas.io.common import _get_handle, get_compression_type
 
 def to_pickle(obj, path):
     """
@@ -16,7 +15,7 @@ def to_pickle(obj, path):
         pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
 
 
-def read_pickle(path, compression_arg='infer'):
+def read_pickle(path, compression='infer'):
     """
     Load pickled pandas object (or any other pickled object) from the specified
     file path
@@ -28,7 +27,7 @@ def read_pickle(path, compression_arg='infer'):
     ----------
     path : string
         File path
-    compression_arg: {'gzip', 'bz2', 'infer', None}, default 'infer'
+    compression: {'gzip', 'bz2', 'infer', None}, default 'infer'
         Compression type, ('infer' looks for the file extensions .gz and .bz2, using gzip and bz2 to decompress
         respectively).
 
@@ -46,20 +45,20 @@ def try_read(path, encoding=None):
 
         # cpickle
         # GH 6899
-        compression = get_compression(path, encoding, compression_arg)
+        _compression = get_compression_type(path, compression)
         try:
-            with _get_handle(path, 'rb', encoding, compression) as fh:
+            with _get_handle(path, 'rb', encoding, _compression) as fh:
                 return pkl.load(fh)
         except (Exception) as e:
 
             # reg/patched pickle
             try:
-                with _get_handle(path, 'rb', encoding, compression) as fh:
+                with _get_handle(path, 'rb', encoding, _compression) as fh:
                     return pc.load(fh, encoding=encoding, compat=False)
 
             # compat pickle
             except:
-                with _get_handle(path, 'rb', encoding, _compression) as fh:
+                with _get_handle(path, 'rb', encoding, _compression) as fh:
                     return pc.load(fh, encoding=encoding, compat=True)
 
     try: