ENH: Read from compressed data sources #11677
Changes from 2 commits
pandas/io/parsers.py
@@ -234,18 +234,28 @@ class ParserWarning(Warning):
     fields if it is not spaces (e.g., '~').
 """ % (_parser_params % _fwf_widths)
 
 
-def _read(filepath_or_buffer, kwds):
-    "Generic reader of line files."
-    encoding = kwds.get('encoding', None)
-    skipfooter = kwds.pop('skipfooter', None)
-    if skipfooter is not None:
-        kwds['skip_footer'] = skipfooter
-
+def get_compression(filepath_or_buffer, encoding, compression_kwd):
+    """
+    Determine the compression type of a file or buffer.
+
+    Parameters
+    ----------
+    filepath_or_buffer : string
+        File path
+    encoding: string
+        Encoding type
+    compression_kwd: {'gzip', 'bz2', 'infer', None}
+        Compression type ('infer' looks for the file extensions .gz and .bz2, using gzip and bz2 to decompress
+        respectively).
+
+    Returns
+    -------
+    compression : {'gzip', 'bz2', None} depending on result
+    """
     # If the input could be a filename, check for a recognizable compression extension.
     # If we're reading from a URL, the `get_filepath_or_buffer` will use header info
     # to determine compression, so use what it finds in that case.
-    inferred_compression = kwds.get('compression')
+    inferred_compression = compression_kwd
     if inferred_compression == 'infer':
         if isinstance(filepath_or_buffer, compat.string_types):
             if filepath_or_buffer.endswith('.gz'):

Review comment on get_compression: move this entire function to … This should be a bit more lightweight; as you can see, there are already other functions which use the compression. It will simply infer from a keyword and/or file extension and return the type of compression, which can then be passed to other routines.
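As a rough illustration of the reviewer's suggestion, here is a minimal standalone sketch of a keyword-plus-extension inference helper. It is only a sketch under assumed names (infer_compression_type is not a pandas function), and it omits the URL/header handling that get_filepath_or_buffer performs.

```python
# Hypothetical sketch of the lightweight helper described in the review;
# not the pandas implementation, names are illustrative only.

def infer_compression_type(filepath_or_buffer, compression_kwd):
    """Return 'gzip', 'bz2', or None from a keyword and/or file extension."""
    if compression_kwd != 'infer':
        # An explicit keyword ('gzip', 'bz2', or None) is used as-is.
        return compression_kwd
    if isinstance(filepath_or_buffer, str):
        if filepath_or_buffer.endswith('.gz'):
            return 'gzip'
        if filepath_or_buffer.endswith('.bz2'):
            return 'bz2'
    # Nothing recognizable from the name: fall back to no compression.
    return None


print(infer_compression_type('data.csv.gz', 'infer'))   # -> gzip
print(infer_compression_type('data.csv', 'infer'))      # -> None
print(infer_compression_type('data.csv', 'bz2'))        # -> bz2
```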
@@ -259,8 +269,18 @@ def _read(filepath_or_buffer, kwds):
     filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer,
                                                                  encoding,
-                                                                 compression=kwds.get('compression', None))
-    kwds['compression'] = inferred_compression if compression == 'infer' else compression
+                                                                 compression=compression_kwd)
+    return inferred_compression if compression == 'infer' else compression
+
+
+def _read(filepath_or_buffer, kwds):
+    "Generic reader of line files."
+    encoding = kwds.get('encoding', None)
+    skipfooter = kwds.pop('skipfooter', None)
+    if skipfooter is not None:
+        kwds['skip_footer'] = skipfooter
+
+    kwds['compression'] = get_compression(filepath_or_buffer, encoding, kwds['compression'])
 
     if kwds.get('date_parser', None) is not None:
         if isinstance(kwds['parse_dates'], bool):
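For a sense of how this plumbing is exercised from the user side, here is a small round trip through read_csv. The file name and contents are invented for the example; 'infer' resolves to gzip purely from the .gz extension, as the docstring above describes.

```python
# Illustrative round trip; file name and contents are made up for this example.
import gzip

import pandas as pd

with gzip.open('example.csv.gz', 'wt') as f:
    f.write('a,b\n1,2\n3,4\n')

# Explicit keyword and extension-based inference should load the same frame.
df_explicit = pd.read_csv('example.csv.gz', compression='gzip')
df_inferred = pd.read_csv('example.csv.gz', compression='infer')
assert df_explicit.equals(df_inferred)
```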
pandas/io/pickle.py
@@ -1,4 +1,6 @@
 from pandas.compat import cPickle as pkl, pickle_compat as pc, PY3
+from pandas.io.common import _get_handle
+from pandas.io.parsers import get_compression
 
 
 def to_pickle(obj, path):
     """
@@ -14,7 +16,7 @@ def to_pickle(obj, path):
         pkl.dump(obj, f, protocol=pkl.HIGHEST_PROTOCOL)
 
 
-def read_pickle(path):
+def read_pickle(path, compression_arg='infer'):
     """
     Load pickled pandas object (or any other pickled object) from the specified
     file path

Review comment on the compression_arg parameter: should be named …
@@ -26,6 +28,9 @@ def read_pickle(path):
     ----------
     path : string
         File path
+    compression_arg: {'gzip', 'bz2', 'infer', None}, default 'infer'
+        Compression type ('infer' looks for the file extensions .gz and .bz2, using gzip and bz2 to decompress
+        respectively).
 
     Returns
     -------
@@ -41,19 +46,20 @@ def try_read(path, encoding=None):
         # cpickle
         # GH 6899
+        compression = get_compression(path, encoding, compression_arg)
         try:
-            with open(path, 'rb') as fh:
+            with _get_handle(path, 'rb', encoding, compression) as fh:
                 return pkl.load(fh)
         except (Exception) as e:
 
             # reg/patched pickle
             try:
-                with open(path, 'rb') as fh:
+                with _get_handle(path, 'rb', encoding, compression) as fh:
                     return pc.load(fh, encoding=encoding, compat=False)
 
             # compat pickle
             except:
-                with open(path, 'rb') as fh:
+                with _get_handle(path, 'rb', encoding, compression) as fh:
                     return pc.load(fh, encoding=encoding, compat=True)
 
     try:
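A hedged usage sketch of the patched read_pickle: write a gzip-compressed pickle by hand and read it back, letting the .gz extension drive the inference. The keyword is spelled compression_arg as in this revision of the diff (the review above suggests a different name), so the eventually merged API may differ.

```python
# Illustrative only; assumes the patched read_pickle from this diff.
import gzip
import pickle

import pandas as pd
from pandas.io.pickle import read_pickle

df = pd.DataFrame({'a': [1, 2, 3]})

# Write a gzip-compressed pickle by hand (read_pickle only reads here).
with gzip.open('frame.pkl.gz', 'wb') as f:
    pickle.dump(df, f, protocol=pickle.HIGHEST_PROTOCOL)

# The .gz extension lets the default compression_arg='infer' pick gzip;
# passing compression_arg='gzip' explicitly would behave the same way.
restored = read_pickle('frame.pkl.gz', compression_arg='infer')
assert restored.equals(df)
```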
Review comment: add in the actual number here (11666).