Skip to content

Commit a1ff671

Browse files
dhimmel authored and jowens committed
ENH: Infer compression from non-string paths (pandas-dev#17206)
1 parent 536b761 commit a1ff671

File tree

5 files changed

+44
-23
lines changed

5 files changed

+44
-23
lines changed

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -127,6 +127,7 @@ Other Enhancements
127127
- :func:`read_html` handles colspan and rowspan arguments and attempts to infer a header if the header is not explicitly specified (:issue:`17054`)
128128
- Integration with `Apache Parquet <https://parquet.apache.org/>`__, including a new top-level :func:`read_parquet` and :func:`DataFrame.to_parquet` method, see :ref:`here <io.parquet>`.
129129
- :func:`DataFrame.add_prefix` and :func:`DataFrame.add_suffix` now accept strings containing the '%' character. (:issue:`17151`)
130+
- ``read_*`` methods can now infer compression from non-string paths, such as ``pathlib.Path`` objects (:issue:`17206`).
130131

131132
.. _whatsnew_0210.api_breaking:
132133

pandas/io/common.py

+8-6
Original file line numberDiff line numberDiff line change
@@ -272,13 +272,15 @@ def _infer_compression(filepath_or_buffer, compression):
272272
if compression is None:
273273
return None
274274

275-
# Cannot infer compression of a buffer. Hence assume no compression.
276-
is_path = isinstance(filepath_or_buffer, compat.string_types)
277-
if compression == 'infer' and not is_path:
278-
return None
279-
280-
# Infer compression from the filename/URL extension
275+
# Infer compression
281276
if compression == 'infer':
277+
# Convert all path types (e.g. pathlib.Path) to strings
278+
filepath_or_buffer = _stringify_path(filepath_or_buffer)
279+
if not isinstance(filepath_or_buffer, compat.string_types):
280+
# Cannot infer compression of a buffer, assume no compression
281+
return None
282+
283+
# Infer compression from the filename/URL extension
282284
for compression, extension in _compression_to_extension.items():
283285
if filepath_or_buffer.endswith(extension):
284286
return compression

pandas/io/parsers.py

+5-5
Original file line numberDiff line numberDiff line change
@@ -208,11 +208,11 @@
208208
<http://pandas.pydata.org/pandas-docs/stable/io.html#io-chunking>`_
209209
for more information on ``iterator`` and ``chunksize``.
210210
compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
211-
For on-the-fly decompression of on-disk data. If 'infer', then use gzip,
212-
bz2, zip or xz if filepath_or_buffer is a string ending in '.gz', '.bz2',
213-
'.zip', or 'xz', respectively, and no decompression otherwise. If using
214-
'zip', the ZIP file must contain only one data file to be read in.
215-
Set to None for no decompression.
211+
For on-the-fly decompression of on-disk data. If 'infer' and
212+
`filepath_or_buffer` is path-like, then detect compression from the
213+
following extensions: '.gz', '.bz2', '.zip', or '.xz' (otherwise no
214+
decompression). If using 'zip', the ZIP file must contain only one data
215+
file to be read in. Set to None for no decompression.
216216
217217
.. versionadded:: 0.18.1 support for 'zip' and 'xz' compression.
218218

pandas/io/pickle.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ def read_pickle(path, compression='infer'):
6262
File path
6363
compression : {'infer', 'gzip', 'bz2', 'xz', 'zip', None}, default 'infer'
6464
For on-the-fly decompression of on-disk data. If 'infer', then use
65-
gzip, bz2, xz or zip if path is a string ending in '.gz', '.bz2', 'xz',
66-
or 'zip' respectively, and no decompression otherwise.
65+
gzip, bz2, xz or zip if path ends in '.gz', '.bz2', '.xz',
66+
or '.zip' respectively, and no decompression otherwise.
6767
Set to None for no decompression.
6868
6969
.. versionadded:: 0.20.0

pandas/tests/io/test_common.py

+28-10
Original file line numberDiff line numberDiff line change
@@ -14,16 +14,6 @@
1414

1515
from pandas import read_csv, concat
1616

17-
try:
18-
from pathlib import Path
19-
except ImportError:
20-
pass
21-
22-
try:
23-
from py.path import local as LocalPath
24-
except ImportError:
25-
pass
26-
2717

2818
class CustomFSPath(object):
2919
"""For testing fspath on unknown objects"""
@@ -34,6 +24,21 @@ def __fspath__(self):
3424
return self.path
3525

3626

27+
# Functions that consume a string path and return a string or path-like object
28+
path_types = [str, CustomFSPath]
29+
30+
try:
31+
from pathlib import Path
32+
path_types.append(Path)
33+
except ImportError:
34+
pass
35+
36+
try:
37+
from py.path import local as LocalPath
38+
path_types.append(LocalPath)
39+
except ImportError:
40+
pass
41+
3742
HERE = os.path.dirname(__file__)
3843

3944

@@ -83,6 +88,19 @@ def test_stringify_path_fspath(self):
8388
result = common._stringify_path(p)
8489
assert result == 'foo/bar.csv'
8590

91+
@pytest.mark.parametrize('extension,expected', [
92+
('', None),
93+
('.gz', 'gzip'),
94+
('.bz2', 'bz2'),
95+
('.zip', 'zip'),
96+
('.xz', 'xz'),
97+
])
98+
@pytest.mark.parametrize('path_type', path_types)
99+
def test_infer_compression_from_path(self, extension, expected, path_type):
100+
path = path_type('foo/bar.csv' + extension)
101+
compression = common._infer_compression(path, compression='infer')
102+
assert compression == expected
103+
86104
def test_get_filepath_or_buffer_with_path(self):
87105
filename = '~/sometest'
88106
filepath_or_buffer, _, _ = common.get_filepath_or_buffer(filename)

0 commit comments

Comments
 (0)