
Commit 6008d75

Dobatymo authored and jreback committed
ENH: support 'infer' compression in _get_handle() (pandas-dev#17900)
xref gh-15008, xref gh-17262
1 parent 1986dbe commit 6008d75

File tree: 7 files changed (+89 -16 lines)


doc/source/whatsnew/v0.24.0.txt (+1)

@@ -83,6 +83,7 @@ Other Enhancements
 - :func:`read_html` copies cell data across ``colspan``s and ``rowspan``s, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
 - :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
 - :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)
+- :func:`~DataFrame.to_csv` and :func:`~DataFrame.to_json` now support ``compression='infer'`` to infer compression based on filename (:issue:`15008`)
 -

 .. _whatsnew_0240.api_breaking:
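
A minimal usage sketch of the behaviour described by the whatsnew entry above (the file name and data here are hypothetical, not part of the change):

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3]})

# With compression='infer', to_csv picks gzip from the '.gz' suffix.
df.to_csv("data.csv.gz", compression="infer")

# read_csv already understands 'infer', so the round trip is symmetric.
result = pd.read_csv("data.csv.gz", index_col=0, compression="infer")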

pandas/core/frame.py (+4 -4)

@@ -1695,10 +1695,10 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
         encoding : string, optional
             A string representing the encoding to use in the output file,
             defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
-        compression : string, optional
-            A string representing the compression to use in the output file.
-            Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
-            used when the first argument is a filename.
+        compression : {'infer', 'gzip', 'bz2', 'xz', None}, default None
+            If 'infer' and `path_or_buf` is path-like, then detect compression
+            from the following extensions: '.gz', '.bz2' or '.xz'
+            (otherwise no compression).
         line_terminator : string, default ``'\n'``
             The newline character or character sequence to use in the output
             file

pandas/core/generic.py (+1 -1)

@@ -1906,7 +1906,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,

             .. versionadded:: 0.19.0

-        compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
+        compression : {'infer', 'gzip', 'bz2', 'xz', None}, default None
            A string representing the compression to use in the output file,
            only used when the first argument is a filename.

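
For the to_json side, a comparable hedged sketch of how the documented parameter is meant to be used (the file name is hypothetical):

import pandas as pd

df = pd.DataFrame({"A": [1]})

# A '.bz2' suffix plus compression='infer' yields bz2-compressed JSON output.
df.to_json("frame.json.bz2", compression="infer")

# read_json also accepts 'infer', so reading back mirrors the write.
result = pd.read_json("frame.json.bz2", compression="infer")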

pandas/io/common.py (+12 -5)

@@ -267,10 +267,12 @@ def _infer_compression(filepath_or_buffer, compression):

     Parameters
     ----------
-    filepath_or_buf :
+    filepath_or_buffer :
         a path (str) or buffer
-    compression : str or None
-        the compression method including None for no compression and 'infer'
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
+        If 'infer' and `filepath_or_buffer` is path-like, then detect
+        compression from the following extensions: '.gz', '.bz2', '.zip',
+        or '.xz' (otherwise no compression).

     Returns
     -------
@@ -322,8 +324,10 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     mode : str
         mode to open path_or_buf with
     encoding : str or None
-    compression : str or None
-        Supported compression protocols are gzip, bz2, zip, and xz
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
+        If 'infer' and `filepath_or_buffer` is path-like, then detect
+        compression from the following extensions: '.gz', '.bz2', '.zip',
+        or '.xz' (otherwise no compression).
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
@@ -350,6 +354,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     path_or_buf = _stringify_path(path_or_buf)
     is_path = isinstance(path_or_buf, compat.string_types)

+    if is_path:
+        compression = _infer_compression(path_or_buf, compression)
+
     if compression:

         if compat.PY2 and not is_path and encoding:
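
The added `if is_path:` branch means callers of _get_handle no longer need to resolve 'infer' themselves. As an illustration only, here is a standalone sketch of the documented extension mapping; the names below are hypothetical and this is not the actual pandas implementation (which lives in pandas.io.common._infer_compression):

# Hypothetical sketch of extension-based compression inference.
_EXTENSION_MAP = {".gz": "gzip", ".bz2": "bz2", ".zip": "zip", ".xz": "xz"}


def infer_compression_sketch(path, compression):
    """Resolve 'infer' to a concrete method from the file extension."""
    if compression != "infer":
        return compression  # explicit method or None passes through unchanged
    for ext, method in _EXTENSION_MAP.items():
        if path.endswith(ext):
            return method
    return None  # unknown extension: fall back to no compression


print(infer_compression_sketch("data.csv.gz", "infer"))  # 'gzip'
print(infer_compression_sketch("data.csv", "infer"))     # None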

pandas/io/pickle.py (+3 -5)

@@ -5,7 +5,7 @@
 from numpy.lib.format import read_array, write_array
 from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
 from pandas.core.dtypes.common import is_datetime64_dtype, _NS_DTYPE
-from pandas.io.common import _get_handle, _infer_compression, _stringify_path
+from pandas.io.common import _get_handle, _stringify_path


 def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
@@ -67,9 +67,8 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
     >>> os.remove("./dummy.pkl")
     """
     path = _stringify_path(path)
-    inferred_compression = _infer_compression(path, compression)
     f, fh = _get_handle(path, 'wb',
-                        compression=inferred_compression,
+                        compression=compression,
                         is_text=False)
     if protocol < 0:
         protocol = pkl.HIGHEST_PROTOCOL
@@ -138,12 +137,11 @@ def read_pickle(path, compression='infer'):
     >>> os.remove("./dummy.pkl")
     """
     path = _stringify_path(path)
-    inferred_compression = _infer_compression(path, compression)

     def read_wrapper(func):
         # wrapper file handle open/close operation
         f, fh = _get_handle(path, 'rb',
-                            compression=inferred_compression,
+                            compression=compression,
                             is_text=False)
         try:
             return func(f)
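
With inference now delegated to _get_handle, to_pickle and read_pickle keep their compression='infer' default but no longer call _infer_compression directly. A small usage sketch under that assumption (the file name is hypothetical):

import pandas as pd

df = pd.DataFrame({"A": [1, 2]})

# The default compression='infer' detects xz from the '.xz' suffix;
# resolution now happens inside _get_handle rather than in io/pickle.py.
df.to_pickle("frame.pkl.xz")
result = pd.read_pickle("frame.pkl.xz")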

pandas/tests/io/formats/test_to_csv.py (+36 -1)

@@ -1,9 +1,12 @@
 # -*- coding: utf-8 -*-

 import sys
+
+import pytest
+
 import numpy as np
 import pandas as pd
-import pytest
+
 from pandas import DataFrame
 from pandas.util import testing as tm

@@ -316,3 +319,35 @@ def test_to_csv_write_to_open_file(self):
                 df.to_csv(f, header=None, index=None)
             with open(path, 'r') as f:
                 assert f.read() == expected
+
+    @pytest.mark.parametrize("to_infer", [True, False])
+    @pytest.mark.parametrize("read_infer", [True, False])
+    def test_to_csv_compression(self, compression_only,
+                                read_infer, to_infer):
+        # see gh-15008
+        compression = compression_only
+
+        if compression == "zip":
+            pytest.skip("{compression} is not supported "
+                        "for to_csv".format(compression=compression))
+
+        # We'll complete file extension subsequently.
+        filename = "test."
+
+        if compression == "gzip":
+            filename += "gz"
+        else:
+            # xz --> .xz
+            # bz2 --> .bz2
+            filename += compression
+
+        df = DataFrame({"A": [1]})
+
+        to_compression = "infer" if to_infer else compression
+        read_compression = "infer" if read_infer else compression
+
+        with tm.ensure_clean(filename) as path:
+            df.to_csv(path, compression=to_compression)
+            result = pd.read_csv(path, index_col=0,
+                                 compression=read_compression)
+            tm.assert_frame_equal(result, df)

pandas/tests/io/json/test_compression.py (+32)

@@ -88,3 +88,35 @@ def test_read_unsupported_compression_type():
         msg = "Unrecognized compression type: unsupported"
         assert_raises_regex(ValueError, msg, pd.read_json,
                             path, compression="unsupported")
+
+
+@pytest.mark.parametrize("to_infer", [True, False])
+@pytest.mark.parametrize("read_infer", [True, False])
+def test_to_json_compression(compression_only,
+                             read_infer, to_infer):
+    # see gh-15008
+    compression = compression_only
+
+    if compression == "zip":
+        pytest.skip("{compression} is not supported "
+                    "for to_csv".format(compression=compression))
+
+    # We'll complete file extension subsequently.
+    filename = "test."
+
+    if compression == "gzip":
+        filename += "gz"
+    else:
+        # xz --> .xz
+        # bz2 --> .bz2
+        filename += compression
+
+    df = pd.DataFrame({"A": [1]})
+
+    to_compression = "infer" if to_infer else compression
+    read_compression = "infer" if read_infer else compression
+
+    with tm.ensure_clean(filename) as path:
+        df.to_json(path, compression=to_compression)
+        result = pd.read_json(path, compression=read_compression)
+        tm.assert_frame_equal(result, df)
