
Commit 6008d75

Dobatymo authored and jreback committed
ENH: support 'infer' compression in _get_handle() (pandas-dev#17900)
xref gh-15008, xref gh-17262
1 parent 1986dbe commit 6008d75

File tree: 7 files changed (+89 -16 lines)


doc/source/whatsnew/v0.24.0.txt (+1)

@@ -83,6 +83,7 @@ Other Enhancements
 - :func:`read_html` copies cell data across ``colspan``s and ``rowspan``s, and it treats all-``th`` table rows as headers if ``header`` kwarg is not given and there is no ``thead`` (:issue:`17054`)
 - :meth:`Series.nlargest`, :meth:`Series.nsmallest`, :meth:`DataFrame.nlargest`, and :meth:`DataFrame.nsmallest` now accept the value ``"all"`` for the ``keep`` argument. This keeps all ties for the nth largest/smallest value (:issue:`16818`)
 - :class:`IntervalIndex` has gained the :meth:`~IntervalIndex.set_closed` method to change the existing ``closed`` value (:issue:`21670`)
+- :func:`~DataFrame.to_csv` and :func:`~DataFrame.to_json` now support ``compression='infer'`` to infer compression based on filename (:issue:`15008`)
 -

 .. _whatsnew_0240.api_breaking:
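
A minimal usage sketch of the behaviour described by the whatsnew entry above (the file name and data here are hypothetical, not part of the change):

import pandas as pd

df = pd.DataFrame({"A": [1, 2, 3]})

# With compression='infer', to_csv picks gzip from the '.gz' suffix.
df.to_csv("data.csv.gz", compression="infer")

# read_csv already understands 'infer', so the round trip is symmetric.
result = pd.read_csv("data.csv.gz", index_col=0, compression="infer")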

pandas/core/frame.py (+4 -4)

@@ -1695,10 +1695,10 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
         encoding : string, optional
             A string representing the encoding to use in the output file,
             defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
-        compression : string, optional
-            A string representing the compression to use in the output file.
-            Allowed values are 'gzip', 'bz2', 'zip', 'xz'. This input is only
-            used when the first argument is a filename.
+        compression : {'infer', 'gzip', 'bz2', 'xz', None}, default None
+            If 'infer' and `path_or_buf` is path-like, then detect compression
+            from the following extensions: '.gz', '.bz2' or '.xz'
+            (otherwise no compression).
         line_terminator : string, default ``'\n'``
             The newline character or character sequence to use in the output
             file

pandas/core/generic.py (+1 -1)

@@ -1906,7 +1906,7 @@ def to_json(self, path_or_buf=None, orient=None, date_format=None,

             .. versionadded:: 0.19.0

-        compression : {None, 'gzip', 'bz2', 'zip', 'xz'}
+        compression : {'infer', 'gzip', 'bz2', 'xz', None}, default None
            A string representing the compression to use in the output file,
            only used when the first argument is a filename.

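
For the to_json side, a comparable hedged sketch of how the documented parameter is meant to be used (the file name is hypothetical):

import pandas as pd

df = pd.DataFrame({"A": [1]})

# A '.bz2' suffix plus compression='infer' yields bz2-compressed JSON output.
df.to_json("frame.json.bz2", compression="infer")

# read_json also accepts 'infer', so reading back mirrors the write.
result = pd.read_json("frame.json.bz2", compression="infer")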

pandas/io/common.py (+12 -5)

@@ -267,10 +267,12 @@ def _infer_compression(filepath_or_buffer, compression):

     Parameters
     ----------
-    filepath_or_buf :
+    filepath_or_buffer :
         a path (str) or buffer
-    compression : str or None
-        the compression method including None for no compression and 'infer'
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}
+        If 'infer' and `filepath_or_buffer` is path-like, then detect
+        compression from the following extensions: '.gz', '.bz2', '.zip',
+        or '.xz' (otherwise no compression).

     Returns
     -------
@@ -322,8 +324,10 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     mode : str
         mode to open path_or_buf with
     encoding : str or None
-    compression : str or None
-        Supported compression protocols are gzip, bz2, zip, and xz
+    compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default None
+        If 'infer' and `filepath_or_buffer` is path-like, then detect
+        compression from the following extensions: '.gz', '.bz2', '.zip',
+        or '.xz' (otherwise no compression).
     memory_map : boolean, default False
         See parsers._parser_params for more information.
     is_text : boolean, default True
@@ -350,6 +354,9 @@ def _get_handle(path_or_buf, mode, encoding=None, compression=None,
     path_or_buf = _stringify_path(path_or_buf)
     is_path = isinstance(path_or_buf, compat.string_types)

+    if is_path:
+        compression = _infer_compression(path_or_buf, compression)
+
     if compression:

         if compat.PY2 and not is_path and encoding:
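
The added `if is_path:` branch means callers of _get_handle no longer need to resolve 'infer' themselves. As an illustration only, here is a standalone sketch of the documented extension mapping; the names below are hypothetical and this is not the actual pandas implementation (which lives in pandas.io.common._infer_compression):

# Hypothetical sketch of extension-based compression inference.
_EXTENSION_MAP = {".gz": "gzip", ".bz2": "bz2", ".zip": "zip", ".xz": "xz"}


def infer_compression_sketch(path, compression):
    """Resolve 'infer' to a concrete method from the file extension."""
    if compression != "infer":
        return compression  # explicit method or None passes through unchanged
    for ext, method in _EXTENSION_MAP.items():
        if path.endswith(ext):
            return method
    return None  # unknown extension: fall back to no compression


print(infer_compression_sketch("data.csv.gz", "infer"))  # 'gzip'
print(infer_compression_sketch("data.csv", "infer"))     # None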

pandas/io/pickle.py (+3 -5)

@@ -5,7 +5,7 @@
 from numpy.lib.format import read_array, write_array
 from pandas.compat import BytesIO, cPickle as pkl, pickle_compat as pc, PY3
 from pandas.core.dtypes.common import is_datetime64_dtype, _NS_DTYPE
-from pandas.io.common import _get_handle, _infer_compression, _stringify_path
+from pandas.io.common import _get_handle, _stringify_path


 def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
@@ -67,9 +67,8 @@ def to_pickle(obj, path, compression='infer', protocol=pkl.HIGHEST_PROTOCOL):
     >>> os.remove("./dummy.pkl")
     """
     path = _stringify_path(path)
-    inferred_compression = _infer_compression(path, compression)
     f, fh = _get_handle(path, 'wb',
-                        compression=inferred_compression,
+                        compression=compression,
                         is_text=False)
     if protocol < 0:
         protocol = pkl.HIGHEST_PROTOCOL
@@ -138,12 +137,11 @@ def read_pickle(path, compression='infer'):
     >>> os.remove("./dummy.pkl")
     """
     path = _stringify_path(path)
-    inferred_compression = _infer_compression(path, compression)

     def read_wrapper(func):
         # wrapper file handle open/close operation
         f, fh = _get_handle(path, 'rb',
-                            compression=inferred_compression,
+                            compression=compression,
                             is_text=False)
         try:
             return func(f)
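
With inference now delegated to _get_handle, to_pickle and read_pickle keep their compression='infer' default but no longer call _infer_compression directly. A small usage sketch under that assumption (the file name is hypothetical):

import pandas as pd

df = pd.DataFrame({"A": [1, 2]})

# The default compression='infer' detects xz from the '.xz' suffix;
# resolution now happens inside _get_handle rather than in io/pickle.py.
df.to_pickle("frame.pkl.xz")
result = pd.read_pickle("frame.pkl.xz")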

pandas/tests/io/formats/test_to_csv.py (+36 -1)

@@ -1,9 +1,12 @@
 # -*- coding: utf-8 -*-

 import sys
+
+import pytest
+
 import numpy as np
 import pandas as pd
-import pytest
+
 from pandas import DataFrame
 from pandas.util import testing as tm

@@ -316,3 +319,35 @@ def test_to_csv_write_to_open_file(self):
                 df.to_csv(f, header=None, index=None)
             with open(path, 'r') as f:
                 assert f.read() == expected
+
+    @pytest.mark.parametrize("to_infer", [True, False])
+    @pytest.mark.parametrize("read_infer", [True, False])
+    def test_to_csv_compression(self, compression_only,
+                                read_infer, to_infer):
+        # see gh-15008
+        compression = compression_only
+
+        if compression == "zip":
+            pytest.skip("{compression} is not supported "
+                        "for to_csv".format(compression=compression))
+
+        # We'll complete file extension subsequently.
+        filename = "test."
+
+        if compression == "gzip":
+            filename += "gz"
+        else:
+            # xz --> .xz
+            # bz2 --> .bz2
+            filename += compression
+
+        df = DataFrame({"A": [1]})
+
+        to_compression = "infer" if to_infer else compression
+        read_compression = "infer" if read_infer else compression
+
+        with tm.ensure_clean(filename) as path:
+            df.to_csv(path, compression=to_compression)
+            result = pd.read_csv(path, index_col=0,
+                                 compression=read_compression)
+            tm.assert_frame_equal(result, df)

pandas/tests/io/json/test_compression.py (+32)

@@ -88,3 +88,35 @@ def test_read_unsupported_compression_type():
         msg = "Unrecognized compression type: unsupported"
         assert_raises_regex(ValueError, msg, pd.read_json,
                             path, compression="unsupported")
+
+
+@pytest.mark.parametrize("to_infer", [True, False])
+@pytest.mark.parametrize("read_infer", [True, False])
+def test_to_json_compression(compression_only,
+                             read_infer, to_infer):
+    # see gh-15008
+    compression = compression_only
+
+    if compression == "zip":
+        pytest.skip("{compression} is not supported "
+                    "for to_csv".format(compression=compression))
+
+    # We'll complete file extension subsequently.
+    filename = "test."
+
+    if compression == "gzip":
+        filename += "gz"
+    else:
+        # xz --> .xz
+        # bz2 --> .bz2
+        filename += compression
+
+    df = pd.DataFrame({"A": [1]})
+
+    to_compression = "infer" if to_infer else compression
+    read_compression = "infer" if read_infer else compression
+
+    with tm.ensure_clean(filename) as path:
+        df.to_json(path, compression=to_compression)
+        result = pd.read_json(path, compression=read_compression)
+        tm.assert_frame_equal(result, df)
