BUG/REG: file-handle object handled incorrectly in to_csv (#21478)

minggli · jorisvandenbossche · commit cfc787e6e3f8 · 2018-06-29T16:57:27.000+02:00
(cherry picked from commit 91451cb)
diff --git a/doc/source/whatsnew/v0.23.2.txt b/doc/source/whatsnew/v0.23.2.txt
@@ -16,7 +16,7 @@ and bug fixes. We recommend that all users upgrade to this version.
 Fixed Regressions
 ~~~~~~~~~~~~~~~~~
 
--
+- Fixed regression in :meth:`to_csv` where file-like objects were handled incorrectly (:issue:`21471`)
 -
 
 .. _whatsnew_0232.performance:
diff --git a/pandas/io/common.py b/pandas/io/common.py
@@ -445,6 +445,10 @@ def __init__(self, file, mode, compression=zipfile.ZIP_DEFLATED, **kwargs):
     def write(self, data):
         super(BytesZipFile, self).writestr(self.filename, data)
 
+    @property
+    def closed(self):
+        return self.fp is None
+
 
 class MMapWrapper(BaseIterator):
     """
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -5,11 +5,13 @@
 
 from __future__ import print_function
 
+import warnings
+
 import csv as csvlib
+from zipfile import ZipFile
 import numpy as np
 
 from pandas.core.dtypes.missing import notna
-from pandas.core.dtypes.inference import is_file_like
 from pandas.core.index import Index, MultiIndex
 from pandas import compat
 from pandas.compat import (StringIO, range, zip)
@@ -128,19 +130,31 @@ def save(self):
         else:
             encoding = self.encoding
 
-        # PR 21300 uses string buffer to receive csv writing and dump into
-        # file-like output with compression as option. GH 21241, 21118
-        f = StringIO()
-        if not is_file_like(self.path_or_buf):
-            # path_or_buf is path
-            path_or_buf = self.path_or_buf
-        elif hasattr(self.path_or_buf, 'name'):
-            # path_or_buf is file handle
-            path_or_buf = self.path_or_buf.name
-        else:
-            # path_or_buf is file-like IO objects.
+        # GH 21227 internal compression is not used when file-like passed.
+        if self.compression and hasattr(self.path_or_buf, 'write'):
+            msg = ("compression has no effect when passing file-like "
+                   "object as input.")
+            warnings.warn(msg, RuntimeWarning, stacklevel=2)
+
+        # when zip compression is called.
+        is_zip = isinstance(self.path_or_buf, ZipFile) or (
+            not hasattr(self.path_or_buf, 'write')
+            and self.compression == 'zip')
+
+        if is_zip:
+            # zipfile doesn't support writing string to archive. uses string
+            # buffer to receive csv writing and dump into zip compression
+            # file handle. GH 21241, 21118
+            f = StringIO()
+            close = False
+        elif hasattr(self.path_or_buf, 'write'):
             f = self.path_or_buf
-            path_or_buf = None
+            close = False
+        else:
+            f, handles = _get_handle(self.path_or_buf, self.mode,
+                                     encoding=encoding,
+                                     compression=self.compression)
+            close = True
 
         try:
             writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -157,13 +171,18 @@ def save(self):
             self._save()
 
         finally:
-            # GH 17778 handles zip compression for byte strings separately.
-            buf = f.getvalue()
-            if path_or_buf:
-                f, handles = _get_handle(path_or_buf, self.mode,
-                                         encoding=encoding,
-                                         compression=self.compression)
-                f.write(buf)
+            if is_zip:
+                # GH 17778 handles zip compression separately.
+                buf = f.getvalue()
+                if hasattr(self.path_or_buf, 'write'):
+                    self.path_or_buf.write(buf)
+                else:
+                    f, handles = _get_handle(self.path_or_buf, self.mode,
+                                             encoding=encoding,
+                                             compression=self.compression)
+                    f.write(buf)
+                    close = True
+            if close:
                 f.close()
                 for _fh in handles:
                     _fh.close()
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
@@ -9,6 +9,7 @@
 import numpy as np
 
 from pandas.compat import (lmap, range, lrange, StringIO, u)
+from pandas.io.common import _get_handle
 import pandas.core.common as com
 from pandas.errors import ParserError
 from pandas import (DataFrame, Index, Series, MultiIndex, Timestamp,
@@ -935,18 +936,19 @@ def test_to_csv_compression(self, df, encoding, compression):
         with ensure_clean() as filename:
 
             df.to_csv(filename, compression=compression, encoding=encoding)
-
             # test the round trip - to_csv -> read_csv
             result = read_csv(filename, compression=compression,
                               index_col=0, encoding=encoding)
+            assert_frame_equal(df, result)
 
-            with open(filename, 'w') as fh:
-                df.to_csv(fh, compression=compression, encoding=encoding)
-
-            result_fh = read_csv(filename, compression=compression,
-                                 index_col=0, encoding=encoding)
+            # test the round trip using file handle - to_csv -> read_csv
+            f, _handles = _get_handle(filename, 'w', compression=compression,
+                                      encoding=encoding)
+            with f:
+                df.to_csv(f, encoding=encoding)
+            result = pd.read_csv(filename, compression=compression,
+                                 encoding=encoding, index_col=0, squeeze=True)
             assert_frame_equal(df, result)
-            assert_frame_equal(df, result_fh)
 
             # explicitly make sure file is compressed
             with tm.decompress_file(filename, compression) as fh:
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
@@ -11,6 +11,7 @@
 from pandas import Series, DataFrame
 
 from pandas.compat import StringIO, u
+from pandas.io.common import _get_handle
 from pandas.util.testing import (assert_series_equal, assert_almost_equal,
                                  assert_frame_equal, ensure_clean)
 import pandas.util.testing as tm
@@ -152,20 +153,19 @@ def test_to_csv_compression(self, s, encoding, compression):
 
             s.to_csv(filename, compression=compression, encoding=encoding,
                      header=True)
-
             # test the round trip - to_csv -> read_csv
             result = pd.read_csv(filename, compression=compression,
                                  encoding=encoding, index_col=0, squeeze=True)
+            assert_series_equal(s, result)
 
-            with open(filename, 'w') as fh:
-                s.to_csv(fh, compression=compression, encoding=encoding,
-                         header=True)
-
-            result_fh = pd.read_csv(filename, compression=compression,
-                                    encoding=encoding, index_col=0,
-                                    squeeze=True)
+            # test the round trip using file handle - to_csv -> read_csv
+            f, _handles = _get_handle(filename, 'w', compression=compression,
+                                      encoding=encoding)
+            with f:
+                s.to_csv(f, encoding=encoding, header=True)
+            result = pd.read_csv(filename, compression=compression,
+                                 encoding=encoding, index_col=0, squeeze=True)
             assert_series_equal(s, result)
-            assert_series_equal(s, result_fh)
 
             # explicitly ensure file was compressed
             with tm.decompress_file(filename, compression) as fh:
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
@@ -11,6 +11,7 @@
 from pandas.compat import range, lmap
 import pandas.core.common as com
 from pandas.core import ops
+from pandas.io.common import _get_handle
 import pandas.util.testing as tm
 
 
@@ -248,19 +249,34 @@ def test_compression_size(obj, method, compression):
                      [12.32112, 123123.2, 321321.2]],
               columns=['X', 'Y', 'Z']),
     Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
-@pytest.mark.parametrize('method', ['to_csv'])
+@pytest.mark.parametrize('method', ['to_csv', 'to_json'])
 def test_compression_size_fh(obj, method, compression_only):
 
     with tm.ensure_clean() as filename:
-        with open(filename, 'w') as fh:
-            getattr(obj, method)(fh, compression=compression_only)
-            assert not fh.closed
-        assert fh.closed
+        f, _handles = _get_handle(filename, 'w', compression=compression_only)
+        with f:
+            getattr(obj, method)(f)
+            assert not f.closed
+        assert f.closed
         compressed = os.path.getsize(filename)
     with tm.ensure_clean() as filename:
-        with open(filename, 'w') as fh:
-            getattr(obj, method)(fh, compression=None)
-            assert not fh.closed
-        assert fh.closed
+        f, _handles = _get_handle(filename, 'w', compression=None)
+        with f:
+            getattr(obj, method)(f)
+            assert not f.closed
+        assert f.closed
         uncompressed = os.path.getsize(filename)
         assert uncompressed > compressed
+
+
+# GH 21227
+def test_compression_warning(compression_only):
+    df = DataFrame(100 * [[0.123456, 0.234567, 0.567567],
+                          [12.32112, 123123.2, 321321.2]],
+                   columns=['X', 'Y', 'Z'])
+    with tm.ensure_clean() as filename:
+        f, _handles = _get_handle(filename, 'w', compression=compression_only)
+        with tm.assert_produces_warning(RuntimeWarning,
+                                        check_stacklevel=False):
+            with f:
+                df.to_csv(f, compression=compression_only)

Original file line number	Diff line number	Diff line change
`@@ -16,7 +16,7 @@ and bug fixes. We recommend that all users upgrade to this version.`
`16`	`16`	`Fixed Regressions`
`17`	`17`	`~~~~~~~~~~~~~~~~~`
`18`	`18`
`19`		`--`
	`19`	+- Fixed regression in :meth:`to_csv` where file-like objects were handled incorrectly (:issue:`21471`)
`20`	`20`	`-`
`21`	`21`
`22`	`22`	`.. _whatsnew_0232.performance:`