BUG: Fix encoding error in to_csv compression (pandas-dev#21300)

minggli · TomAugspurger · commit 8350429b74f5 · 2018-06-12T10:08:41.000-05:00
(cherry picked from commit b32fdc4)
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
@@ -121,6 +121,14 @@ I/O
 - Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` causes encoding error when compression and encoding are specified (:issue:`21241`, :issue:`21118`)
 - Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
 - Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`)
+-
+
+Plotting
+^^^^^^^^
+
+-
+-
+
 
 Reshaping
 
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -9,6 +9,7 @@
 import numpy as np
 
 from pandas.core.dtypes.missing import notna
+from pandas.core.dtypes.inference import is_file_like
 from pandas.core.index import Index, MultiIndex
 from pandas import compat
 from pandas.compat import (StringIO, range, zip)
@@ -127,14 +128,19 @@ def save(self):
         else:
             encoding = self.encoding
 
-        if hasattr(self.path_or_buf, 'write'):
-            f = self.path_or_buf
-            close = False
+        # PR 21300: write the CSV into an in-memory string buffer first, then
+        # dump it to the target file with optional compression. GH 21241, 21118
+        f = StringIO()
+        if not is_file_like(self.path_or_buf):
+            # path_or_buf is path
+            path_or_buf = self.path_or_buf
+        elif hasattr(self.path_or_buf, 'name'):
+            # path_or_buf is file handle
+            path_or_buf = self.path_or_buf.name
         else:
-            f, handles = _get_handle(self.path_or_buf, self.mode,
-                                     encoding=encoding,
-                                     compression=None)
-            close = True if self.compression is None else False
+            # path_or_buf is a file-like IO object without a name.
+            f = self.path_or_buf
+            path_or_buf = None
 
         try:
             writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,18 +157,16 @@ def save(self):
             self._save()
 
         finally:
-            # GH 17778 handles compression for byte strings.
-            if not close and self.compression:
-                f.close()
-                with open(self.path_or_buf, 'r') as f:
-                    data = f.read()
-                f, handles = _get_handle(self.path_or_buf, self.mode,
+            # GH 17778 handles zip compression for byte strings separately.
+            buf = f.getvalue()
+            if path_or_buf:
+                f, handles = _get_handle(path_or_buf, self.mode,
                                          encoding=encoding,
                                          compression=self.compression)
-                f.write(data)
-                close = True
-            if close:
+                f.write(buf)
                 f.close()
+                for _fh in handles:
+                    _fh.close()
 
     def _save_header(self):
 
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
@@ -919,29 +919,45 @@ def test_to_csv_path_is_none(self):
         recons = pd.read_csv(StringIO(csv_str), index_col=0)
         assert_frame_equal(self.frame, recons)
 
-    def test_to_csv_compression(self, compression):
-
-        df = DataFrame([[0.123456, 0.234567, 0.567567],
-                        [12.32112, 123123.2, 321321.2]],
-                       index=['A', 'B'], columns=['X', 'Y', 'Z'])
+    @pytest.mark.parametrize('df,encoding', [
+        (DataFrame([[0.123456, 0.234567, 0.567567],
+                    [12.32112, 123123.2, 321321.2]],
+                   index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
+        # GH 21241, 21118
+        (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
+        (DataFrame(5 * [[123, u"你好", u"世界"]],
+                   columns=['X', 'Y', 'Z']), 'gb2312'),
+        (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
+                   columns=['X', 'Y', 'Z']), 'cp737')
+    ])
+    def test_to_csv_compression(self, df, encoding, compression):
 
         with ensure_clean() as filename:
 
-            df.to_csv(filename, compression=compression)
+            df.to_csv(filename, compression=compression, encoding=encoding)
 
             # test the round trip - to_csv -> read_csv
-            rs = read_csv(filename, compression=compression,
-                          index_col=0)
-            assert_frame_equal(df, rs)
+            result = read_csv(filename, compression=compression,
+                              index_col=0, encoding=encoding)
+
+            with open(filename, 'w') as fh:
+                df.to_csv(fh, compression=compression, encoding=encoding)
+
+            result_fh = read_csv(filename, compression=compression,
+                                 index_col=0, encoding=encoding)
+            assert_frame_equal(df, result)
+            assert_frame_equal(df, result_fh)
 
             # explicitly make sure file is compressed
             with tm.decompress_file(filename, compression) as fh:
-                text = fh.read().decode('utf8')
+                text = fh.read().decode(encoding or 'utf8')
                 for col in df.columns:
                     assert col in text
 
             with tm.decompress_file(filename, compression) as fh:
-                assert_frame_equal(df, read_csv(fh, index_col=0))
+                assert_frame_equal(df, read_csv(fh,
+                                                index_col=0,
+                                                encoding=encoding))
 
     def test_to_csv_date_format(self):
         with ensure_clean('__tmp_to_csv_date_format__') as path:
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
@@ -138,29 +138,45 @@ def test_to_csv_path_is_none(self):
         csv_str = s.to_csv(path=None)
         assert isinstance(csv_str, str)
 
-    def test_to_csv_compression(self, compression):
-
-        s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
-                   name='X')
+    @pytest.mark.parametrize('s,encoding', [
+        (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
+                name='X'), None),
+        # GH 21241, 21118
+        (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
+        (Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
+        (Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')
+    ])
+    def test_to_csv_compression(self, s, encoding, compression):
 
         with ensure_clean() as filename:
 
-            s.to_csv(filename, compression=compression, header=True)
+            s.to_csv(filename, compression=compression, encoding=encoding,
+                     header=True)
 
             # test the round trip - to_csv -> read_csv
-            rs = pd.read_csv(filename, compression=compression,
-                             index_col=0, squeeze=True)
-            assert_series_equal(s, rs)
+            result = pd.read_csv(filename, compression=compression,
+                                 encoding=encoding, index_col=0, squeeze=True)
+
+            with open(filename, 'w') as fh:
+                s.to_csv(fh, compression=compression, encoding=encoding,
+                         header=True)
+
+            result_fh = pd.read_csv(filename, compression=compression,
+                                    encoding=encoding, index_col=0,
+                                    squeeze=True)
+            assert_series_equal(s, result)
+            assert_series_equal(s, result_fh)
 
             # explicitly ensure file was compressed
             with tm.decompress_file(filename, compression) as fh:
-                text = fh.read().decode('utf8')
+                text = fh.read().decode(encoding or 'utf8')
                 assert s.name in text
 
             with tm.decompress_file(filename, compression) as fh:
                 assert_series_equal(s, pd.read_csv(fh,
                                                    index_col=0,
-                                                   squeeze=True))
+                                                   squeeze=True,
+                                                   encoding=encoding))
 
 
 class TestSeriesIO(TestData):
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
@@ -241,3 +241,26 @@ def test_compression_size(obj, method, compression):
         getattr(obj, method)(filename, compression=None)
         uncompressed = os.path.getsize(filename)
         assert uncompressed > compressed
+
+
+@pytest.mark.parametrize('obj', [
+    DataFrame(100 * [[0.123456, 0.234567, 0.567567],
+                     [12.32112, 123123.2, 321321.2]],
+              columns=['X', 'Y', 'Z']),
+    Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
+@pytest.mark.parametrize('method', ['to_csv'])
+def test_compression_size_fh(obj, method, compression_only):
+
+    with tm.ensure_clean() as filename:
+        with open(filename, 'w') as fh:
+            getattr(obj, method)(fh, compression=compression_only)
+            assert not fh.closed
+        assert fh.closed
+        compressed = os.path.getsize(filename)
+    with tm.ensure_clean() as filename:
+        with open(filename, 'w') as fh:
+            getattr(obj, method)(fh, compression=None)
+            assert not fh.closed
+        assert fh.closed
+        uncompressed = os.path.getsize(filename)
+        assert uncompressed > compressed