pandas-dev · WillAyd · Jun 5, 2018 · May 31, 2018 · Jun 2, 2018 · Jun 2, 2018
diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
@@ -92,6 +92,7 @@ I/O
 
 - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
 - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
+- Bug in :meth:`DataFrame.to_csv` where using ``compression`` caused an encoding error (:issue:`21241`)
 -
 
 Plotting

diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
@@ -151,11 +151,14 @@ def save(self):
             self._save()
 
         finally:
-            # GH 17778 handles compression for byte strings.
+            # GH 17778: zip compression of byte strings is handled separately
+            # to support Python 2; this also allows a compression file handle
             if not close and self.compression:
                 f.close()
-                with open(f.name, 'r') as f:
+                with open(f.name, 'rb') as f:
                     data = f.read()
+                if not compat.PY2:
+                    data = data.decode(encoding)
                 f, handles = _get_handle(f.name, self.mode,
                                          encoding=encoding,
                                          compression=self.compression)

diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
@@ -919,29 +919,37 @@ def test_to_csv_path_is_none(self):
         recons = pd.read_csv(StringIO(csv_str), index_col=0)
         assert_frame_equal(self.frame, recons)
 
-    def test_to_csv_compression(self, compression):
-
-        df = DataFrame([[0.123456, 0.234567, 0.567567],
-                        [12.32112, 123123.2, 321321.2]],
-                       index=['A', 'B'], columns=['X', 'Y', 'Z'])
+    @pytest.mark.parametrize('frame, encoding', [
+        (DataFrame([[0.123456, 0.234567, 0.567567],
+                    [12.32112, 123123.2, 321321.2]],
+                   index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
+        (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
+        (DataFrame(5 * [[123, u"你好", u"世界"]],
+                   columns=['X', 'Y', 'Z']), 'gb2312'),
+        (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
+                   columns=['X', 'Y', 'Z']), 'cp737')
+    ])
+    def test_to_csv_compression(self, frame, encoding, compression):
 
         with ensure_clean() as filename:
 
-            df.to_csv(filename, compression=compression)
+            frame.to_csv(filename, compression=compression, encoding=encoding)
 
             # test the round trip - to_csv -> read_csv
             rs = read_csv(filename, compression=compression,
-                          index_col=0)
-            assert_frame_equal(df, rs)
+                          index_col=0, encoding=encoding)
+            assert_frame_equal(frame, rs)
 
             # explicitly make sure file is compressed
             with tm.decompress_file(filename, compression) as fh:
-                text = fh.read().decode('utf8')
-                for col in df.columns:
+                text = fh.read().decode(encoding or 'utf8')
+                for col in frame.columns:
                     assert col in text
 
             with tm.decompress_file(filename, compression) as fh:
-                assert_frame_equal(df, read_csv(fh, index_col=0))
+                assert_frame_equal(frame, read_csv(fh,
+                                                   index_col=0,
+                                                   encoding=encoding))
 
     def test_to_csv_date_format(self):
         with ensure_clean('__tmp_to_csv_date_format__') as path:

diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
@@ -137,29 +137,35 @@ def test_to_csv_path_is_none(self):
         csv_str = s.to_csv(path=None)
         assert isinstance(csv_str, str)
 
-    def test_to_csv_compression(self, compression):
-
-        s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
-                   name='X')
+    @pytest.mark.parametrize('s, encoding', [
+        (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
+                name='X'), None),
+        (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
+        (Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
+        (Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')
+    ])
+    def test_to_csv_compression(self, s, encoding, compression):
 
         with ensure_clean() as filename:
 
-            s.to_csv(filename, compression=compression, header=True)
+            s.to_csv(filename, compression=compression, encoding=encoding,
+                     header=True)
 
             # test the round trip - to_csv -> read_csv
             rs = pd.read_csv(filename, compression=compression,
-                             index_col=0, squeeze=True)
+                             encoding=encoding, index_col=0, squeeze=True)
             assert_series_equal(s, rs)
 
             # explicitly ensure file was compressed
             with tm.decompress_file(filename, compression) as fh:
-                text = fh.read().decode('utf8')
+                text = fh.read().decode(encoding or 'utf8')
                 assert s.name in text
 
             with tm.decompress_file(filename, compression) as fh:
                 assert_series_equal(s, pd.read_csv(fh,
                                                    index_col=0,
-                                                   squeeze=True))
+                                                   squeeze=True,
+                                                   encoding=encoding))
 
 
 class TestSeriesIO(TestData):