BUG: Fix default encoding for CSVFormatter.save (pandas-dev#17821)

Licht-T · TomAugspurger · commit 9772c2288331 · 2017-10-11T10:19:56.000-05:00
* BUG: Fix default encoding for CSVFormatter.save * TST: Add to_csv defualt encoding test * DOC: Add comments on to_csv defualt encoding test * DOC: added release note * DOC: Add the fixing to_csv default encoding to whatsnew note * Revert "DOC: Add the fixing to_csv default encoding to whatsnew note" This reverts commit 039f2cf.
diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt
@@ -946,6 +946,7 @@ I/O
 - Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
 - Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
 - Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`)
+- Bug in :meth:`DataFrame.to_csv` defaulting to 'ascii' encoding in Python 3, instead of 'utf-8' (:issue:`17097`)
 - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
 - Bug in :func:`read_stata` where the index was not set (:issue:`16342`)
 - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`)
diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py
@@ -1612,12 +1612,20 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
 
     def save(self):
         # create the writer & save
+        if self.encoding is None:
+            if compat.PY2:
+                encoding = 'ascii'
+            else:
+                encoding = 'utf-8'
+        else:
+            encoding = self.encoding
+
         if hasattr(self.path_or_buf, 'write'):
             f = self.path_or_buf
             close = False
         else:
             f, handles = _get_handle(self.path_or_buf, self.mode,
-                                     encoding=self.encoding,
+                                     encoding=encoding,
                                      compression=self.compression)
             close = True
 
@@ -1627,11 +1635,11 @@ def save(self):
                                  doublequote=self.doublequote,
                                  escapechar=self.escapechar,
                                  quotechar=self.quotechar)
-            if self.encoding is not None:
-                writer_kwargs['encoding'] = self.encoding
-                self.writer = UnicodeWriter(f, **writer_kwargs)
-            else:
+            if encoding == 'ascii':
                 self.writer = csv.writer(f, **writer_kwargs)
+            else:
+                writer_kwargs['encoding'] = encoding
+                self.writer = UnicodeWriter(f, **writer_kwargs)
 
             self._save()
 
diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py
@@ -1,3 +1,5 @@
+# -*- coding: utf-8 -*-
+
 from pandas import DataFrame
 import numpy as np
 import pandas as pd
@@ -6,6 +8,21 @@
 
 class TestToCSV(object):
 
+    def test_to_csv_defualt_encoding(self):
+        # GH17097
+        df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]})
+
+        with tm.ensure_clean('test.csv') as path:
+            # the default to_csv encoding in Python 2 is ascii, and that in
+            # Python 3 is uft-8.
+            if pd.compat.PY2:
+                # the encoding argument parameter should be utf-8
+                with tm.assert_raises_regex(UnicodeEncodeError, 'ascii'):
+                    df.to_csv(path)
+            else:
+                df.to_csv(path)
+                tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)
+
     def test_to_csv_quotechar(self):
         df = DataFrame({'col': [1, 2]})
         expected = """\