diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 8b2c4d16f4e1a..ad184d0a6a792 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -334,6 +334,7 @@ I/O - Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`). - Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`). - Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`) +- Bug in :meth:`DataFrame.to_csv` defaulting to 'ascii' encoding in Python 3, instead of 'utf-8' (:issue:`17097`) - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 733fd3bd39b52..73837efd633fe 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1573,12 +1573,20 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', def save(self): # create the writer & save + if self.encoding is None: + if compat.PY2: + encoding = 'ascii' + else: + encoding = 'utf-8' + else: + encoding = self.encoding + if hasattr(self.path_or_buf, 'write'): f = self.path_or_buf close = False else: f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=self.encoding, + encoding=encoding, compression=self.compression) close = True @@ -1588,11 +1596,11 @@ def save(self): doublequote=self.doublequote, escapechar=self.escapechar, quotechar=self.quotechar) - if self.encoding is not None: - writer_kwargs['encoding'] = self.encoding - self.writer = UnicodeWriter(f, **writer_kwargs) - else: + if encoding == 'ascii': 
self.writer = csv.writer(f, **writer_kwargs) + else: + writer_kwargs['encoding'] = encoding + self.writer = UnicodeWriter(f, **writer_kwargs) self._save() diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 1073fbcef5aec..b82d9895ddcf5 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + from pandas import DataFrame import numpy as np import pandas as pd @@ -6,6 +8,21 @@ class TestToCSV(object): + def test_to_csv_defualt_encoding(self): + # GH17097 + df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]}) + + with tm.ensure_clean('test.csv') as path: + # the default to_csv encoding in Python 2 is ascii, and that in + # Python 3 is utf-8. + if pd.compat.PY2: + # the encoding argument parameter should be utf-8 + with tm.assert_raises_regex(UnicodeEncodeError, 'ascii'): + df.to_csv(path) + else: + df.to_csv(path) + tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) + def test_to_csv_quotechar(self): df = DataFrame({'col': [1, 2]}) expected = """\