From cfb5e16fa783cdbc440a8e0290ee18ae1f34b057 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 9 Oct 2017 09:37:15 +0900 Subject: [PATCH 1/6] BUG: Fix default encoding for CSVFormatter.save --- pandas/io/formats/format.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 733fd3bd39b52..73837efd633fe 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -1573,12 +1573,20 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', def save(self): # create the writer & save + if self.encoding is None: + if compat.PY2: + encoding = 'ascii' + else: + encoding = 'utf-8' + else: + encoding = self.encoding + if hasattr(self.path_or_buf, 'write'): f = self.path_or_buf close = False else: f, handles = _get_handle(self.path_or_buf, self.mode, - encoding=self.encoding, + encoding=encoding, compression=self.compression) close = True @@ -1588,11 +1596,11 @@ def save(self): doublequote=self.doublequote, escapechar=self.escapechar, quotechar=self.quotechar) - if self.encoding is not None: - writer_kwargs['encoding'] = self.encoding - self.writer = UnicodeWriter(f, **writer_kwargs) - else: + if encoding == 'ascii': self.writer = csv.writer(f, **writer_kwargs) + else: + writer_kwargs['encoding'] = encoding + self.writer = UnicodeWriter(f, **writer_kwargs) self._save() From a632835bf309b46253c2bf1dece48ad7e5e192d4 Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 9 Oct 2017 10:02:26 +0900 Subject: [PATCH 2/6] TST: Add to_csv default encoding test --- pandas/tests/io/formats/test_to_csv.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index 1073fbcef5aec..b73638209dcae 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -1,3 +1,5 @@ +# -*- coding: utf-8 -*- + from pandas import DataFrame import numpy as np 
import pandas as pd @@ -6,6 +8,18 @@ class TestToCSV(object): + def test_to_csv_defualt_encoding(self): + # GH17097 + df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]}) + + with tm.ensure_clean('test.csv') as path: + if pd.compat.PY2: + with tm.assert_raises_regex(UnicodeEncodeError, 'ascii'): + df.to_csv(path) + else: + df.to_csv(path) + tm.assert_frame_equal(pd.read_csv(path, index_col=0), df) + def test_to_csv_quotechar(self): df = DataFrame({'col': [1, 2]}) expected = """\ From 0cd7bf7e5c0684f0274205f0a9084ca82a7a104e Mon Sep 17 00:00:00 2001 From: Licht-T Date: Mon, 9 Oct 2017 12:48:40 +0900 Subject: [PATCH 3/6] DOC: Add comments on to_csv default encoding test --- pandas/tests/io/formats/test_to_csv.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pandas/tests/io/formats/test_to_csv.py b/pandas/tests/io/formats/test_to_csv.py index b73638209dcae..b82d9895ddcf5 100644 --- a/pandas/tests/io/formats/test_to_csv.py +++ b/pandas/tests/io/formats/test_to_csv.py @@ -13,7 +13,10 @@ def test_to_csv_defualt_encoding(self): df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]}) with tm.ensure_clean('test.csv') as path: + # the default to_csv encoding in Python 2 is ascii, and that in + # Python 3 is utf-8. 
if pd.compat.PY2: + # the encoding argument should be set to utf-8 with tm.assert_raises_regex(UnicodeEncodeError, 'ascii'): df.to_csv(path) else: From 08df290182ac2706ff875068016320e1998dbf69 Mon Sep 17 00:00:00 2001 From: Tom Augspurger Date: Tue, 10 Oct 2017 10:32:32 -0500 Subject: [PATCH 4/6] DOC: added release note --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 8b2c4d16f4e1a..ad184d0a6a792 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -334,6 +334,7 @@ I/O - Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`). - Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`). - Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`) +- Bug in :meth:`DataFrame.to_csv` defaulting to 'ascii' encoding in Python 3, instead of 'utf-8' (:issue:`17097`) - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) From 039f2cf670c82b2a47e33c0b0387f3df359b4fd4 Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Wed, 11 Oct 2017 09:06:20 +0900 Subject: [PATCH 5/6] DOC: Add the fixing to_csv default encoding to whatsnew note --- doc/source/whatsnew/v0.21.0.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index ad184d0a6a792..3c0bae84d5683 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -337,6 +337,7 @@ I/O - Bug in :meth:`DataFrame.to_csv` defaulting to 'ascii' 
encoding in Python 3, instead of 'utf-8' (:issue:`17097`) - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) +- Bug in :func:`to_csv` where the default encoding is not set correctly (:issue:`17097`) Plotting ^^^^^^^^ From c222ee82eb60cab4c35b89f5dd5cb6c23c3119b7 Mon Sep 17 00:00:00 2001 From: Licht Takeuchi Date: Wed, 11 Oct 2017 09:08:46 +0900 Subject: [PATCH 6/6] Revert "DOC: Add the fixing to_csv default encoding to whatsnew note" This reverts commit 039f2cf670c82b2a47e33c0b0387f3df359b4fd4. --- doc/source/whatsnew/v0.21.0.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/doc/source/whatsnew/v0.21.0.txt b/doc/source/whatsnew/v0.21.0.txt index 3c0bae84d5683..ad184d0a6a792 100644 --- a/doc/source/whatsnew/v0.21.0.txt +++ b/doc/source/whatsnew/v0.21.0.txt @@ -337,7 +337,6 @@ I/O - Bug in :meth:`DataFrame.to_csv` defaulting to 'ascii' encoding in Python 3, instead of 'utf-8' (:issue:`17097`) - Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`) - Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`) -- Bug in :func:`to_csv` where the default encoding is not set correctly (:issue:`17097`) Plotting ^^^^^^^^