Skip to content

Commit 9772c22

Browse files
Licht-T authored and TomAugspurger committed
BUG: Fix default encoding for CSVFormatter.save (pandas-dev#17821)
* BUG: Fix default encoding for CSVFormatter.save
* TST: Add to_csv default encoding test
* DOC: Add comments on to_csv default encoding test
* DOC: added release note
* DOC: Add the fixing to_csv default encoding to whatsnew note
* Revert "DOC: Add the fixing to_csv default encoding to whatsnew note"

This reverts commit 039f2cf.
1 parent 3544394 commit 9772c22

File tree

3 files changed

+31
-5
lines changed

3 files changed

+31
-5
lines changed

doc/source/whatsnew/v0.21.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -946,6 +946,7 @@ I/O
946946
- Bug in :func:`read_csv` in which memory management issues in exception handling, under certain conditions, would cause the interpreter to segfault (:issue:`14696`, :issue:`16798`).
947947
- Bug in :func:`read_csv` when called with ``low_memory=False`` in which a CSV with at least one column > 2GB in size would incorrectly raise a ``MemoryError`` (:issue:`16798`).
948948
- Bug in :func:`read_csv` when called with a single-element list ``header`` would return a ``DataFrame`` of all NaN values (:issue:`7757`)
949+
- Bug in :meth:`DataFrame.to_csv` defaulting to 'ascii' encoding in Python 3, instead of 'utf-8' (:issue:`17097`)
949950
- Bug in :func:`read_stata` where value labels could not be read when using an iterator (:issue:`16923`)
950951
- Bug in :func:`read_stata` where the index was not set (:issue:`16342`)
951952
- Bug in :func:`read_html` where import check fails when run in multiple threads (:issue:`16928`)

pandas/io/formats/format.py

+13-5
Original file line numberDiff line numberDiff line change
@@ -1612,12 +1612,20 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='',
16121612

16131613
def save(self):
16141614
# create the writer & save
1615+
if self.encoding is None:
1616+
if compat.PY2:
1617+
encoding = 'ascii'
1618+
else:
1619+
encoding = 'utf-8'
1620+
else:
1621+
encoding = self.encoding
1622+
16151623
if hasattr(self.path_or_buf, 'write'):
16161624
f = self.path_or_buf
16171625
close = False
16181626
else:
16191627
f, handles = _get_handle(self.path_or_buf, self.mode,
1620-
encoding=self.encoding,
1628+
encoding=encoding,
16211629
compression=self.compression)
16221630
close = True
16231631

@@ -1627,11 +1635,11 @@ def save(self):
16271635
doublequote=self.doublequote,
16281636
escapechar=self.escapechar,
16291637
quotechar=self.quotechar)
1630-
if self.encoding is not None:
1631-
writer_kwargs['encoding'] = self.encoding
1632-
self.writer = UnicodeWriter(f, **writer_kwargs)
1633-
else:
1638+
if encoding == 'ascii':
16341639
self.writer = csv.writer(f, **writer_kwargs)
1640+
else:
1641+
writer_kwargs['encoding'] = encoding
1642+
self.writer = UnicodeWriter(f, **writer_kwargs)
16351643

16361644
self._save()
16371645

pandas/tests/io/formats/test_to_csv.py

+17
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
# -*- coding: utf-8 -*-
2+
13
from pandas import DataFrame
24
import numpy as np
35
import pandas as pd
@@ -6,6 +8,21 @@
68

79
class TestToCSV(object):
810

11+
def test_to_csv_defualt_encoding(self):
12+
# GH17097
13+
df = DataFrame({'col': [u"AAAAA", u"ÄÄÄÄÄ", u"ßßßßß", u"聞聞聞聞聞"]})
14+
15+
with tm.ensure_clean('test.csv') as path:
16+
# the default to_csv encoding in Python 2 is ascii, and that in
17+
# Python 3 is utf-8.
18+
if pd.compat.PY2:
19+
# the encoding argument parameter should be utf-8
20+
with tm.assert_raises_regex(UnicodeEncodeError, 'ascii'):
21+
df.to_csv(path)
22+
else:
23+
df.to_csv(path)
24+
tm.assert_frame_equal(pd.read_csv(path, index_col=0), df)
25+
926
def test_to_csv_quotechar(self):
1027
df = DataFrame({'col': [1, 2]})
1128
expected = """\

0 commit comments

Comments
 (0)