Skip to content

Commit c0fc368

Browse files
committed
BUG: closes #705, csv is encoded utf-8 and then decoded on the read side
1 parent 163d8b4 commit c0fc368

File tree

4 files changed

+60
-6
lines changed

4 files changed

+60
-6
lines changed

pandas/core/common.py

+39
Original file line number | Diff line number | Diff line change
@@ -21,6 +21,8 @@
2121

2222
import pandas._tseries as lib
2323
from pandas.util import py3compat
24+
import codecs
25+
import csv
2426

2527
# XXX: HACK for NumPy 1.5.1 to suppress warnings
2628
try:
@@ -828,3 +830,40 @@ def console_encode(value):
828830
return value.encode(sys.stdin.encoding, 'replace')
829831
except (AttributeError, TypeError):
830832
return value.encode('ascii', 'replace')
833+
834+
def csv_encode(value):
    """
    Encode a value so it can be handed to a ``csv`` writer.

    ``unicode`` objects are encoded to UTF-8 byte strings (using the
    'replace' error handler); every other value is returned unchanged.
    When ``py3compat.PY3`` is true the value is always returned unchanged.
    """
    # The PY3 check comes first so that it short-circuits before the
    # Python 2 only name ``unicode`` is looked up.
    if py3compat.PY3 or not isinstance(value, unicode):
        return value

    return value.encode('UTF-8', 'replace')
839+
840+
class UTF8Recoder:
841+
"""
842+
Iterator that reads an encoded stream and reencodes the input to UTF-8
843+
"""
844+
def __init__(self, f, encoding):
845+
self.reader = codecs.getreader(encoding)(f)
846+
847+
def __iter__(self):
848+
return self
849+
850+
def next(self):
851+
return self.reader.next().encode("utf-8")
852+
853+
class UnicodeReader(object):
    """
    A CSV reader which will iterate over lines in the CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Recode the input to UTF-8 first so that csv.reader always receives
        # UTF-8 encoded lines, whatever ``encoding`` the source uses.
        f = UTF8Recoder(f, encoding)
        self.reader = csv.reader(f, dialect=dialect, **kwds)

    def next(self):
        # Builtin next() rather than ``self.reader.next()`` so the call does
        # not depend on how the csv reader spells its iterator method.
        row = next(self.reader)
        # Decode every field of the parsed row from UTF-8.
        return [unicode(s, "utf-8") for s in row]

    def __iter__(self):
        return self
869+

pandas/core/frame.py

+9-4
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@
2323
import numpy.ma as ma
2424

2525
from pandas.core.common import (isnull, notnull, PandasError, _try_sort,
26-
_default_index, _stringify)
26+
_default_index, _stringify, csv_encode)
2727
from pandas.core.daterange import DateRange
2828
from pandas.core.generic import NDFrame
2929
from pandas.core.index import Index, MultiIndex, NULL_INDEX, _ensure_index
@@ -890,9 +890,13 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
890890
elif not isinstance(index_label, (list, tuple, np.ndarray)):
891891
# given a string for a DF with Index
892892
index_label = [index_label]
893-
csvout.writerow(list(index_label) + list(cols))
893+
894+
encoded_labels = [csv_encode(val) for val in index_label]
895+
encoded_cols = [csv_encode(val) for val in cols]
896+
csvout.writerow(encoded_labels + encoded_cols)
894897
else:
895-
csvout.writerow(cols)
898+
encoded_cols = [csv_encode(val) for val in cols]
899+
csvout.writerow(encoded_cols)
896900

897901
nlevels = getattr(self.index, 'nlevels', 1)
898902
for idx in self.index:
@@ -909,7 +913,8 @@ def to_csv(self, path, sep=",", na_rep='', cols=None, header=True,
909913

910914
row_fields.append(val)
911915

912-
csvout.writerow(row_fields)
916+
encoded_rows = [csv_encode(val) for val in row_fields]
917+
csvout.writerow(encoded_rows)
913918

914919
f.close()
915920

pandas/io/parsers.py

+3-2
Original file line number | Diff line number | Diff line change
@@ -261,8 +261,9 @@ def _make_reader(self, f):
261261
self.pos += 1
262262
sniffed = csv.Sniffer().sniff(line)
263263
dia.delimiter = sniffed.delimiter
264-
self.buf.extend(list(csv.reader(StringIO(line), dialect=dia)))
265-
reader = csv.reader(f, dialect=dia)
264+
self.buf.extend(list(com.UnicodeReader(StringIO(line),
265+
dialect=dia)))
266+
reader = com.UnicodeReader(f, dialect=dia)
266267
else:
267268
reader = (re.split(sep, line.strip()) for line in f)
268269

pandas/tests/test_frame.py

+9
Original file line number | Diff line number | Diff line change
@@ -2482,6 +2482,15 @@ def test_to_csv_bug(self):
24822482

24832483
os.remove(path)
24842484

2485+
def test_to_csv_unicode(self):
    """
    Write a frame whose column label contains a non-ASCII character to CSV,
    read it back, and check that the result equals the original frame.
    """
    from pandas import read_csv
    path = '__tmp__.csv'
    df = DataFrame({u'c/\u03c3': [1, 2, 3]})
    try:
        df.to_csv(path)
        df2 = read_csv(path, index_col=0)
        assert_frame_equal(df, df2)
    finally:
        # Remove the scratch file even when the round trip or the comparison
        # fails; the existence check avoids masking an earlier error if the
        # file was never created.
        if os.path.exists(path):
            os.remove(path)
2493+
24852494
def test_info(self):
24862495
io = StringIO()
24872496
self.frame.info(buf=io)

0 commit comments

Comments
 (0)