Skip to content

Commit b32fdc4

Browse files
minggli authored and WillAyd committed
BUG: Fix encoding error in to_csv compression (#21300)
1 parent 9f95f7d commit b32fdc4

File tree

5 files changed

+77
-39
lines changed

5 files changed

+77
-39
lines changed

doc/source/whatsnew/v0.23.1.txt

+1
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ I/O
9494

9595
- Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
9696
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
97+
- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` which caused an encoding error when both compression and encoding were specified (:issue:`21241`, :issue:`21118`)
9798
-
9899

99100
Plotting

pandas/io/formats/csvs.py

+20-16
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import numpy as np
1010

1111
from pandas.core.dtypes.missing import notna
12+
from pandas.core.dtypes.inference import is_file_like
1213
from pandas.core.index import Index, MultiIndex
1314
from pandas import compat
1415
from pandas.compat import (StringIO, range, zip)
@@ -127,14 +128,19 @@ def save(self):
127128
else:
128129
encoding = self.encoding
129130

130-
if hasattr(self.path_or_buf, 'write'):
131-
f = self.path_or_buf
132-
close = False
131+
# PR 21300 writes the csv into a string buffer first, then dumps it into
132+
# the file-like output, optionally with compression. GH 21241, 21118
133+
f = StringIO()
134+
if not is_file_like(self.path_or_buf):
135+
# path_or_buf is path
136+
path_or_buf = self.path_or_buf
137+
elif hasattr(self.path_or_buf, 'name'):
138+
# path_or_buf is file handle
139+
path_or_buf = self.path_or_buf.name
133140
else:
134-
f, handles = _get_handle(self.path_or_buf, self.mode,
135-
encoding=encoding,
136-
compression=None)
137-
close = True if self.compression is None else False
141+
# path_or_buf is a file-like IO object.
142+
f = self.path_or_buf
143+
path_or_buf = None
138144

139145
try:
140146
writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,18 +157,16 @@ def save(self):
151157
self._save()
152158

153159
finally:
154-
# GH 17778 handles compression for byte strings.
155-
if not close and self.compression:
156-
f.close()
157-
with open(f.name, 'r') as f:
158-
data = f.read()
159-
f, handles = _get_handle(f.name, self.mode,
160+
# GH 17778 handles zip compression for byte strings separately.
161+
buf = f.getvalue()
162+
if path_or_buf:
163+
f, handles = _get_handle(path_or_buf, self.mode,
160164
encoding=encoding,
161165
compression=self.compression)
162-
f.write(data)
163-
close = True
164-
if close:
166+
f.write(buf)
165167
f.close()
168+
for _fh in handles:
169+
_fh.close()
166170

167171
def _save_header(self):
168172

pandas/tests/frame/test_to_csv.py

+27-11
Original file line numberDiff line numberDiff line change
@@ -919,29 +919,45 @@ def test_to_csv_path_is_none(self):
919919
recons = pd.read_csv(StringIO(csv_str), index_col=0)
920920
assert_frame_equal(self.frame, recons)
921921

922-
def test_to_csv_compression(self, compression):
923-
924-
df = DataFrame([[0.123456, 0.234567, 0.567567],
925-
[12.32112, 123123.2, 321321.2]],
926-
index=['A', 'B'], columns=['X', 'Y', 'Z'])
922+
@pytest.mark.parametrize('df,encoding', [
923+
(DataFrame([[0.123456, 0.234567, 0.567567],
924+
[12.32112, 123123.2, 321321.2]],
925+
index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
926+
# GH 21241, 21118
927+
(DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
928+
(DataFrame(5 * [[123, u"你好", u"世界"]],
929+
columns=['X', 'Y', 'Z']), 'gb2312'),
930+
(DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
931+
columns=['X', 'Y', 'Z']), 'cp737')
932+
])
933+
def test_to_csv_compression(self, df, encoding, compression):
927934

928935
with ensure_clean() as filename:
929936

930-
df.to_csv(filename, compression=compression)
937+
df.to_csv(filename, compression=compression, encoding=encoding)
931938

932939
# test the round trip - to_csv -> read_csv
933-
rs = read_csv(filename, compression=compression,
934-
index_col=0)
935-
assert_frame_equal(df, rs)
940+
result = read_csv(filename, compression=compression,
941+
index_col=0, encoding=encoding)
942+
943+
with open(filename, 'w') as fh:
944+
df.to_csv(fh, compression=compression, encoding=encoding)
945+
946+
result_fh = read_csv(filename, compression=compression,
947+
index_col=0, encoding=encoding)
948+
assert_frame_equal(df, result)
949+
assert_frame_equal(df, result_fh)
936950

937951
# explicitly make sure file is compressed
938952
with tm.decompress_file(filename, compression) as fh:
939-
text = fh.read().decode('utf8')
953+
text = fh.read().decode(encoding or 'utf8')
940954
for col in df.columns:
941955
assert col in text
942956

943957
with tm.decompress_file(filename, compression) as fh:
944-
assert_frame_equal(df, read_csv(fh, index_col=0))
958+
assert_frame_equal(df, read_csv(fh,
959+
index_col=0,
960+
encoding=encoding))
945961

946962
def test_to_csv_date_format(self):
947963
with ensure_clean('__tmp_to_csv_date_format__') as path:

pandas/tests/series/test_io.py

+26-10
Original file line numberDiff line numberDiff line change
@@ -137,29 +137,45 @@ def test_to_csv_path_is_none(self):
137137
csv_str = s.to_csv(path=None)
138138
assert isinstance(csv_str, str)
139139

140-
def test_to_csv_compression(self, compression):
141-
142-
s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
143-
name='X')
140+
@pytest.mark.parametrize('s,encoding', [
141+
(Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
142+
name='X'), None),
143+
# GH 21241, 21118
144+
(Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
145+
(Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
146+
(Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')
147+
])
148+
def test_to_csv_compression(self, s, encoding, compression):
144149

145150
with ensure_clean() as filename:
146151

147-
s.to_csv(filename, compression=compression, header=True)
152+
s.to_csv(filename, compression=compression, encoding=encoding,
153+
header=True)
148154

149155
# test the round trip - to_csv -> read_csv
150-
rs = pd.read_csv(filename, compression=compression,
151-
index_col=0, squeeze=True)
152-
assert_series_equal(s, rs)
156+
result = pd.read_csv(filename, compression=compression,
157+
encoding=encoding, index_col=0, squeeze=True)
158+
159+
with open(filename, 'w') as fh:
160+
s.to_csv(fh, compression=compression, encoding=encoding,
161+
header=True)
162+
163+
result_fh = pd.read_csv(filename, compression=compression,
164+
encoding=encoding, index_col=0,
165+
squeeze=True)
166+
assert_series_equal(s, result)
167+
assert_series_equal(s, result_fh)
153168

154169
# explicitly ensure file was compressed
155170
with tm.decompress_file(filename, compression) as fh:
156-
text = fh.read().decode('utf8')
171+
text = fh.read().decode(encoding or 'utf8')
157172
assert s.name in text
158173

159174
with tm.decompress_file(filename, compression) as fh:
160175
assert_series_equal(s, pd.read_csv(fh,
161176
index_col=0,
162-
squeeze=True))
177+
squeeze=True,
178+
encoding=encoding))
163179

164180

165181
class TestSeriesIO(TestData):

pandas/tests/test_common.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -252,12 +252,13 @@ def test_compression_size_fh(obj, method, compression_only):
252252
with tm.ensure_clean() as filename:
253253
with open(filename, 'w') as fh:
254254
getattr(obj, method)(fh, compression=compression_only)
255-
# GH 17778
256-
assert fh.closed
255+
assert not fh.closed
256+
assert fh.closed
257257
compressed = os.path.getsize(filename)
258258
with tm.ensure_clean() as filename:
259259
with open(filename, 'w') as fh:
260260
getattr(obj, method)(fh, compression=None)
261261
assert not fh.closed
262+
assert fh.closed
262263
uncompressed = os.path.getsize(filename)
263264
assert uncompressed > compressed

0 commit comments

Comments
 (0)