Skip to content

Commit 8350429

Browse files
minggli authored and TomAugspurger committed
BUG: Fix encoding error in to_csv compression (pandas-dev#21300)
(cherry picked from commit b32fdc4)
1 parent 9646587 commit 8350429

File tree

5 files changed

+104
-37
lines changed

5 files changed

+104
-37
lines changed

doc/source/whatsnew/v0.23.1.txt

+8
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,14 @@ I/O
121121
- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` which caused an encoding error when both compression and encoding were specified (:issue:`21241`, :issue:`21118`)
122122
- Bug in :meth:`read_stata` and :class:`StataReader` which did not correctly decode utf-8 strings on Python 3 from Stata 14 files (dta version 118) (:issue:`21244`)
123123
- Bug in IO JSON :func:`read_json` reading empty JSON schema with ``orient='table'`` back to :class:`DataFrame` caused an error (:issue:`21287`)
124+
-
125+
126+
Plotting
127+
^^^^^^^^
128+
129+
-
130+
-
131+
124132

125133
Reshaping
126134

pandas/io/formats/csvs.py

+20-16
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
import numpy as np
1010

1111
from pandas.core.dtypes.missing import notna
12+
from pandas.core.dtypes.inference import is_file_like
1213
from pandas.core.index import Index, MultiIndex
1314
from pandas import compat
1415
from pandas.compat import (StringIO, range, zip)
@@ -127,14 +128,19 @@ def save(self):
127128
else:
128129
encoding = self.encoding
129130

130-
if hasattr(self.path_or_buf, 'write'):
131-
f = self.path_or_buf
132-
close = False
131+
# PR 21300: write the csv into a string buffer first, then dump it into
132+
# the file-like output, optionally with compression. GH 21241, 21118
133+
f = StringIO()
134+
if not is_file_like(self.path_or_buf):
135+
# path_or_buf is a path
136+
path_or_buf = self.path_or_buf
137+
elif hasattr(self.path_or_buf, 'name'):
138+
# path_or_buf is a file handle
139+
path_or_buf = self.path_or_buf.name
133140
else:
134-
f, handles = _get_handle(self.path_or_buf, self.mode,
135-
encoding=encoding,
136-
compression=None)
137-
close = True if self.compression is None else False
141+
# path_or_buf is a file-like IO object.
142+
f = self.path_or_buf
143+
path_or_buf = None
138144

139145
try:
140146
writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,18 +157,16 @@ def save(self):
151157
self._save()
152158

153159
finally:
154-
# GH 17778 handles compression for byte strings.
155-
if not close and self.compression:
156-
f.close()
157-
with open(self.path_or_buf, 'r') as f:
158-
data = f.read()
159-
f, handles = _get_handle(self.path_or_buf, self.mode,
160+
# GH 17778 handles zip compression for byte strings separately.
161+
buf = f.getvalue()
162+
if path_or_buf:
163+
f, handles = _get_handle(path_or_buf, self.mode,
160164
encoding=encoding,
161165
compression=self.compression)
162-
f.write(data)
163-
close = True
164-
if close:
166+
f.write(buf)
165167
f.close()
168+
for _fh in handles:
169+
_fh.close()
166170

167171
def _save_header(self):
168172

pandas/tests/frame/test_to_csv.py

+27-11
Original file line numberDiff line numberDiff line change
@@ -919,29 +919,45 @@ def test_to_csv_path_is_none(self):
919919
recons = pd.read_csv(StringIO(csv_str), index_col=0)
920920
assert_frame_equal(self.frame, recons)
921921

922-
def test_to_csv_compression(self, compression):
923-
924-
df = DataFrame([[0.123456, 0.234567, 0.567567],
925-
[12.32112, 123123.2, 321321.2]],
926-
index=['A', 'B'], columns=['X', 'Y', 'Z'])
922+
@pytest.mark.parametrize('df,encoding', [
923+
(DataFrame([[0.123456, 0.234567, 0.567567],
924+
[12.32112, 123123.2, 321321.2]],
925+
index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
926+
# GH 21241, 21118
927+
(DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
928+
(DataFrame(5 * [[123, u"你好", u"世界"]],
929+
columns=['X', 'Y', 'Z']), 'gb2312'),
930+
(DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
931+
columns=['X', 'Y', 'Z']), 'cp737')
932+
])
933+
def test_to_csv_compression(self, df, encoding, compression):
927934

928935
with ensure_clean() as filename:
929936

930-
df.to_csv(filename, compression=compression)
937+
df.to_csv(filename, compression=compression, encoding=encoding)
931938

932939
# test the round trip - to_csv -> read_csv
933-
rs = read_csv(filename, compression=compression,
934-
index_col=0)
935-
assert_frame_equal(df, rs)
940+
result = read_csv(filename, compression=compression,
941+
index_col=0, encoding=encoding)
942+
943+
with open(filename, 'w') as fh:
944+
df.to_csv(fh, compression=compression, encoding=encoding)
945+
946+
result_fh = read_csv(filename, compression=compression,
947+
index_col=0, encoding=encoding)
948+
assert_frame_equal(df, result)
949+
assert_frame_equal(df, result_fh)
936950

937951
# explicitly make sure file is compressed
938952
with tm.decompress_file(filename, compression) as fh:
939-
text = fh.read().decode('utf8')
953+
text = fh.read().decode(encoding or 'utf8')
940954
for col in df.columns:
941955
assert col in text
942956

943957
with tm.decompress_file(filename, compression) as fh:
944-
assert_frame_equal(df, read_csv(fh, index_col=0))
958+
assert_frame_equal(df, read_csv(fh,
959+
index_col=0,
960+
encoding=encoding))
945961

946962
def test_to_csv_date_format(self):
947963
with ensure_clean('__tmp_to_csv_date_format__') as path:

pandas/tests/series/test_io.py

+26-10
Original file line numberDiff line numberDiff line change
@@ -138,29 +138,45 @@ def test_to_csv_path_is_none(self):
138138
csv_str = s.to_csv(path=None)
139139
assert isinstance(csv_str, str)
140140

141-
def test_to_csv_compression(self, compression):
142-
143-
s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
144-
name='X')
141+
@pytest.mark.parametrize('s,encoding', [
142+
(Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
143+
name='X'), None),
144+
# GH 21241, 21118
145+
(Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
146+
(Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
147+
(Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')
148+
])
149+
def test_to_csv_compression(self, s, encoding, compression):
145150

146151
with ensure_clean() as filename:
147152

148-
s.to_csv(filename, compression=compression, header=True)
153+
s.to_csv(filename, compression=compression, encoding=encoding,
154+
header=True)
149155

150156
# test the round trip - to_csv -> read_csv
151-
rs = pd.read_csv(filename, compression=compression,
152-
index_col=0, squeeze=True)
153-
assert_series_equal(s, rs)
157+
result = pd.read_csv(filename, compression=compression,
158+
encoding=encoding, index_col=0, squeeze=True)
159+
160+
with open(filename, 'w') as fh:
161+
s.to_csv(fh, compression=compression, encoding=encoding,
162+
header=True)
163+
164+
result_fh = pd.read_csv(filename, compression=compression,
165+
encoding=encoding, index_col=0,
166+
squeeze=True)
167+
assert_series_equal(s, result)
168+
assert_series_equal(s, result_fh)
154169

155170
# explicitly ensure file was compressed
156171
with tm.decompress_file(filename, compression) as fh:
157-
text = fh.read().decode('utf8')
172+
text = fh.read().decode(encoding or 'utf8')
158173
assert s.name in text
159174

160175
with tm.decompress_file(filename, compression) as fh:
161176
assert_series_equal(s, pd.read_csv(fh,
162177
index_col=0,
163-
squeeze=True))
178+
squeeze=True,
179+
encoding=encoding))
164180

165181

166182
class TestSeriesIO(TestData):

pandas/tests/test_common.py

+23
Original file line numberDiff line numberDiff line change
@@ -241,3 +241,26 @@ def test_compression_size(obj, method, compression):
241241
getattr(obj, method)(filename, compression=None)
242242
uncompressed = os.path.getsize(filename)
243243
assert uncompressed > compressed
244+
245+
246+
@pytest.mark.parametrize('obj', [
247+
DataFrame(100 * [[0.123456, 0.234567, 0.567567],
248+
[12.32112, 123123.2, 321321.2]],
249+
columns=['X', 'Y', 'Z']),
250+
Series(100 * [0.123456, 0.234567, 0.567567], name='X')])
251+
@pytest.mark.parametrize('method', ['to_csv'])
252+
def test_compression_size_fh(obj, method, compression_only):
253+
254+
with tm.ensure_clean() as filename:
255+
with open(filename, 'w') as fh:
256+
getattr(obj, method)(fh, compression=compression_only)
257+
assert not fh.closed
258+
assert fh.closed
259+
compressed = os.path.getsize(filename)
260+
with tm.ensure_clean() as filename:
261+
with open(filename, 'w') as fh:
262+
getattr(obj, method)(fh, compression=None)
263+
assert not fh.closed
264+
assert fh.closed
265+
uncompressed = os.path.getsize(filename)
266+
assert uncompressed > compressed

0 commit comments

Comments
 (0)