diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt
index f2bc81eea186b..db4f4acc7ee16 100644
--- a/doc/source/whatsnew/v0.23.1.txt
+++ b/doc/source/whatsnew/v0.23.1.txt
@@ -92,6 +92,7 @@ I/O
 
 - Bug in IO methods specifying ``compression='zip'`` which produced uncompressed zip archives (:issue:`17778`, :issue:`21144`)
 - Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
+- Bug in :meth:`DataFrame.to_csv` and :meth:`Series.to_csv` which caused an encoding error when compression and encoding were both specified (:issue:`21241`, :issue:`21118`)
 -
 
 Plotting
diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py
index 0be2a180fbfa2..7f660e2644fa4 100644
--- a/pandas/io/formats/csvs.py
+++ b/pandas/io/formats/csvs.py
@@ -9,6 +9,7 @@
 import numpy as np
 
 from pandas.core.dtypes.missing import notna
+from pandas.core.dtypes.inference import is_file_like
 from pandas.core.index import Index, MultiIndex
 from pandas import compat
 from pandas.compat import (StringIO, range, zip)
@@ -127,14 +128,19 @@ def save(self):
         else:
             encoding = self.encoding
 
-        if hasattr(self.path_or_buf, 'write'):
-            f = self.path_or_buf
-            close = False
+        # GH 21241, 21118: write the csv output to a string buffer, then
+        # dump it to the output target with optional compression (PR 21300).
+        f = StringIO()
+        if not is_file_like(self.path_or_buf):
+            # path_or_buf is a path
+            path_or_buf = self.path_or_buf
+        elif hasattr(self.path_or_buf, 'name'):
+            # path_or_buf is a file handle
+            path_or_buf = self.path_or_buf.name
         else:
-            f, handles = _get_handle(self.path_or_buf, self.mode,
-                                     encoding=encoding,
-                                     compression=None)
-            close = True if self.compression is None else False
+            # path_or_buf is a file-like IO object
+            f = self.path_or_buf
+            path_or_buf = None
 
         try:
             writer_kwargs = dict(lineterminator=self.line_terminator,
@@ -151,18 +157,16 @@
 
             self._save()
 
         finally:
-            # GH 17778 handles compression for byte strings.
-            if not close and self.compression:
-                f.close()
-                with open(f.name, 'r') as f:
-                    data = f.read()
-                f, handles = _get_handle(f.name, self.mode,
+            # GH 17778 handles zip compression for byte strings separately.
+            buf = f.getvalue()
+            if path_or_buf:
+                f, handles = _get_handle(path_or_buf, self.mode,
                                          encoding=encoding,
                                          compression=self.compression)
-                f.write(data)
-                close = True
-            if close:
+                f.write(buf)
                 f.close()
+                for _fh in handles:
+                    _fh.close()
 
     def _save_header(self):
diff --git a/pandas/tests/frame/test_to_csv.py b/pandas/tests/frame/test_to_csv.py
index e4829ebf48561..60dc336a85388 100644
--- a/pandas/tests/frame/test_to_csv.py
+++ b/pandas/tests/frame/test_to_csv.py
@@ -919,29 +919,45 @@ def test_to_csv_path_is_none(self):
         recons = pd.read_csv(StringIO(csv_str), index_col=0)
         assert_frame_equal(self.frame, recons)
 
-    def test_to_csv_compression(self, compression):
-
-        df = DataFrame([[0.123456, 0.234567, 0.567567],
-                        [12.32112, 123123.2, 321321.2]],
-                       index=['A', 'B'], columns=['X', 'Y', 'Z'])
+    @pytest.mark.parametrize('df,encoding', [
+        (DataFrame([[0.123456, 0.234567, 0.567567],
+                    [12.32112, 123123.2, 321321.2]],
+                   index=['A', 'B'], columns=['X', 'Y', 'Z']), None),
+        # GH 21241, 21118
+        (DataFrame([['abc', 'def', 'ghi']], columns=['X', 'Y', 'Z']), 'ascii'),
+        (DataFrame(5 * [[123, u"你好", u"世界"]],
+                   columns=['X', 'Y', 'Z']), 'gb2312'),
+        (DataFrame(5 * [[123, u"Γειά σου", u"Κόσμε"]],
+                   columns=['X', 'Y', 'Z']), 'cp737')
+    ])
+    def test_to_csv_compression(self, df, encoding, compression):
 
         with ensure_clean() as filename:
 
-            df.to_csv(filename, compression=compression)
+            df.to_csv(filename, compression=compression, encoding=encoding)
 
             # test the round trip - to_csv -> read_csv
-            rs = read_csv(filename, compression=compression,
-                          index_col=0)
-            assert_frame_equal(df, rs)
+            result = read_csv(filename, compression=compression,
+                              index_col=0, encoding=encoding)
+
+            with open(filename, 'w') as fh:
+                df.to_csv(fh, compression=compression, encoding=encoding)
+
+            result_fh = read_csv(filename, compression=compression,
+                                 index_col=0, encoding=encoding)
+            assert_frame_equal(df, result)
+            assert_frame_equal(df, result_fh)
 
             # explicitly make sure file is compressed
             with tm.decompress_file(filename, compression) as fh:
-                text = fh.read().decode('utf8')
+                text = fh.read().decode(encoding or 'utf8')
                 for col in df.columns:
                     assert col in text
 
             with tm.decompress_file(filename, compression) as fh:
-                assert_frame_equal(df, read_csv(fh, index_col=0))
+                assert_frame_equal(df, read_csv(fh,
+                                                index_col=0,
+                                                encoding=encoding))
 
     def test_to_csv_date_format(self):
         with ensure_clean('__tmp_to_csv_date_format__') as path:
diff --git a/pandas/tests/series/test_io.py b/pandas/tests/series/test_io.py
index e369dfda6deac..f98962685ad9a 100644
--- a/pandas/tests/series/test_io.py
+++ b/pandas/tests/series/test_io.py
@@ -137,29 +137,45 @@ def test_to_csv_path_is_none(self):
         csv_str = s.to_csv(path=None)
         assert isinstance(csv_str, str)
 
-    def test_to_csv_compression(self, compression):
-
-        s = Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
-                   name='X')
+    @pytest.mark.parametrize('s,encoding', [
+        (Series([0.123456, 0.234567, 0.567567], index=['A', 'B', 'C'],
+                name='X'), None),
+        # GH 21241, 21118
+        (Series(['abc', 'def', 'ghi'], name='X'), 'ascii'),
+        (Series(["123", u"你好", u"世界"], name=u"中文"), 'gb2312'),
+        (Series(["123", u"Γειά σου", u"Κόσμε"], name=u"Ελληνικά"), 'cp737')
+    ])
+    def test_to_csv_compression(self, s, encoding, compression):
 
         with ensure_clean() as filename:
 
-            s.to_csv(filename, compression=compression, header=True)
+            s.to_csv(filename, compression=compression, encoding=encoding,
+                     header=True)
 
             # test the round trip - to_csv -> read_csv
-            rs = pd.read_csv(filename, compression=compression,
-                             index_col=0, squeeze=True)
-            assert_series_equal(s, rs)
+            result = pd.read_csv(filename, compression=compression,
+                                 encoding=encoding, index_col=0, squeeze=True)
+
+            with open(filename, 'w') as fh:
+                s.to_csv(fh, compression=compression, encoding=encoding,
+                         header=True)
+
+            result_fh = pd.read_csv(filename, compression=compression,
+                                    encoding=encoding, index_col=0,
+                                    squeeze=True)
+            assert_series_equal(s, result)
+            assert_series_equal(s, result_fh)
 
             # explicitly ensure file was compressed
             with tm.decompress_file(filename, compression) as fh:
-                text = fh.read().decode('utf8')
+                text = fh.read().decode(encoding or 'utf8')
                 assert s.name in text
 
             with tm.decompress_file(filename, compression) as fh:
                 assert_series_equal(s, pd.read_csv(fh,
                                                    index_col=0,
-                                                   squeeze=True))
+                                                   squeeze=True,
+                                                   encoding=encoding))
 
 
 class TestSeriesIO(TestData):
diff --git a/pandas/tests/test_common.py b/pandas/tests/test_common.py
index 88e469731060d..7034e9ac2e0c8 100644
--- a/pandas/tests/test_common.py
+++ b/pandas/tests/test_common.py
@@ -252,12 +252,13 @@ def test_compression_size_fh(obj, method, compression_only):
     with tm.ensure_clean() as filename:
         with open(filename, 'w') as fh:
             getattr(obj, method)(fh, compression=compression_only)
-            # GH 17778
-            assert fh.closed
+            assert not fh.closed
+        assert fh.closed
         compressed = os.path.getsize(filename)
     with tm.ensure_clean() as filename:
         with open(filename, 'w') as fh:
             getattr(obj, method)(fh, compression=None)
             assert not fh.closed
+        assert fh.closed
         uncompressed = os.path.getsize(filename)
     assert uncompressed > compressed
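
For reference, a minimal sketch of the round trip this patch is meant to support, using an illustrative file name and one of the encodings exercised by the parametrized tests above (assumes a pandas build that includes this fix):

import pandas as pd
from pandas.testing import assert_frame_equal

# Combining an explicit encoding with compression used to raise an encoding
# error (GH 21241, GH 21118); with this change the write/read round trip works.
df = pd.DataFrame(5 * [[123, u"你好", u"世界"]], columns=['X', 'Y', 'Z'])
df.to_csv('example.csv.gz', compression='gzip', encoding='gb2312')

result = pd.read_csv('example.csv.gz', compression='gzip',
                     encoding='gb2312', index_col=0)
assert_frame_equal(result, df)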