From 5889f4ee9b14f0650c23cc64e78ddbddef2afa4d Mon Sep 17 00:00:00 2001 From: Yoav Ram Date: Fri, 2 Oct 2015 15:19:13 +0300 Subject: [PATCH] ENH: added compression kw to to_csv GH7615 --- doc/source/whatsnew/v0.17.1.txt | 2 ++ pandas/core/common.py | 5 ++- pandas/core/format.py | 6 ++-- pandas/core/frame.py | 7 +++- pandas/tests/test_frame.py | 57 +++++++++++++++++++++++++++++++++ 5 files changed, 71 insertions(+), 6 deletions(-) diff --git a/doc/source/whatsnew/v0.17.1.txt b/doc/source/whatsnew/v0.17.1.txt index 74ace42eb7e22..94f66f8cfc672 100755 --- a/doc/source/whatsnew/v0.17.1.txt +++ b/doc/source/whatsnew/v0.17.1.txt @@ -18,6 +18,8 @@ Highlights include: Enhancements ~~~~~~~~~~~~ +- Support for ``compression`` (gzip/bz2) in :meth:`DataFrame.to_csv` (:issue:`7615`) + .. _whatsnew_0171.enhancements.other: - Improve the error message in :func:`pandas.io.gbq.to_gbq` when a streaming insert fails (:issue:`11285`) diff --git a/pandas/core/common.py b/pandas/core/common.py index 0de1f8ca5f7ae..724843d379f64 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -2846,11 +2846,10 @@ def _get_handle(path, mode, encoding=None, compression=None): if compression == 'gzip': import gzip - f = gzip.GzipFile(path, 'rb') + f = gzip.GzipFile(path, mode) elif compression == 'bz2': import bz2 - - f = bz2.BZ2File(path, 'rb') + f = bz2.BZ2File(path, mode) else: raise ValueError('Unrecognized compression type: %s' % compression) diff --git a/pandas/core/format.py b/pandas/core/format.py index 22e8d6502b358..bf9b3bc8040de 100644 --- a/pandas/core/format.py +++ b/pandas/core/format.py @@ -1259,7 +1259,7 @@ class CSVFormatter(object): def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, cols=None, header=True, index=True, index_label=None, - mode='w', nanRep=None, encoding=None, quoting=None, + mode='w', nanRep=None, encoding=None, compression=None, quoting=None, line_terminator='\n', chunksize=None, engine=None, tupleize_cols=False, 
quotechar='"', date_format=None, doublequote=True, escapechar=None, decimal='.'): @@ -1281,6 +1281,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None, self.index_label = index_label self.mode = mode self.encoding = encoding + self.compression = compression if quoting is None: quoting = csv.QUOTE_MINIMAL @@ -1470,7 +1471,8 @@ def save(self): close = False else: f = com._get_handle(self.path_or_buf, self.mode, - encoding=self.encoding) + encoding=self.encoding, + compression=self.compression) close = True try: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 013bd1c230662..2cdb6d9b04341 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1210,7 +1210,7 @@ def to_panel(self): def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, columns=None, header=True, index=True, index_label=None, - mode='w', encoding=None, quoting=None, + mode='w', encoding=None, compression=None, quoting=None, quotechar='"', line_terminator='\n', chunksize=None, tupleize_cols=False, date_format=None, doublequote=True, escapechar=None, decimal='.', **kwds): @@ -1247,6 +1247,10 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, encoding : string, optional A string representing the encoding to use in the output file, defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. 
+ compression : string, optional + A string representing the compression to use in the output file; + allowed values are 'gzip' and 'bz2'. + Only used when the first argument is a filename. line_terminator : string, default '\\n' The newline character or character sequence to use in the output file @@ -1275,6 +1279,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, formatter = fmt.CSVFormatter(self, path_or_buf, line_terminator=line_terminator, sep=sep, encoding=encoding, + compression=compression, quoting=quoting, na_rep=na_rep, float_format=float_format, cols=columns, header=header, index=index, diff --git a/pandas/tests/test_frame.py b/pandas/tests/test_frame.py index 5a9b90f93bb0c..eb88fec716627 100644 --- a/pandas/tests/test_frame.py +++ b/pandas/tests/test_frame.py @@ -7328,6 +7328,63 @@ def test_to_csv_path_is_none(self): recons = pd.read_csv(StringIO(csv_str), index_col=0) assert_frame_equal(self.frame, recons) + def test_to_csv_compression_gzip(self): + ## GH7615 + ## use the compression kw in to_csv + df = DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=['A', 'B'], columns=['X', 'Y', 'Z']) + + with ensure_clean() as filename: + + df.to_csv(filename, compression="gzip") + + # test the round trip - to_csv -> read_csv + rs = read_csv(filename, compression="gzip", index_col=0) + assert_frame_equal(df, rs) + + # explicitly make sure file is gzipped + import gzip + f = gzip.open(filename, 'rb') + text = f.read().decode('utf8') + f.close() + for col in df.columns: + self.assertIn(col, text) + + def test_to_csv_compression_bz2(self): + ## GH7615 + ## use the compression kw in to_csv + df = DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=['A', 'B'], columns=['X', 'Y', 'Z']) + + with ensure_clean() as filename: + + df.to_csv(filename, compression="bz2") + + # test the round trip - to_csv -> read_csv + rs = read_csv(filename, compression="bz2", index_col=0) + 
assert_frame_equal(df, rs) + + # explicitly make sure file is bz2ed + import bz2 + f = bz2.BZ2File(filename, 'rb') + text = f.read().decode('utf8') + f.close() + for col in df.columns: + self.assertIn(col, text) + + def test_to_csv_compression_value_error(self): + ## GH7615 + ## use the compression kw in to_csv + df = DataFrame([[0.123456, 0.234567, 0.567567], + [12.32112, 123123.2, 321321.2]], + index=['A', 'B'], columns=['X', 'Y', 'Z']) + + with ensure_clean() as filename: + # zip compression is not supported and should raise ValueError + self.assertRaises(ValueError, df.to_csv, filename, compression="zip") + def test_info(self): io = StringIO() self.frame.info(buf=io)