Skip to content

ENH: added compression kw to to_csv GH7615 #11219

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 12, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions doc/source/whatsnew/v0.17.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ Highlights include:
Enhancements
~~~~~~~~~~~~

- Support for ``compression`` (gzip/bz2) in :meth:`DataFrame.to_csv` (:issue:`7615`)

.. _whatsnew_0171.enhancements.other:

- Improve the error message in :func:`pandas.io.gbq.to_gbq` when a streaming insert fails (:issue:`11285`)
Expand Down
5 changes: 2 additions & 3 deletions pandas/core/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -2846,11 +2846,10 @@ def _get_handle(path, mode, encoding=None, compression=None):

if compression == 'gzip':
import gzip
f = gzip.GzipFile(path, 'rb')
f = gzip.GzipFile(path, mode)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this defaults to rb or wb as appropriate, but should this be passed by the caller?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

as far as I can tell, if we don't pass mode then it will default to rb and then to_csv will fail.

elif compression == 'bz2':
import bz2

f = bz2.BZ2File(path, 'rb')
f = bz2.BZ2File(path, mode)
else:
raise ValueError('Unrecognized compression type: %s' %
compression)
Expand Down
6 changes: 4 additions & 2 deletions pandas/core/format.py
Original file line number Diff line number Diff line change
Expand Up @@ -1259,7 +1259,7 @@ class CSVFormatter(object):

def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
cols=None, header=True, index=True, index_label=None,
mode='w', nanRep=None, encoding=None, quoting=None,
mode='w', nanRep=None, encoding=None, compression=None, quoting=None,
line_terminator='\n', chunksize=None, engine=None,
tupleize_cols=False, quotechar='"', date_format=None,
doublequote=True, escapechar=None, decimal='.'):
Expand All @@ -1281,6 +1281,7 @@ def __init__(self, obj, path_or_buf=None, sep=",", na_rep='', float_format=None,
self.index_label = index_label
self.mode = mode
self.encoding = encoding
self.compression = compression

if quoting is None:
quoting = csv.QUOTE_MINIMAL
Expand Down Expand Up @@ -1470,7 +1471,8 @@ def save(self):
close = False
else:
f = com._get_handle(self.path_or_buf, self.mode,
encoding=self.encoding)
encoding=self.encoding,
compression=self.compression)
close = True

try:
Expand Down
7 changes: 6 additions & 1 deletion pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1210,7 +1210,7 @@ def to_panel(self):

def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
columns=None, header=True, index=True, index_label=None,
mode='w', encoding=None, quoting=None,
mode='w', encoding=None, compression=None, quoting=None,
quotechar='"', line_terminator='\n', chunksize=None,
tupleize_cols=False, date_format=None, doublequote=True,
escapechar=None, decimal='.', **kwds):
Expand Down Expand Up @@ -1247,6 +1247,10 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
encoding : string, optional
A string representing the encoding to use in the output file,
defaults to 'ascii' on Python 2 and 'utf-8' on Python 3.
compression : string, optional
a string representing the compression to use in the output file,
allowed values are 'gzip', 'bz2',
only used when the first argument is a filename
line_terminator : string, default '\\n'
The newline character or character sequence to use in the output
file
Expand Down Expand Up @@ -1275,6 +1279,7 @@ def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None,
formatter = fmt.CSVFormatter(self, path_or_buf,
line_terminator=line_terminator,
sep=sep, encoding=encoding,
compression=compression,
quoting=quoting, na_rep=na_rep,
float_format=float_format, cols=columns,
header=header, index=index,
Expand Down
57 changes: 57 additions & 0 deletions pandas/tests/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -7328,6 +7328,63 @@ def test_to_csv_path_is_none(self):
recons = pd.read_csv(StringIO(csv_str), index_col=0)
assert_frame_equal(self.frame, recons)

def test_to_csv_compression_gzip(self):
## GH7615
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

can you add a test to ensure that the ValueError is raised when passing an invalid string as compression

## use the compression kw in to_csv
df = DataFrame([[0.123456, 0.234567, 0.567567],
[12.32112, 123123.2, 321321.2]],
index=['A', 'B'], columns=['X', 'Y', 'Z'])

with ensure_clean() as filename:
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

pls test with bz2 as well


df.to_csv(filename, compression="gzip")

# test the round trip - to_csv -> read_csv
rs = read_csv(filename, compression="gzip", index_col=0)
assert_frame_equal(df, rs)

# explicitly make sure file is gzipped
import gzip
f = gzip.open(filename, 'rb')
text = f.read().decode('utf8')
f.close()
for col in df.columns:
self.assertIn(col, text)

def test_to_csv_compression_bz2(self):
    """Round-trip a frame through ``to_csv(compression="bz2")`` (GH7615)."""
    df = DataFrame([[0.123456, 0.234567, 0.567567],
                    [12.32112, 123123.2, 321321.2]],
                   index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with ensure_clean() as filename:

        df.to_csv(filename, compression="bz2")

        # test the round trip - to_csv -> read_csv
        rs = read_csv(filename, compression="bz2", index_col=0)
        assert_frame_equal(df, rs)

        # explicitly make sure the file on disk is bz2-compressed by
        # decompressing it directly, bypassing read_csv
        import bz2
        f = bz2.BZ2File(filename, 'rb')
        try:
            text = f.read().decode('utf8')
        finally:
            # release the handle even if read/decode raises
            f.close()
        for col in df.columns:
            self.assertIn(col, text)

def test_to_csv_compression_value_error(self):
    """An unsupported ``compression`` value makes ``to_csv`` raise (GH7615)."""
    values = [[0.123456, 0.234567, 0.567567],
              [12.32112, 123123.2, 321321.2]]
    frame = DataFrame(values, index=['A', 'B'], columns=['X', 'Y', 'Z'])

    with ensure_clean() as filename:
        # "zip" is not one of the supported compression types, so the
        # write must fail with ValueError
        self.assertRaises(ValueError, frame.to_csv, filename,
                          compression="zip")

def test_info(self):
io = StringIO()
self.frame.info(buf=io)
Expand Down