Skip to content

Commit 3b88446

Browse files
authored
support binary file handles in to_csv (#35129)
1 parent 1104f0d commit 3b88446

File tree

8 files changed

+158
-60
lines changed

8 files changed

+158
-60
lines changed

doc/source/user_guide/io.rst

+17
Original file line numberDiff line numberDiff line change
@@ -1064,6 +1064,23 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided:
10641064
pd.read_csv('tmp.csv', parse_dates=[0])
10651065
pd.read_csv('tmp.csv', dayfirst=True, parse_dates=[0])
10661066
1067+
Writing CSVs to binary file objects
1068+
+++++++++++++++++++++++++++++++++++
1069+
1070+
.. versionadded:: 1.2.0
1071+
1072+
``df.to_csv(..., mode="w+b")`` allows writing a CSV to a file object
1073+
opened in binary mode. For this to work, it is necessary that ``mode``
1074+
contains a "b":
1075+
1076+
.. ipython:: python
1077+
1078+
import io
1079+
1080+
data = pd.DataFrame([0, 1, 2])
1081+
buffer = io.BytesIO()
1082+
data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")
1083+
10671084
.. _io.float_precision:
10681085

10691086
Specifying method for floating-point conversion

doc/source/whatsnew/v1.2.0.rst

+21-2
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,25 @@ including other versions of pandas.
1313
Enhancements
1414
~~~~~~~~~~~~
1515

16+
.. _whatsnew_120.binary_handle_to_csv:
17+
18+
Support for binary file handles in ``to_csv``
19+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
20+
21+
:meth:`to_csv` supports file handles in binary mode (:issue:`19827` and :issue:`35058`)
22+
with ``encoding`` (:issue:`13068` and :issue:`23854`) and ``compression`` (:issue:`22555`).
23+
``mode`` has to contain a ``b`` for binary handles to be supported.
24+
25+
For example:
26+
27+
.. ipython:: python
28+
29+
import io
30+
31+
data = pd.DataFrame([0, 1, 2])
32+
buffer = io.BytesIO()
33+
data.to_csv(buffer, mode="w+b", encoding="utf-8", compression="gzip")
34+
1635
.. _whatsnew_120.enhancements.other:
1736

1837
Other enhancements
@@ -121,7 +140,7 @@ MultiIndex
121140
I/O
122141
^^^
123142

124-
-
143+
- Bug in :meth:`to_csv` caused a ``ValueError`` when it was called with a filename in combination with ``mode`` containing a ``b`` (:issue:`35058`)
125144
-
126145

127146
Plotting
@@ -167,4 +186,4 @@ Other
167186
.. _whatsnew_120.contributors:
168187

169188
Contributors
170-
~~~~~~~~~~~~
189+
~~~~~~~~~~~~

pandas/core/generic.py

+13-3
Original file line numberDiff line numberDiff line change
@@ -3021,13 +3021,18 @@ def to_csv(
30213021
----------
30223022
path_or_buf : str or file handle, default None
30233023
File path or object, if None is provided the result is returned as
3024-
a string. If a file object is passed it should be opened with
3025-
`newline=''`, disabling universal newlines.
3024+
a string. If a non-binary file object is passed, it should be opened
3025+
with `newline=''`, disabling universal newlines. If a binary
3026+
file object is passed, `mode` needs to contain a `'b'`.
30263027
30273028
.. versionchanged:: 0.24.0
30283029
30293030
Was previously named "path" for Series.
30303031
3032+
.. versionchanged:: 1.2.0
3033+
3034+
Support for binary file objects was introduced.
3035+
30313036
sep : str, default ','
30323037
String of length 1. Field delimiter for the output file.
30333038
na_rep : str, default ''
@@ -3056,7 +3061,8 @@ def to_csv(
30563061
Python write mode, default 'w'.
30573062
encoding : str, optional
30583063
A string representing the encoding to use in the output file,
3059-
defaults to 'utf-8'.
3064+
defaults to 'utf-8'. `encoding` is not supported if `path_or_buf`
3065+
is a non-binary file object.
30603066
compression : str or dict, default 'infer'
30613067
If str, represents compression mode. If dict, value at 'method' is
30623068
the compression mode. Compression mode may be any of the following
@@ -3080,6 +3086,10 @@ def to_csv(
30803086
supported for compression modes 'gzip' and 'bz2'
30813087
as well as 'zip'.
30823088
3089+
.. versionchanged:: 1.2.0
3090+
3091+
Compression is supported for binary file objects.
3092+
30833093
quoting : optional constant from csv module
30843094
Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format`
30853095
then floats are converted to strings and thus csv.QUOTE_NONNUMERIC

pandas/io/common.py

+11-6
Original file line numberDiff line numberDiff line change
@@ -407,8 +407,9 @@ def get_handle(
407407
memory_map : boolean, default False
408408
See parsers._parser_params for more information.
409409
is_text : boolean, default True
410-
whether file/buffer is in text format (csv, json, etc.), or in binary
411-
mode (pickle, etc.).
410+
Whether the type of the content passed to the file/buffer is string or
411+
bytes. This is not the same as `"b" not in mode`. If string content is
412+
passed to a binary file/buffer, a wrapper is inserted.
412413
errors : str, default 'strict'
413414
Specifies how encoding and decoding errors are to be handled.
414415
See the errors argument for :func:`open` for a full list
@@ -449,14 +450,14 @@ def get_handle(
449450
if is_path:
450451
f = gzip.open(path_or_buf, mode, **compression_args)
451452
else:
452-
f = gzip.GzipFile(fileobj=path_or_buf, **compression_args)
453+
f = gzip.GzipFile(fileobj=path_or_buf, mode=mode, **compression_args)
453454

454455
# BZ Compression
455456
elif compression == "bz2":
456457
if is_path:
457458
f = bz2.BZ2File(path_or_buf, mode, **compression_args)
458459
else:
459-
f = bz2.BZ2File(path_or_buf, **compression_args)
460+
f = bz2.BZ2File(path_or_buf, mode=mode, **compression_args)
460461

461462
# ZIP Compression
462463
elif compression == "zip":
@@ -489,10 +490,14 @@ def get_handle(
489490
handles.append(f)
490491

491492
elif is_path:
492-
if encoding:
493+
# Check whether the filename is to be opened in binary mode.
494+
# Binary mode does not support 'encoding' and 'newline'.
495+
is_binary_mode = "b" in mode
496+
497+
if encoding and not is_binary_mode:
493498
# Encoding
494499
f = open(path_or_buf, mode, encoding=encoding, errors=errors, newline="")
495-
elif is_text:
500+
elif is_text and not is_binary_mode:
496501
# No explicit encoding
497502
f = open(path_or_buf, mode, errors="replace", newline="")
498503
else:

pandas/io/formats/csvs.py

+33-49
Original file line numberDiff line numberDiff line change
@@ -3,11 +3,10 @@
33
"""
44

55
import csv as csvlib
6-
from io import StringIO
6+
from io import StringIO, TextIOWrapper
77
import os
88
from typing import Hashable, List, Mapping, Optional, Sequence, Union
99
import warnings
10-
from zipfile import ZipFile
1110

1211
import numpy as np
1312

@@ -159,38 +158,29 @@ def save(self) -> None:
159158
"""
160159
Create the writer & save.
161160
"""
162-
# GH21227 internal compression is not used when file-like passed.
163-
if self.compression and hasattr(self.path_or_buf, "write"):
161+
# GH21227 internal compression is not used for non-binary handles.
162+
if (
163+
self.compression
164+
and hasattr(self.path_or_buf, "write")
165+
and "b" not in self.mode
166+
):
164167
warnings.warn(
165-
"compression has no effect when passing file-like object as input.",
168+
"compression has no effect when passing a non-binary object as input.",
166169
RuntimeWarning,
167170
stacklevel=2,
168171
)
169-
170-
# when zip compression is called.
171-
is_zip = isinstance(self.path_or_buf, ZipFile) or (
172-
not hasattr(self.path_or_buf, "write") and self.compression == "zip"
172+
self.compression = None
173+
174+
# get a handle or wrap an existing handle to take care of 1) compression and
175+
# 2) text -> byte conversion
176+
f, handles = get_handle(
177+
self.path_or_buf,
178+
self.mode,
179+
encoding=self.encoding,
180+
errors=self.errors,
181+
compression=dict(self.compression_args, method=self.compression),
173182
)
174183

175-
if is_zip:
176-
# zipfile doesn't support writing string to archive. uses string
177-
# buffer to receive csv writing and dump into zip compression
178-
# file handle. GH21241, GH21118
179-
f = StringIO()
180-
close = False
181-
elif hasattr(self.path_or_buf, "write"):
182-
f = self.path_or_buf
183-
close = False
184-
else:
185-
f, handles = get_handle(
186-
self.path_or_buf,
187-
self.mode,
188-
encoding=self.encoding,
189-
errors=self.errors,
190-
compression=dict(self.compression_args, method=self.compression),
191-
)
192-
close = True
193-
194184
try:
195185
# Note: self.encoding is irrelevant here
196186
self.writer = csvlib.writer(
@@ -206,29 +196,23 @@ def save(self) -> None:
206196
self._save()
207197

208198
finally:
209-
if is_zip:
210-
# GH17778 handles zip compression separately.
211-
buf = f.getvalue()
212-
if hasattr(self.path_or_buf, "write"):
213-
self.path_or_buf.write(buf)
214-
else:
215-
compression = dict(self.compression_args, method=self.compression)
216-
217-
f, handles = get_handle(
218-
self.path_or_buf,
219-
self.mode,
220-
encoding=self.encoding,
221-
errors=self.errors,
222-
compression=compression,
223-
)
224-
f.write(buf)
225-
close = True
226-
if close:
199+
if self.should_close:
227200
f.close()
228-
for _fh in handles:
229-
_fh.close()
230-
elif self.should_close:
201+
elif (
202+
isinstance(f, TextIOWrapper)
203+
and not f.closed
204+
and f != self.path_or_buf
205+
and hasattr(self.path_or_buf, "write")
206+
):
207+
# get_handle wraps binary handles in a TextIOWrapper. TextIOWrapper
208+
# closes the wrapped handle if it is not detached.
209+
f.flush() # make sure everything is written
210+
f.detach() # makes f unusable
211+
del f
212+
elif f != self.path_or_buf:
231213
f.close()
214+
for _fh in handles:
215+
_fh.close()
232216

233217
def _save_header(self):
234218
writer = self.writer

pandas/tests/io/formats/test_to_csv.py

+36
Original file line numberDiff line numberDiff line change
@@ -607,3 +607,39 @@ def test_to_csv_errors(self, errors):
607607
ser.to_csv(path, errors=errors)
608608
# No use in reading back the data as it is not the same anymore
609609
# due to the error handling
610+
611+
def test_to_csv_binary_handle(self):
612+
"""
613+
Binary file objects should work if 'mode' contains a 'b'.
614+
615+
GH 35058 and GH 19827
616+
"""
617+
df = tm.makeDataFrame()
618+
with tm.ensure_clean() as path:
619+
with open(path, mode="w+b") as handle:
620+
df.to_csv(handle, mode="w+b")
621+
tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
622+
623+
def test_to_csv_encoding_binary_handle(self):
624+
"""
625+
Binary file objects should honor a specified encoding.
626+
627+
GH 23854 and GH 13068 with binary handles
628+
"""
629+
# example from GH 23854
630+
content = "a, b, 🐟".encode("utf-8-sig")
631+
buffer = io.BytesIO(content)
632+
df = pd.read_csv(buffer, encoding="utf-8-sig")
633+
634+
buffer = io.BytesIO()
635+
df.to_csv(buffer, mode="w+b", encoding="utf-8-sig", index=False)
636+
buffer.seek(0) # tests whether file handle wasn't closed
637+
assert buffer.getvalue().startswith(content)
638+
639+
# example from GH 13068
640+
with tm.ensure_clean() as path:
641+
with open(path, "w+b") as handle:
642+
pd.DataFrame().to_csv(handle, mode="w+b", encoding="utf-8-sig")
643+
644+
handle.seek(0)
645+
assert handle.read().startswith(b'\xef\xbb\xbf""')

pandas/tests/io/test_common.py

+11
Original file line numberDiff line numberDiff line change
@@ -378,6 +378,17 @@ def test_unknown_engine(self):
378378
with pytest.raises(ValueError, match="Unknown engine"):
379379
pd.read_csv(path, engine="pyt")
380380

381+
def test_binary_mode(self):
382+
"""
383+
'encoding' shouldn't be passed to 'open' in binary mode.
384+
385+
GH 35058
386+
"""
387+
with tm.ensure_clean() as path:
388+
df = tm.makeDataFrame()
389+
df.to_csv(path, mode="w+b")
390+
tm.assert_frame_equal(df, pd.read_csv(path, index_col=0))
391+
381392

382393
def test_is_fsspec_url():
383394
assert icom.is_fsspec_url("gcs://pandas/somethingelse.com")

pandas/tests/io/test_compression.py

+16
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,22 @@ def test_compression_warning(compression_only):
114114
df.to_csv(f, compression=compression_only)
115115

116116

117+
def test_compression_binary(compression_only):
118+
"""
119+
Binary file handles support compression.
120+
121+
GH22555
122+
"""
123+
df = tm.makeDataFrame()
124+
with tm.ensure_clean() as path:
125+
with open(path, mode="wb") as file:
126+
df.to_csv(file, mode="wb", compression=compression_only)
127+
file.seek(0) # file shouldn't be closed
128+
tm.assert_frame_equal(
129+
df, pd.read_csv(path, index_col=0, compression=compression_only)
130+
)
131+
132+
117133
def test_with_missing_lzma():
118134
"""Tests if import pandas works when lzma is not present."""
119135
# https://github.com/pandas-dev/pandas/issues/27575

0 commit comments

Comments
 (0)