Skip to content

Commit aaa716c

Browse files
bashtage authored and jorisvandenbossche committed
BUG: Enable stata files to be written to buffers (#21169)
Enable support for general file-like objects when exporting Stata files. Closes #21041. (Cherry picked from commit f91e28c.)
1 parent c851246 commit aaa716c

File tree

4 files changed

+77
-13
lines changed

4 files changed

+77
-13
lines changed

doc/source/whatsnew/v0.23.1.txt

+1-1
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ Indexing
8181
I/O
8282
^^^
8383

84-
-
84+
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
8585
-
8686

8787
Plotting

pandas/core/frame.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -1774,8 +1774,11 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
17741774
17751775
Parameters
17761776
----------
1777-
fname : str or buffer
1778-
String path of file-like object.
1777+
fname : path (string), buffer or path object
1778+
string, path object (pathlib.Path or py._path.local.LocalPath) or
1779+
object implementing a binary write() functions. If using a buffer
1780+
then the buffer will not be automatically closed after the file
1781+
data has been written.
17791782
convert_dates : dict
17801783
Dictionary mapping columns containing datetime types to stata
17811784
internal format to use when writing the dates. Options are 'tc',

pandas/io/stata.py

+44-10
Original file line numberDiff line numberDiff line change
@@ -1758,11 +1758,25 @@ def value_labels(self):
17581758
return self.value_label_dict
17591759

17601760

1761-
def _open_file_binary_write(fname, encoding):
1761+
def _open_file_binary_write(fname):
1762+
"""
1763+
Open a binary file or no-op if file-like
1764+
1765+
Parameters
1766+
----------
1767+
fname : string path, path object or buffer
1768+
1769+
Returns
1770+
-------
1771+
file : file-like object
1772+
File object supporting write
1773+
own : bool
1774+
True if the file was created, otherwise False
1775+
"""
17621776
if hasattr(fname, 'write'):
17631777
# if 'b' not in fname.mode:
1764-
return fname
1765-
return open(fname, "wb")
1778+
return fname, False
1779+
return open(fname, "wb"), True
17661780

17671781

17681782
def _set_endianness(endianness):
@@ -1899,7 +1913,9 @@ class StataWriter(StataParser):
18991913
----------
19001914
fname : path (string), buffer or path object
19011915
string, path object (pathlib.Path or py._path.local.LocalPath) or
1902-
object implementing a binary write() functions.
1916+
object implementing a binary write() functions. If using a buffer
1917+
then the buffer will not be automatically closed after the file
1918+
is written.
19031919
19041920
.. versionadded:: 0.23.0 support for pathlib, py.path.
19051921
@@ -1970,6 +1986,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
19701986
self._time_stamp = time_stamp
19711987
self._data_label = data_label
19721988
self._variable_labels = variable_labels
1989+
self._own_file = True
19731990
# attach nobs, nvars, data, varlist, typlist
19741991
self._prepare_pandas(data)
19751992

@@ -2183,9 +2200,7 @@ def _prepare_pandas(self, data):
21832200
self.fmtlist[key] = self._convert_dates[key]
21842201

21852202
def write_file(self):
2186-
self._file = _open_file_binary_write(
2187-
self._fname, self._encoding or self._default_encoding
2188-
)
2203+
self._file, self._own_file = _open_file_binary_write(self._fname)
21892204
try:
21902205
self._write_header(time_stamp=self._time_stamp,
21912206
data_label=self._data_label)
@@ -2205,6 +2220,23 @@ def write_file(self):
22052220
self._write_file_close_tag()
22062221
self._write_map()
22072222
finally:
2223+
self._close()
2224+
2225+
def _close(self):
2226+
"""
2227+
Close the file if it was created by the writer.
2228+
2229+
If a buffer or file-like object was passed in, for example a GzipFile,
2230+
then leave this file open for the caller to close. In either case,
2231+
attempt to flush the file contents to ensure they are written to disk
2232+
(if supported)
2233+
"""
2234+
# Some file-like objects might not support flush
2235+
try:
2236+
self._file.flush()
2237+
except AttributeError:
2238+
pass
2239+
if self._own_file:
22082240
self._file.close()
22092241

22102242
def _write_map(self):
@@ -2374,7 +2406,7 @@ def _prepare_data(self):
23742406

23752407
def _write_data(self):
23762408
data = self.data
2377-
data.tofile(self._file)
2409+
self._file.write(data.tobytes())
23782410

23792411
def _null_terminate(self, s, as_string=False):
23802412
null_byte = '\x00'
@@ -2641,7 +2673,9 @@ class StataWriter117(StataWriter):
26412673
----------
26422674
fname : path (string), buffer or path object
26432675
string, path object (pathlib.Path or py._path.local.LocalPath) or
2644-
object implementing a binary write() functions.
2676+
object implementing a binary write() functions. If using a buffer
2677+
then the buffer will not be automatically closed after the file
2678+
is written.
26452679
data : DataFrame
26462680
Input to save
26472681
convert_dates : dict
@@ -2879,7 +2913,7 @@ def _write_data(self):
28792913
self._update_map('data')
28802914
data = self.data
28812915
self._file.write(b'<data>')
2882-
data.tofile(self._file)
2916+
self._file.write(data.tobytes())
28832917
self._file.write(b'</data>')
28842918

28852919
def _write_strls(self):

pandas/tests/io/test_stata.py

+27
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22
# pylint: disable=E1101
33

44
import datetime as dt
5+
import io
6+
import gzip
57
import os
68
import struct
79
import warnings
@@ -1473,3 +1475,28 @@ def test_invalid_date_conversion(self):
14731475
with pytest.raises(ValueError):
14741476
original.to_stata(path,
14751477
convert_dates={'wrong_name': 'tc'})
1478+
1479+
@pytest.mark.parametrize('version', [114, 117])
1480+
def test_nonfile_writing(self, version):
1481+
# GH 21041
1482+
bio = io.BytesIO()
1483+
df = tm.makeDataFrame()
1484+
df.index.name = 'index'
1485+
with tm.ensure_clean() as path:
1486+
df.to_stata(bio, version=version)
1487+
bio.seek(0)
1488+
with open(path, 'wb') as dta:
1489+
dta.write(bio.read())
1490+
reread = pd.read_stata(path, index_col='index')
1491+
tm.assert_frame_equal(df, reread)
1492+
1493+
def test_gzip_writing(self):
1494+
# writing version 117 requires seek and cannot be used with gzip
1495+
df = tm.makeDataFrame()
1496+
df.index.name = 'index'
1497+
with tm.ensure_clean() as path:
1498+
with gzip.GzipFile(path, 'wb') as gz:
1499+
df.to_stata(gz, version=114)
1500+
with gzip.GzipFile(path, 'rb') as gz:
1501+
reread = pd.read_stata(gz, index_col='index')
1502+
tm.assert_frame_equal(df, reread)

0 commit comments

Comments
 (0)