diff --git a/doc/source/whatsnew/v0.23.1.txt b/doc/source/whatsnew/v0.23.1.txt index a071d7f3f5534..a7ba0dfbbd1c4 100644 --- a/doc/source/whatsnew/v0.23.1.txt +++ b/doc/source/whatsnew/v0.23.1.txt @@ -83,7 +83,7 @@ Indexing I/O ^^^ -- +- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`) - Plotting diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 0a07e85401638..1d8f225bd4342 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1774,8 +1774,11 @@ def to_stata(self, fname, convert_dates=None, write_index=True, Parameters ---------- - fname : str or buffer - String path of file-like object. + fname : path (string), buffer or path object + string, path object (pathlib.Path or py._path.local.LocalPath) or + object implementing a binary write() function. If using a buffer + then the buffer will not be automatically closed after the file + data has been written. convert_dates : dict Dictionary mapping columns containing datetime types to stata internal format to use when writing the dates. 
Options are 'tc', diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 8f91c7a497e2d..2797924985c70 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -1758,11 +1758,25 @@ def value_labels(self): return self.value_label_dict -def _open_file_binary_write(fname, encoding): +def _open_file_binary_write(fname): + """ + Open a binary file or no-op if file-like + + Parameters + ---------- + fname : string path, path object or buffer + + Returns + ------- + file : file-like object + File object supporting write + own : bool + True if the file was created, otherwise False + """ if hasattr(fname, 'write'): # if 'b' not in fname.mode: - return fname - return open(fname, "wb") + return fname, False + return open(fname, "wb"), True def _set_endianness(endianness): @@ -1899,7 +1913,9 @@ class StataWriter(StataParser): ---------- fname : path (string), buffer or path object string, path object (pathlib.Path or py._path.local.LocalPath) or - object implementing a binary write() functions. + object implementing a binary write() function. If using a buffer + then the buffer will not be automatically closed after the file + is written. .. versionadded:: 0.23.0 support for pathlib, py.path. 
@@ -1970,6 +1986,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True, self._time_stamp = time_stamp self._data_label = data_label self._variable_labels = variable_labels + self._own_file = True # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) @@ -2183,9 +2200,7 @@ def _prepare_pandas(self, data): self.fmtlist[key] = self._convert_dates[key] def write_file(self): - self._file = _open_file_binary_write( - self._fname, self._encoding or self._default_encoding - ) + self._file, self._own_file = _open_file_binary_write(self._fname) try: self._write_header(time_stamp=self._time_stamp, data_label=self._data_label) @@ -2205,6 +2220,23 @@ def write_file(self): self._write_file_close_tag() self._write_map() finally: + self._close() + + def _close(self): + """ + Close the file if it was created by the writer. + + If a buffer or file-like object was passed in, for example a GzipFile, + then leave this file open for the caller to close. In either case, + attempt to flush the file contents to ensure they are written to disk + (if supported) + """ + # Some file-like objects might not support flush + try: + self._file.flush() + except AttributeError: + pass + if self._own_file: self._file.close() def _write_map(self): @@ -2374,7 +2406,7 @@ def _prepare_data(self): def _write_data(self): data = self.data - data.tofile(self._file) + self._file.write(data.tobytes()) def _null_terminate(self, s, as_string=False): null_byte = '\x00' @@ -2641,7 +2673,9 @@ class StataWriter117(StataWriter): ---------- fname : path (string), buffer or path object string, path object (pathlib.Path or py._path.local.LocalPath) or - object implementing a binary write() functions. + object implementing a binary write() function. If using a buffer + then the buffer will not be automatically closed after the file + is written. 
data : DataFrame Input to save convert_dates : dict @@ -2879,7 +2913,7 @@ def _write_data(self): self._update_map('data') data = self.data self._file.write(b'') - data.tofile(self._file) + self._file.write(data.tobytes()) self._file.write(b'') def _write_strls(self): diff --git a/pandas/tests/io/test_stata.py b/pandas/tests/io/test_stata.py index 110b790a65037..f3a465da4e87f 100644 --- a/pandas/tests/io/test_stata.py +++ b/pandas/tests/io/test_stata.py @@ -2,6 +2,8 @@ # pylint: disable=E1101 import datetime as dt +import io +import gzip import os import struct import warnings @@ -1473,3 +1475,28 @@ def test_invalid_date_conversion(self): with pytest.raises(ValueError): original.to_stata(path, convert_dates={'wrong_name': 'tc'}) + + @pytest.mark.parametrize('version', [114, 117]) + def test_nonfile_writing(self, version): + # GH 21041 + bio = io.BytesIO() + df = tm.makeDataFrame() + df.index.name = 'index' + with tm.ensure_clean() as path: + df.to_stata(bio, version=version) + bio.seek(0) + with open(path, 'wb') as dta: + dta.write(bio.read()) + reread = pd.read_stata(path, index_col='index') + tm.assert_frame_equal(df, reread) + + def test_gzip_writing(self): + # writing version 117 requires seek and cannot be used with gzip + df = tm.makeDataFrame() + df.index.name = 'index' + with tm.ensure_clean() as path: + with gzip.GzipFile(path, 'wb') as gz: + df.to_stata(gz, version=114) + with gzip.GzipFile(path, 'rb') as gz: + reread = pd.read_stata(gz, index_col='index') + tm.assert_frame_equal(df, reread)