Skip to content

BUG: Enable stata files to be written to buffers #21169

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
May 24, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion doc/source/whatsnew/v0.23.1.txt
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ Indexing
I/O
^^^

-
- Bug in :meth:`DataFrame.to_stata` which prevented exporting DataFrames to buffers and most file-like objects (:issue:`21041`)
-

Plotting
Expand Down
7 changes: 5 additions & 2 deletions pandas/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1774,8 +1774,11 @@ def to_stata(self, fname, convert_dates=None, write_index=True,

Parameters
----------
fname : str or buffer
String path of file-like object.
fname : path (string), buffer or path object
string, path object (pathlib.Path or py._path.local.LocalPath) or
object implementing a binary write() function. If using a buffer
then the buffer will not be automatically closed after the file
data has been written.
convert_dates : dict
Dictionary mapping columns containing datetime types to stata
internal format to use when writing the dates. Options are 'tc',
Expand Down
54 changes: 44 additions & 10 deletions pandas/io/stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -1758,11 +1758,25 @@ def value_labels(self):
return self.value_label_dict


def _open_file_binary_write(fname, encoding):
def _open_file_binary_write(fname):
"""
Open a file for binary writing, or return it unchanged if already file-like

Parameters
----------
fname : string path, path object or buffer

Returns
-------
file : file-like object
File object supporting write
own : bool
True if the file was opened by this function, otherwise False
"""
if hasattr(fname, 'write'):
# if 'b' not in fname.mode:
return fname
return open(fname, "wb")
return fname, False
return open(fname, "wb"), True


def _set_endianness(endianness):
Expand Down Expand Up @@ -1899,7 +1913,9 @@ class StataWriter(StataParser):
----------
fname : path (string), buffer or path object
string, path object (pathlib.Path or py._path.local.LocalPath) or
object implementing a binary write() functions.
object implementing a binary write() function. If using a buffer
then the buffer will not be automatically closed after the file
is written.

.. versionadded:: 0.23.0 support for pathlib, py.path.

Expand Down Expand Up @@ -1970,6 +1986,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
self._time_stamp = time_stamp
self._data_label = data_label
self._variable_labels = variable_labels
self._own_file = True
# attach nobs, nvars, data, varlist, typlist
self._prepare_pandas(data)

Expand Down Expand Up @@ -2183,9 +2200,7 @@ def _prepare_pandas(self, data):
self.fmtlist[key] = self._convert_dates[key]

def write_file(self):
self._file = _open_file_binary_write(
self._fname, self._encoding or self._default_encoding
)
self._file, self._own_file = _open_file_binary_write(self._fname)
try:
self._write_header(time_stamp=self._time_stamp,
data_label=self._data_label)
Expand All @@ -2205,6 +2220,23 @@ def write_file(self):
self._write_file_close_tag()
self._write_map()
finally:
self._close()

def _close(self):
"""
Close the file if it was created by the writer.

If a buffer or file-like object was passed in, for example a GzipFile,
then leave this file open for the caller to close. In either case,
attempt to flush the file contents to ensure they are written to disk
(if supported)
"""
# Some file-like objects might not support flush
try:
self._file.flush()
except AttributeError:
pass
if self._own_file:
self._file.close()

def _write_map(self):
Expand Down Expand Up @@ -2374,7 +2406,7 @@ def _prepare_data(self):

def _write_data(self):
data = self.data
data.tofile(self._file)
self._file.write(data.tobytes())

def _null_terminate(self, s, as_string=False):
null_byte = '\x00'
Expand Down Expand Up @@ -2641,7 +2673,9 @@ class StataWriter117(StataWriter):
----------
fname : path (string), buffer or path object
string, path object (pathlib.Path or py._path.local.LocalPath) or
object implementing a binary write() functions.
object implementing a binary write() function. If using a buffer
then the buffer will not be automatically closed after the file
is written.
data : DataFrame
Input to save
convert_dates : dict
Expand Down Expand Up @@ -2879,7 +2913,7 @@ def _write_data(self):
self._update_map('data')
data = self.data
self._file.write(b'<data>')
data.tofile(self._file)
self._file.write(data.tobytes())
self._file.write(b'</data>')

def _write_strls(self):
Expand Down
27 changes: 27 additions & 0 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
# pylint: disable=E1101

import datetime as dt
import io
import gzip
import os
import struct
import warnings
Expand Down Expand Up @@ -1473,3 +1475,28 @@ def test_invalid_date_conversion(self):
with pytest.raises(ValueError):
original.to_stata(path,
convert_dates={'wrong_name': 'tc'})

@pytest.mark.parametrize('version', [114, 117])
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it worthwhile to add a replication of the OP's example?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added this.

def test_nonfile_writing(self, version):
# GH 21041
bio = io.BytesIO()
df = tm.makeDataFrame()
df.index.name = 'index'
with tm.ensure_clean() as path:
df.to_stata(bio, version=version)
bio.seek(0)
with open(path, 'wb') as dta:
dta.write(bio.read())
reread = pd.read_stata(path, index_col='index')
tm.assert_frame_equal(df, reread)

def test_gzip_writing(self):
# writing version 117 requires seek and cannot be used with gzip
df = tm.makeDataFrame()
df.index.name = 'index'
with tm.ensure_clean() as path:
with gzip.GzipFile(path, 'wb') as gz:
df.to_stata(gz, version=114)
with gzip.GzipFile(path, 'rb') as gz:
reread = pd.read_stata(gz, index_col='index')
tm.assert_frame_equal(df, reread)