Skip to content

Commit d3e7634

Browse files
committed
ENH: Add class to write dta format 117 files
Add export for dta 117 files, which adds support for long strings. Refactor StataWriter to simplify the new writer. Closes #16450
1 parent 563a6ad commit d3e7634

File tree

4 files changed

+770
-71
lines changed

4 files changed

+770
-71
lines changed

doc/source/whatsnew/v0.23.0.txt

+2
Original file line numberDiff line numberDiff line change
@@ -450,6 +450,8 @@ Other Enhancements
450450
- Updated :meth:`DataFrame.to_gbq` and :meth:`pandas.read_gbq` signature and documentation to reflect changes from
451451
the Pandas-GBQ library version 0.4.0. Adds intersphinx mapping to Pandas-GBQ
452452
library. (:issue:`20564`)
453+
- Added new writer for exporting Stata dta files in version 117, ``StataWriter117``. This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`)
454+
453455

454456
.. _whatsnew_0230.api_breaking:
455457

pandas/core/frame.py

+33-4
Original file line numberDiff line numberDiff line change
@@ -1769,7 +1769,8 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
17691769

17701770
def to_stata(self, fname, convert_dates=None, write_index=True,
17711771
encoding="latin-1", byteorder=None, time_stamp=None,
1772-
data_label=None, variable_labels=None):
1772+
data_label=None, variable_labels=None, version=114,
1773+
convert_strl=None):
17731774
"""
17741775
Export the DataFrame to a Stata binary dta file
17751776
@@ -1801,6 +1802,23 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
18011802
18021803
.. versionadded:: 0.19.0
18031804
1805+
version : {114, 117}
1806+
dta version to use in the output file. Version 114 can be
1807+
read by Stata 10 and later. Version 117 can be read by Stata 13
1808+
or later. Version 114 limits string variables to 244 characters or
1809+
fewer while 117 allows strings with lengths up to 2,000,000
1810+
characters.
1811+
1812+
.. versionadded:: 0.23.0
1813+
1814+
convert_strl : list, optional
1815+
List of names of string columns to convert to the Stata StrL
1816+
format. Only available if version is 117. Storing strings in the
1817+
StrL format can produce smaller dta files if strings have more than
1818+
8 characters and values are repeated.
1819+
1820+
.. versionadded:: 0.23.0
1821+
18041822
Raises
18051823
------
18061824
NotImplementedError
@@ -1832,12 +1850,23 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
18321850
>>> writer = StataWriter('./date_data_file.dta', data, {2 : 'tw'})
18331851
>>> writer.write_file()
18341852
"""
1835-
from pandas.io.stata import StataWriter
1836-
writer = StataWriter(fname, self, convert_dates=convert_dates,
1853+
kwargs = {}
1854+
if version not in (114, 117):
1855+
raise ValueError('Only formats 114 and 117 supported.')
1856+
if version == 114:
1857+
if convert_strl is not None:
1858+
raise ValueError('strl support is only available when using '
1859+
'format 117')
1860+
from pandas.io.stata import StataWriter as statawriter
1861+
else:
1862+
from pandas.io.stata import StataWriter117 as statawriter
1863+
kwargs['convert_strl'] = convert_strl
1864+
1865+
writer = statawriter(fname, self, convert_dates=convert_dates,
18371866
encoding=encoding, byteorder=byteorder,
18381867
time_stamp=time_stamp, data_label=data_label,
18391868
write_index=write_index,
1840-
variable_labels=variable_labels)
1869+
variable_labels=variable_labels, **kwargs)
18411870
writer.write_file()
18421871

18431872
def to_feather(self, fname):

0 commit comments

Comments
 (0)