Skip to content

Commit 93e7123

Browse files
bashtage and TomAugspurger
authored and committed
ENH: Add class to write dta format 117 files (#20844)
* ENH: Add class to write dta format 117 files. Add export for dta 117 files, which adds support for long strings. Refactor StataWriter to simplify the new writer. Closes #16450.
1 parent ade293d commit 93e7123

File tree

4 files changed

+905
-116
lines changed

4 files changed

+905
-116
lines changed

doc/source/whatsnew/v0.23.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -503,6 +503,7 @@ Other Enhancements
503503
- Updated :meth:`DataFrame.to_gbq` and :meth:`pandas.read_gbq` signature and documentation to reflect changes from
504504
the Pandas-GBQ library version 0.4.0. Adds intersphinx mapping to Pandas-GBQ
505505
library. (:issue:`20564`)
506+
- Added new writer for exporting Stata dta files in version 117, ``StataWriter117``. This format supports exporting strings with lengths up to 2,000,000 characters (:issue:`16450`)
506507
- :func:`to_hdf` and :func:`read_hdf` now accept an ``errors`` keyword argument to control encoding error handling (:issue:`20835`)
507508

508509
.. _whatsnew_0230.api_breaking:

pandas/core/frame.py

+44-9
Original file line numberDiff line numberDiff line change
@@ -1769,27 +1769,28 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
17691769

17701770
def to_stata(self, fname, convert_dates=None, write_index=True,
17711771
encoding="latin-1", byteorder=None, time_stamp=None,
1772-
data_label=None, variable_labels=None):
1772+
data_label=None, variable_labels=None, version=114,
1773+
convert_strl=None):
17731774
"""
1774-
A class for writing Stata binary dta files from array-like objects
1775+
Export Stata binary dta files.
17751776
17761777
Parameters
17771778
----------
17781779
fname : str or buffer
1779-
String path of file-like object
1780+
String path or file-like object.
17801781
convert_dates : dict
17811782
Dictionary mapping columns containing datetime types to stata
17821783
internal format to use when writing the dates. Options are 'tc',
17831784
'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either an integer
17841785
or a name. Datetime columns that do not have a conversion type
17851786
specified will be converted to 'tc'. Raises NotImplementedError if
1786-
a datetime column has timezone information
1787+
a datetime column has timezone information.
17871788
write_index : bool
17881789
Write the index to Stata dataset.
17891790
encoding : str
1790-
Default is latin-1. Unicode is not supported
1791+
Default is latin-1. Unicode is not supported.
17911792
byteorder : str
1792-
Can be ">", "<", "little", or "big". default is `sys.byteorder`
1793+
Can be ">", "<", "little", or "big". Default is `sys.byteorder`.
17931794
time_stamp : datetime
17941795
A datetime to use as file creation date. Default is the current
17951796
time.
@@ -1801,6 +1802,23 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
18011802
18021803
.. versionadded:: 0.19.0
18031804
1805+
version : {114, 117}
1806+
Version to use in the output dta file. Version 114 can be
1807+
read by Stata 10 and later. Version 117 can be read by Stata 13
1808+
or later. Version 114 limits string variables to 244 characters or
1809+
fewer while 117 allows strings with lengths up to 2,000,000
1810+
characters.
1811+
1812+
.. versionadded:: 0.23.0
1813+
1814+
convert_strl : list, optional
1815+
List of names of string columns to convert to Stata StrL
1816+
format. Only available if version is 117. Storing strings in the
1817+
StrL format can produce smaller dta files if strings have more than
1818+
8 characters and values are repeated.
1819+
1820+
.. versionadded:: 0.23.0
1821+
18041822
Raises
18051823
------
18061824
NotImplementedError
@@ -1814,6 +1832,12 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
18141832
18151833
.. versionadded:: 0.19.0
18161834
1835+
See Also
1836+
--------
1837+
pandas.read_stata : Import Stata data files
1838+
pandas.io.stata.StataWriter : low-level writer for Stata data files
1839+
pandas.io.stata.StataWriter117 : low-level writer for version 117 files
1840+
18171841
Examples
18181842
--------
18191843
>>> data.to_stata('./data_file.dta')
@@ -1832,12 +1856,23 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
18321856
>>> writer = StataWriter('./date_data_file.dta', data, {2 : 'tw'})
18331857
>>> writer.write_file()
18341858
"""
1835-
from pandas.io.stata import StataWriter
1836-
writer = StataWriter(fname, self, convert_dates=convert_dates,
1859+
kwargs = {}
1860+
if version not in (114, 117):
1861+
raise ValueError('Only formats 114 and 117 supported.')
1862+
if version == 114:
1863+
if convert_strl is not None:
1864+
raise ValueError('strl support is only available when using '
1865+
'format 117')
1866+
from pandas.io.stata import StataWriter as statawriter
1867+
else:
1868+
from pandas.io.stata import StataWriter117 as statawriter
1869+
kwargs['convert_strl'] = convert_strl
1870+
1871+
writer = statawriter(fname, self, convert_dates=convert_dates,
18371872
encoding=encoding, byteorder=byteorder,
18381873
time_stamp=time_stamp, data_label=data_label,
18391874
write_index=write_index,
1840-
variable_labels=variable_labels)
1875+
variable_labels=variable_labels, **kwargs)
18411876
writer.write_file()
18421877

18431878
def to_feather(self, fname):

0 commit comments

Comments
 (0)