Skip to content

Commit e1c706c

Browse files
committed
ENH: Add Stata 119 writer
Add support for writing Stata 119 format files. Rename the new writer to StataWriterUTF8 since it is no longer version-specific. Improve the exception message for unsupported files. Fix small issues in to_stata missed in 118.
1 parent 439d629 commit e1c706c

File tree

4 files changed

+89
-35
lines changed

4 files changed

+89
-35
lines changed

doc/source/whatsnew/v1.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -224,7 +224,7 @@ Other enhancements
224224
- :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`)
225225
- :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` have gained ``ignore_index`` keyword to reset index (:issue:`30114`)
226226
- :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`)
227-
- Added new writer for exporting Stata dta files in version 118, ``StataWriter118``. This format supports exporting strings containing Unicode characters (:issue:`23573`)
227+
- Added new writer for exporting Stata dta files in versions 118 and 119, ``StataWriterUTF8``. These formats support exporting strings containing Unicode characters (:issue:`23573`)
228228
- :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`)
229229
- The ``pandas.datetime`` class is now deprecated. Import from ``datetime`` instead (:issue:`30296`)
230230
- Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`)

pandas/core/frame.py

+15-9
Original file line numberDiff line numberDiff line change
@@ -1898,12 +1898,16 @@ def to_stata(
18981898
variable_labels : dict
18991899
Dictionary containing columns as keys and variable labels as
19001900
values. Each label must be 80 characters or smaller.
1901-
version : {114, 117}, default 114
1902-
Version to use in the output dta file. Version 114 can be used
1903-
read by Stata 10 and later. Version 117 can be read by Stata 13
1904-
or later. Version 114 limits string variables to 244 characters or
1905-
fewer while 117 allows strings with lengths up to 2,000,000
1906-
characters.
1901+
version : {114, 117, 118, 119, None}, default 114
1902+
Version to use in the output dta file. Set to None to let pandas
1903+
decide between 118 or 119 formats depending on the number of
1904+
columns in the frame. Version 114 can be read by Stata 10 and
1905+
later. Version 117 can be read by Stata 13 or later. Version 118
1906+
is supported in Stata 14 and later. Version 119 is supported in
1907+
Stata 15 and later. Version 114 limits string variables to 244
1908+
characters or fewer while 117 allows strings with lengths up to
1909+
2,000,000 characters. Versions 118 and 119 support Unicode
1910+
characters, and version 119 supports more than 32,767 variables.
19071911
19081912
.. versionadded:: 0.23.0
19091913
@@ -1940,7 +1944,7 @@ def to_stata(
19401944
>>> df.to_stata('animals.dta') # doctest: +SKIP
19411945
"""
19421946
kwargs = {}
1943-
if version not in (114, 117, 118):
1947+
if version not in (114, 117, 118, 119, None):
19441948
raise ValueError("Only formats 114, 117 and 118 are supported.")
19451949
if version == 114:
19461950
if convert_strl is not None:
@@ -1949,8 +1953,10 @@ def to_stata(
19491953
else:
19501954
if version == 117:
19511955
from pandas.io.stata import StataWriter117 as statawriter
1952-
else:
1953-
from pandas.io.stata import StataWriter118 as statawriter
1956+
else: # versions 118 and 119
1957+
from pandas.io.stata import StataWriterUTF8 as statawriter
1958+
1959+
kwargs["version"] = version if version != "utf8" else None
19541960

19551961
kwargs["convert_strl"] = convert_strl
19561962

pandas/io/stata.py

+64-22
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,10 @@
4747
from pandas.io.common import get_filepath_or_buffer, stringify_path
4848

4949
_version_error = (
50-
"Version of given Stata file is not 104, 105, 108, "
51-
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
52-
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)"
50+
"Version of given Stata file is {version}. pandas supports importing "
51+
"versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
52+
"114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16),"
53+
"and 119 (Stata 15/16, over 32,767 variables)."
5354
)
5455

5556
_statafile_processing_params1 = """\
@@ -1091,11 +1092,11 @@ def _read_header(self):
10911092
self.col_sizes = [self._calcsize(typ) for typ in self.typlist]
10921093

10931094
def _read_new_header(self, first_char):
1094-
# The first part of the header is common to 117 and 118.
1095+
# The first part of the header is common to 117 - 119.
10951096
self.path_or_buf.read(27) # stata_dta><header><release>
10961097
self.format_version = int(self.path_or_buf.read(3))
10971098
if self.format_version not in [117, 118, 119]:
1098-
raise ValueError(_version_error)
1099+
raise ValueError(_version_error.format(version=self.format_version))
10991100
self._set_encoding()
11001101
self.path_or_buf.read(21) # </release><byteorder>
11011102
self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<"
@@ -1288,7 +1289,7 @@ def _get_seek_variable_labels(self):
12881289
def _read_old_header(self, first_char):
12891290
self.format_version = struct.unpack("b", first_char)[0]
12901291
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
1291-
raise ValueError(_version_error)
1292+
raise ValueError(_version_error.format(version=self.format_version))
12921293
self._set_encoding()
12931294
self.byteorder = (
12941295
struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<"
@@ -2884,7 +2885,6 @@ class StataWriter117(StataWriter):
28842885
"""
28852886

28862887
_max_string_length = 2045
2887-
_dta_version = 117
28882888

28892889
def __init__(
28902890
self,
@@ -2900,6 +2900,7 @@ def __init__(
29002900
):
29012901
# Shallow copy since convert_strl might be modified later
29022902
self._convert_strl = [] if convert_strl is None else convert_strl[:]
2903+
self._dta_version = 117
29032904

29042905
super().__init__(
29052906
fname,
@@ -2934,9 +2935,14 @@ def _write_header(self, data_label=None, time_stamp=None):
29342935
bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release"))
29352936
# byteorder
29362937
bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder"))
2937-
# number of vars, 2 bytes
2938-
assert self.nvar < 2 ** 16
2939-
bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), "K"))
2938+
if self._dta_version < 119 and self.nvar > 32767:
2939+
raise RuntimeError(
2940+
"You must use version 119 for data sets containing more than"
2941+
"32,767 variables"
2942+
)
2943+
# number of vars, 2 bytes in 117 and 118, 4 bytes in 119
2944+
nvar_type = "H" if self._dta_version <= 118 else "I"
2945+
bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K"))
29402946
# 117 uses 4 bytes, 118 and 119 use 8
29412947
nobs_size = "I" if self._dta_version == 117 else "Q"
29422948
bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N"))
@@ -3033,7 +3039,8 @@ def _write_varnames(self):
30333039

30343040
def _write_sortlist(self):
30353041
self._update_map("sortlist")
3036-
self._file.write(self._tag(b"\x00\00" * (self.nvar + 1), "sortlist"))
3042+
sort_size = 2 if self._dta_version < 119 else 4
3043+
self._file.write(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist"))
30373044

30383045
def _write_formats(self):
30393046
self._update_map("formats")
@@ -3173,13 +3180,14 @@ def _set_formats_and_types(self, dtypes):
31733180
)
31743181

31753182

3176-
class StataWriter118(StataWriter117):
3183+
class StataWriterUTF8(StataWriter117):
31773184
"""
3178-
A class for writing Stata binary dta files in Stata 15 format (118)
3185+
Stata binary dta file writing in 118 (Stata 14+) and 119 (Stata 15+) formats
31793186
3180-
DTA 118 format files support unicode string data (both fixed and strL)
3181-
format. Unicode is also supported in value labels, variable labels and
3182-
the dataset label.
3187+
DTA 118 and 119 format files support unicode string data (both fixed
3188+
and strL). Unicode is also supported in value labels, variable
3189+
labels and the dataset label. Format 119 is automatically used if the
3190+
file contains more than 32,767 variables.
31833191
31843192
.. versionadded:: 1.0.0
31853193
@@ -3216,10 +3224,14 @@ class StataWriter118(StataWriter117):
32163224
Smaller columns can be converted by including the column name. Using
32173225
StrLs can reduce output file size when strings are longer than 8
32183226
characters, and either frequently repeated or sparse.
3227+
version : int, optional
3228+
The dta version to use. By default, uses the size of data to determine
3229+
the version. 118 is used if data.shape[1] <= 32767, and 119 is used
3230+
for storing larger DataFrames.
32193231
32203232
Returns
32213233
-------
3222-
StataWriter118
3234+
StataWriterUTF8
32233235
The instance has a write_file method, which will write the file to the
32243236
given `fname`.
32253237
@@ -3238,22 +3250,52 @@ class StataWriter118(StataWriter117):
32383250
--------
32393251
Using Unicode data and column names
32403252
3241-
>>> from pandas.io.stata import StataWriter118
3253+
>>> from pandas.io.stata import StataWriterUTF8
32423254
>>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])
3243-
>>> writer = StataWriter118('./data_file.dta', data)
3255+
>>> writer = StataWriterUTF8('./data_file.dta', data)
32443256
>>> writer.write_file()
32453257
32463258
Or with long strings stored in strl format
32473259
32483260
>>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],
32493261
... columns=['strls'])
3250-
>>> writer = StataWriter118('./data_file_with_long_strings.dta', data,
3251-
... convert_strl=['strls'])
3262+
>>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data,
3263+
... convert_strl=['strls'])
32523264
>>> writer.write_file()
32533265
"""
32543266

32553267
_encoding = "utf-8"
3256-
_dta_version = 118
3268+
3269+
def __init__(
3270+
self,
3271+
fname,
3272+
data,
3273+
convert_dates=None,
3274+
write_index=True,
3275+
byteorder=None,
3276+
time_stamp=None,
3277+
data_label=None,
3278+
variable_labels=None,
3279+
convert_strl=None,
3280+
version=None,
3281+
):
3282+
if version is None:
3283+
version = 118 if data.shape[1] <= 32767 else 119
3284+
elif version not in (118, 119):
3285+
raise ValueError("version must be either 118 or 119.")
3286+
super().__init__(
3287+
fname,
3288+
data,
3289+
convert_dates=convert_dates,
3290+
write_index=write_index,
3291+
byteorder=byteorder,
3292+
time_stamp=time_stamp,
3293+
data_label=data_label,
3294+
variable_labels=variable_labels,
3295+
convert_strl=convert_strl,
3296+
)
3297+
# Override version set in StataWriter117 init
3298+
self._dta_version = version
32573299

32583300
def _validate_variable_name(self, name):
32593301
"""

pandas/tests/io/test_stata.py

+9-3
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
PossiblePrecisionLoss,
2222
StataMissingValue,
2323
StataReader,
24-
StataWriter118,
24+
StataWriterUTF8,
2525
read_stata,
2626
)
2727

@@ -1770,7 +1770,8 @@ def test_stata_119(self):
17701770
assert df.iloc[0, -1] == 1
17711771
assert df.iloc[0, 0] == pd.Timestamp(datetime(2012, 12, 21, 21, 12, 21))
17721772

1773-
def test_118_writer(self):
1773+
@pytest.mark.parametrize("version", [118, 119, None])
1774+
def test_utf8_writer(self, version):
17741775
cat = pd.Categorical(["a", "β", "ĉ"], ordered=True)
17751776
data = pd.DataFrame(
17761777
[
@@ -1791,13 +1792,14 @@ def test_118_writer(self):
17911792
data_label = "ᴅaᵀa-label"
17921793
data["β"] = data["β"].astype(np.int32)
17931794
with tm.ensure_clean() as path:
1794-
writer = StataWriter118(
1795+
writer = StataWriterUTF8(
17951796
path,
17961797
data,
17971798
data_label=data_label,
17981799
convert_strl=["strls"],
17991800
variable_labels=variable_labels,
18001801
write_index=False,
1802+
version=version,
18011803
)
18021804
writer.write_file()
18031805
reread_encoded = read_stata(path)
@@ -1807,3 +1809,7 @@ def test_118_writer(self):
18071809
reader = StataReader(path)
18081810
assert reader.data_label == data_label
18091811
assert reader.variable_labels() == variable_labels
1812+
1813+
data.to_stata(path, version=version, write_index=False)
1814+
reread_to_stata = read_stata(path)
1815+
tm.assert_frame_equal(data, reread_to_stata)

0 commit comments

Comments
 (0)