Skip to content

Commit 85f2172

Browse files
meeseeksmachinebashtage
authored and committed
Backport PR #30959: ENH: Add Stata 119 writer (#31009)
Co-authored-by: Kevin Sheppard <[email protected]>
1 parent 8140466 commit 85f2172

File tree

4 files changed

+163
-98
lines changed

4 files changed

+163
-98
lines changed

doc/source/whatsnew/v1.0.0.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,7 @@ Other enhancements
223223
- :meth:`DataFrame.sort_values` and :meth:`Series.sort_values` have gained ``ignore_index`` keyword to be able to reset index after sorting (:issue:`30114`)
224224
- :meth:`DataFrame.sort_index` and :meth:`Series.sort_index` have gained ``ignore_index`` keyword to reset index (:issue:`30114`)
225225
- :meth:`DataFrame.drop_duplicates` has gained ``ignore_index`` keyword to reset index (:issue:`30114`)
226-
- Added new writer for exporting Stata dta files in version 118, ``StataWriter118``. This format supports exporting strings containing Unicode characters (:issue:`23573`)
226+
- Added new writer for exporting Stata dta files in versions 118 and 119, ``StataWriterUTF8``. These file formats support exporting strings containing Unicode characters. Format 119 supports data sets with more than 32,767 variables (:issue:`23573`, :issue:`30959`)
227227
- :meth:`Series.map` now accepts ``collections.abc.Mapping`` subclasses as a mapper (:issue:`29733`)
228228
- Added an experimental :attr:`~DataFrame.attrs` for storing global metadata about a dataset (:issue:`29062`)
229229
- :meth:`Timestamp.fromisocalendar` is now compatible with python 3.8 and above (:issue:`28115`)

pandas/core/frame.py

+26-14
Original file line numberDiff line numberDiff line change
@@ -1898,14 +1898,22 @@ def to_stata(
18981898
variable_labels : dict
18991899
Dictionary containing columns as keys and variable labels as
19001900
values. Each label must be 80 characters or smaller.
1901-
version : {114, 117}, default 114
1902-
Version to use in the output dta file. Version 114 can be used
1903-
read by Stata 10 and later. Version 117 can be read by Stata 13
1904-
or later. Version 114 limits string variables to 244 characters or
1905-
fewer while 117 allows strings with lengths up to 2,000,000
1906-
characters.
1901+
version : {114, 117, 118, 119, None}, default 114
1902+
Version to use in the output dta file. Set to None to let pandas
1903+
decide between 118 or 119 formats depending on the number of
1904+
columns in the frame. Version 114 can be read by Stata 10 and
1905+
later. Version 117 can be read by Stata 13 or later. Version 118
1906+
is supported in Stata 14 and later. Version 119 is supported in
1907+
Stata 15 and later. Version 114 limits string variables to 244
1908+
characters or fewer while versions 117 and later allow strings
1909+
with lengths up to 2,000,000 characters. Versions 118 and 119
1910+
support Unicode characters, and version 119 supports more than
1911+
32,767 variables.
19071912
19081913
.. versionadded:: 0.23.0
1914+
.. versionchanged:: 1.0.0
1915+
1916+
Added support for formats 118 and 119.
19091917
19101918
convert_strl : list, optional
19111919
List of column names to convert to string columns to Stata StrL
@@ -1939,20 +1947,24 @@ def to_stata(
19391947
... 'speed': [350, 18, 361, 15]})
19401948
>>> df.to_stata('animals.dta') # doctest: +SKIP
19411949
"""
1942-
kwargs = {}
1943-
if version not in (114, 117, 118):
1944-
raise ValueError("Only formats 114, 117 and 118 are supported.")
1950+
if version not in (114, 117, 118, 119, None):
1951+
raise ValueError("Only formats 114, 117, 118 and 119 are supported.")
19451952
if version == 114:
19461953
if convert_strl is not None:
19471954
raise ValueError("strl is not supported in format 114")
19481955
from pandas.io.stata import StataWriter as statawriter
1949-
else:
1950-
if version == 117:
1951-
from pandas.io.stata import StataWriter117 as statawriter
1952-
else:
1953-
from pandas.io.stata import StataWriter118 as statawriter
1956+
elif version == 117:
1957+
from pandas.io.stata import StataWriter117 as statawriter
1958+
else: # versions 118 and 119
1959+
from pandas.io.stata import StataWriterUTF8 as statawriter
19541960

1961+
kwargs = {}
1962+
if version is None or version >= 117:
1963+
# strl conversion is only supported >= 117
19551964
kwargs["convert_strl"] = convert_strl
1965+
if version is None or version >= 118:
1966+
# Specifying the version is only supported for UTF8 (118 or 119)
1967+
kwargs["version"] = version
19561968

19571969
writer = statawriter(
19581970
path,

pandas/io/stata.py

+76-32
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,15 @@
1515
import os
1616
import struct
1717
import sys
18-
from typing import Any
18+
from typing import Any, Dict, Hashable, Optional, Sequence
1919
import warnings
2020

2121
from dateutil.relativedelta import relativedelta
2222
import numpy as np
2323

2424
from pandas._libs.lib import infer_dtype
2525
from pandas._libs.writers import max_len_string_array
26+
from pandas._typing import FilePathOrBuffer
2627
from pandas.util._decorators import Appender
2728

2829
from pandas.core.dtypes.common import (
@@ -47,9 +48,10 @@
4748
from pandas.io.common import get_filepath_or_buffer, stringify_path
4849

4950
_version_error = (
50-
"Version of given Stata file is not 104, 105, 108, "
51-
"111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
52-
"115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)"
51+
"Version of given Stata file is {version}. pandas supports importing "
52+
"versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
53+
"114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16), "
54+
"and 119 (Stata 15/16, over 32,767 variables)."
5355
)
5456

5557
_statafile_processing_params1 = """\
@@ -1091,11 +1093,11 @@ def _read_header(self):
10911093
self.col_sizes = [self._calcsize(typ) for typ in self.typlist]
10921094

10931095
def _read_new_header(self, first_char):
1094-
# The first part of the header is common to 117 and 118.
1096+
# The first part of the header is common to 117 - 119.
10951097
self.path_or_buf.read(27) # stata_dta><header><release>
10961098
self.format_version = int(self.path_or_buf.read(3))
10971099
if self.format_version not in [117, 118, 119]:
1098-
raise ValueError(_version_error)
1100+
raise ValueError(_version_error.format(version=self.format_version))
10991101
self._set_encoding()
11001102
self.path_or_buf.read(21) # </release><byteorder>
11011103
self.byteorder = self.path_or_buf.read(3) == b"MSF" and ">" or "<"
@@ -1288,7 +1290,7 @@ def _get_seek_variable_labels(self):
12881290
def _read_old_header(self, first_char):
12891291
self.format_version = struct.unpack("b", first_char)[0]
12901292
if self.format_version not in [104, 105, 108, 111, 113, 114, 115]:
1291-
raise ValueError(_version_error)
1293+
raise ValueError(_version_error.format(version=self.format_version))
12921294
self._set_encoding()
12931295
self.byteorder = (
12941296
struct.unpack("b", self.path_or_buf.read(1))[0] == 0x1 and ">" or "<"
@@ -2695,7 +2697,7 @@ def _convert_key(self, key):
26952697

26962698
def generate_table(self):
26972699
"""
2698-
Generates the GSO lookup table for the DataFRame
2700+
Generates the GSO lookup table for the DataFrame
26992701
27002702
Returns
27012703
-------
@@ -2934,9 +2936,9 @@ def _write_header(self, data_label=None, time_stamp=None):
29342936
bio.write(self._tag(bytes(str(self._dta_version), "utf-8"), "release"))
29352937
# byteorder
29362938
bio.write(self._tag(byteorder == ">" and "MSF" or "LSF", "byteorder"))
2937-
# number of vars, 2 bytes
2938-
assert self.nvar < 2 ** 16
2939-
bio.write(self._tag(struct.pack(byteorder + "H", self.nvar), "K"))
2939+
# number of vars, 2 bytes in 117 and 118, 4 bytes in 119
2940+
nvar_type = "H" if self._dta_version <= 118 else "I"
2941+
bio.write(self._tag(struct.pack(byteorder + nvar_type, self.nvar), "K"))
29402942
# 117 uses 4 bytes, 118 uses 8
29412943
nobs_size = "I" if self._dta_version == 117 else "Q"
29422944
bio.write(self._tag(struct.pack(byteorder + nobs_size, self.nobs), "N"))
@@ -3033,7 +3035,8 @@ def _write_varnames(self):
30333035

30343036
def _write_sortlist(self):
30353037
self._update_map("sortlist")
3036-
self._file.write(self._tag(b"\x00\00" * (self.nvar + 1), "sortlist"))
3038+
sort_size = 2 if self._dta_version < 119 else 4
3039+
self._file.write(self._tag(b"\x00" * sort_size * (self.nvar + 1), "sortlist"))
30373040

30383041
def _write_formats(self):
30393042
self._update_map("formats")
@@ -3173,13 +3176,14 @@ def _set_formats_and_types(self, dtypes):
31733176
)
31743177

31753178

3176-
class StataWriter118(StataWriter117):
3179+
class StataWriterUTF8(StataWriter117):
31773180
"""
3178-
A class for writing Stata binary dta files in Stata 15 format (118)
3181+
Stata binary dta file writing in Stata 14+ (118) and Stata 15+ (119) formats
31793182
3180-
DTA 118 format files support unicode string data (both fixed and strL)
3181-
format. Unicode is also supported in value labels, variable labels and
3182-
the dataset label.
3183+
DTA 118 and 119 format files support unicode string data (both fixed
3184+
and strL) format. Unicode is also supported in value labels, variable
3185+
labels and the dataset label. Format 119 is automatically used if the
3186+
file contains more than 32,767 variables.
31833187
31843188
.. versionadded:: 1.0.0
31853189
@@ -3192,34 +3196,38 @@ class StataWriter118(StataWriter117):
31923196
is written.
31933197
data : DataFrame
31943198
Input to save
3195-
convert_dates : dict
3199+
convert_dates : dict, default None
31963200
Dictionary mapping columns containing datetime types to stata internal
31973201
format to use when writing the dates. Options are 'tc', 'td', 'tm',
31983202
'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
31993203
Datetime columns that do not have a conversion type specified will be
32003204
converted to 'tc'. Raises NotImplementedError if a datetime column has
32013205
timezone information
3202-
write_index : bool
3206+
write_index : bool, default True
32033207
Write the index to Stata dataset.
3204-
byteorder : str
3208+
byteorder : str, default None
32053209
Can be ">", "<", "little", or "big". default is `sys.byteorder`
3206-
time_stamp : datetime
3210+
time_stamp : datetime, default None
32073211
A datetime to use as file creation date. Default is the current time
3208-
data_label : str
3212+
data_label : str, default None
32093213
A label for the data set. Must be 80 characters or smaller.
3210-
variable_labels : dict
3214+
variable_labels : dict, default None
32113215
Dictionary containing columns as keys and variable labels as values.
32123216
Each label must be 80 characters or smaller.
3213-
convert_strl : list
3217+
convert_strl : list, default None
32143218
List of columns names to convert to Stata StrL format. Columns with
32153219
more than 2045 characters are automatically written as StrL.
32163220
Smaller columns can be converted by including the column name. Using
32173221
StrLs can reduce output file size when strings are longer than 8
32183222
characters, and either frequently repeated or sparse.
3223+
version : int, default None
3224+
The dta version to use. By default, uses the size of data to determine
3225+
the version. 118 is used if data.shape[1] <= 32767, and 119 is used
3226+
for storing larger DataFrames.
32193227
32203228
Returns
32213229
-------
3222-
StataWriter118
3230+
StataWriterUTF8
32233231
The instance has a write_file method, which will write the file to the
32243232
given `fname`.
32253233
@@ -3238,24 +3246,60 @@ class StataWriter118(StataWriter117):
32383246
--------
32393247
Using Unicode data and column names
32403248
3241-
>>> from pandas.io.stata import StataWriter118
3249+
>>> from pandas.io.stata import StataWriterUTF8
32423250
>>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])
3243-
>>> writer = StataWriter118('./data_file.dta', data)
3251+
>>> writer = StataWriterUTF8('./data_file.dta', data)
32443252
>>> writer.write_file()
32453253
32463254
Or with long strings stored in strl format
32473255
32483256
>>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],
32493257
... columns=['strls'])
3250-
>>> writer = StataWriter118('./data_file_with_long_strings.dta', data,
3251-
... convert_strl=['strls'])
3258+
>>> writer = StataWriterUTF8('./data_file_with_long_strings.dta', data,
3259+
... convert_strl=['strls'])
32523260
>>> writer.write_file()
32533261
"""
32543262

32553263
_encoding = "utf-8"
3256-
_dta_version = 118
32573264

3258-
def _validate_variable_name(self, name):
3265+
def __init__(
3266+
self,
3267+
fname: FilePathOrBuffer,
3268+
data: DataFrame,
3269+
convert_dates: Optional[Dict[Hashable, str]] = None,
3270+
write_index: bool = True,
3271+
byteorder: Optional[str] = None,
3272+
time_stamp: Optional[datetime.datetime] = None,
3273+
data_label: Optional[str] = None,
3274+
variable_labels: Optional[Dict[Hashable, str]] = None,
3275+
convert_strl: Optional[Sequence[Hashable]] = None,
3276+
version: Optional[int] = None,
3277+
):
3278+
if version is None:
3279+
version = 118 if data.shape[1] <= 32767 else 119
3280+
elif version not in (118, 119):
3281+
raise ValueError("version must be either 118 or 119.")
3282+
elif version == 118 and data.shape[1] > 32767:
3283+
raise ValueError(
3284+
"You must use version 119 for data sets containing more than "
3285+
"32,767 variables"
3286+
)
3287+
3288+
super().__init__(
3289+
fname,
3290+
data,
3291+
convert_dates=convert_dates,
3292+
write_index=write_index,
3293+
byteorder=byteorder,
3294+
time_stamp=time_stamp,
3295+
data_label=data_label,
3296+
variable_labels=variable_labels,
3297+
convert_strl=convert_strl,
3298+
)
3299+
# Override version set in StataWriter117 init
3300+
self._dta_version = version
3301+
3302+
def _validate_variable_name(self, name: str) -> str:
32593303
"""
32603304
Validate variable names for Stata export.
32613305
@@ -3272,7 +3316,7 @@ def _validate_variable_name(self, name):
32723316
32733317
Notes
32743318
-----
3275-
Stata 118 support most unicode characters. The only limatation is in
3319+
Stata formats 118 and later support most unicode characters. The only limitation is in
32763320
the ascii range where the characters supported are a-z, A-Z, 0-9 and _.
32773321
"""
32783322
# High code points appear to be acceptable

0 commit comments

Comments
 (0)