15
15
import os
16
16
import struct
17
17
import sys
18
- from typing import Any
18
+ from typing import Any , Dict , Hashable , Optional , Sequence
19
19
import warnings
20
20
21
21
from dateutil .relativedelta import relativedelta
22
22
import numpy as np
23
23
24
24
from pandas ._libs .lib import infer_dtype
25
25
from pandas ._libs .writers import max_len_string_array
26
+ from pandas ._typing import FilePathOrBuffer
26
27
from pandas .util ._decorators import Appender
27
28
28
29
from pandas .core .dtypes .common import (
47
48
from pandas .io .common import get_filepath_or_buffer , stringify_path
48
49
49
50
_version_error = (
50
- "Version of given Stata file is not 104, 105, 108, "
51
- "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
52
- "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)"
51
+ "Version of given Stata file is {version}. pandas supports importing "
52
+ "versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
53
+ "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16), "
54
+ "and 119 (Stata 15/16, over 32,767 variables)."
53
55
)
54
56
55
57
_statafile_processing_params1 = """\
@@ -1091,11 +1093,11 @@ def _read_header(self):
1091
1093
self .col_sizes = [self ._calcsize (typ ) for typ in self .typlist ]
1092
1094
1093
1095
def _read_new_header (self , first_char ):
1094
- # The first part of the header is common to 117 and 118 .
1096
+ # The first part of the header is common to 117 - 119 .
1095
1097
self .path_or_buf .read (27 ) # stata_dta><header><release>
1096
1098
self .format_version = int (self .path_or_buf .read (3 ))
1097
1099
if self .format_version not in [117 , 118 , 119 ]:
1098
- raise ValueError (_version_error )
1100
+ raise ValueError (_version_error . format ( version = self . format_version ) )
1099
1101
self ._set_encoding ()
1100
1102
self .path_or_buf .read (21 ) # </release><byteorder>
1101
1103
self .byteorder = self .path_or_buf .read (3 ) == b"MSF" and ">" or "<"
@@ -1288,7 +1290,7 @@ def _get_seek_variable_labels(self):
1288
1290
def _read_old_header (self , first_char ):
1289
1291
self .format_version = struct .unpack ("b" , first_char )[0 ]
1290
1292
if self .format_version not in [104 , 105 , 108 , 111 , 113 , 114 , 115 ]:
1291
- raise ValueError (_version_error )
1293
+ raise ValueError (_version_error . format ( version = self . format_version ) )
1292
1294
self ._set_encoding ()
1293
1295
self .byteorder = (
1294
1296
struct .unpack ("b" , self .path_or_buf .read (1 ))[0 ] == 0x1 and ">" or "<"
@@ -2695,7 +2697,7 @@ def _convert_key(self, key):
2695
2697
2696
2698
def generate_table (self ):
2697
2699
"""
2698
- Generates the GSO lookup table for the DataFRame
2700
+ Generates the GSO lookup table for the DataFrame
2699
2701
2700
2702
Returns
2701
2703
-------
@@ -2934,9 +2936,9 @@ def _write_header(self, data_label=None, time_stamp=None):
2934
2936
bio .write (self ._tag (bytes (str (self ._dta_version ), "utf-8" ), "release" ))
2935
2937
# byteorder
2936
2938
bio .write (self ._tag (byteorder == ">" and "MSF" or "LSF" , "byteorder" ))
2937
- # number of vars, 2 bytes
2938
- assert self .nvar < 2 ** 16
2939
- bio .write (self ._tag (struct .pack (byteorder + "H" , self .nvar ), "K" ))
2939
+ # number of vars, 2 bytes in 117 and 118, 4 bytes in 119
2940
+ nvar_type = "H" if self ._dta_version <= 118 else "I"
2941
+ bio .write (self ._tag (struct .pack (byteorder + nvar_type , self .nvar ), "K" ))
2940
2942
# 117 uses 4 bytes, 118 and 119 use 8
2941
2943
nobs_size = "I" if self ._dta_version == 117 else "Q"
2942
2944
bio .write (self ._tag (struct .pack (byteorder + nobs_size , self .nobs ), "N" ))
@@ -3033,7 +3035,8 @@ def _write_varnames(self):
3033
3035
3034
3036
def _write_sortlist (self ):
3035
3037
self ._update_map ("sortlist" )
3036
- self ._file .write (self ._tag (b"\x00 \00 " * (self .nvar + 1 ), "sortlist" ))
3038
+ sort_size = 2 if self ._dta_version < 119 else 4
3039
+ self ._file .write (self ._tag (b"\x00 " * sort_size * (self .nvar + 1 ), "sortlist" ))
3037
3040
3038
3041
def _write_formats (self ):
3039
3042
self ._update_map ("formats" )
@@ -3173,13 +3176,14 @@ def _set_formats_and_types(self, dtypes):
3173
3176
)
3174
3177
3175
3178
3176
- class StataWriter118 (StataWriter117 ):
3179
+ class StataWriterUTF8 (StataWriter117 ):
3177
3180
"""
3178
- A class for writing Stata binary dta files in Stata 15 format (118)
3181
+ Stata binary dta file writing in Stata 15 (118) and 16 (119) formats
3179
3182
3180
- DTA 118 format files support unicode string data (both fixed and strL)
3181
- format. Unicode is also supported in value labels, variable labels and
3182
- the dataset label.
3183
+ DTA 118 and 119 format files support unicode string data (both fixed
3184
+ and strL) format. Unicode is also supported in value labels, variable
3185
+ labels and the dataset label. Format 119 is automatically used if the
3186
+ file contains more than 32,767 variables.
3183
3187
3184
3188
.. versionadded:: 1.0.0
3185
3189
@@ -3192,34 +3196,38 @@ class StataWriter118(StataWriter117):
3192
3196
is written.
3193
3197
data : DataFrame
3194
3198
Input to save
3195
- convert_dates : dict
3199
+ convert_dates : dict, default None
3196
3200
Dictionary mapping columns containing datetime types to stata internal
3197
3201
format to use when writing the dates. Options are 'tc', 'td', 'tm',
3198
3202
'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
3199
3203
Datetime columns that do not have a conversion type specified will be
3200
3204
converted to 'tc'. Raises NotImplementedError if a datetime column has
3201
3205
timezone information
3202
- write_index : bool
3206
+ write_index : bool, default True
3203
3207
Write the index to Stata dataset.
3204
- byteorder : str
3208
+ byteorder : str, default None
3205
3209
Can be ">", "<", "little", or "big". Default is `sys.byteorder`
3206
- time_stamp : datetime
3210
+ time_stamp : datetime, default None
3207
3211
A datetime to use as file creation date. Default is the current time
3208
- data_label : str
3212
+ data_label : str, default None
3209
3213
A label for the data set. Must be 80 characters or smaller.
3210
- variable_labels : dict
3214
+ variable_labels : dict, default None
3211
3215
Dictionary containing columns as keys and variable labels as values.
3212
3216
Each label must be 80 characters or smaller.
3213
- convert_strl : list
3217
+ convert_strl : list, default None
3214
3218
List of columns names to convert to Stata StrL format. Columns with
3215
3219
more than 2045 characters are automatically written as StrL.
3216
3220
Smaller columns can be converted by including the column name. Using
3217
3221
StrLs can reduce output file size when strings are longer than 8
3218
3222
characters, and either frequently repeated or sparse.
3223
+ version : int, default None
3224
+ The dta version to use. By default, uses the size of data to determine
3225
+ the version. 118 is used if data.shape[1] <= 32767, and 119 is used
3226
+ for storing larger DataFrames.
3219
3227
3220
3228
Returns
3221
3229
-------
3222
- StataWriter118
3230
+ StataWriterUTF8
3223
3231
The instance has a write_file method, which will write the file to the
3224
3232
given `fname`.
3225
3233
@@ -3238,24 +3246,60 @@ class StataWriter118(StataWriter117):
3238
3246
--------
3239
3247
Using Unicode data and column names
3240
3248
3241
- >>> from pandas.io.stata import StataWriter118
3249
+ >>> from pandas.io.stata import StataWriterUTF8
3242
3250
>>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])
3243
- >>> writer = StataWriter118 ('./data_file.dta', data)
3251
+ >>> writer = StataWriterUTF8 ('./data_file.dta', data)
3244
3252
>>> writer.write_file()
3245
3253
3246
3254
Or with long strings stored in strl format
3247
3255
3248
3256
>>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],
3249
3257
... columns=['strls'])
3250
- >>> writer = StataWriter118 ('./data_file_with_long_strings.dta', data,
3251
- ... convert_strl=['strls'])
3258
+ >>> writer = StataWriterUTF8 ('./data_file_with_long_strings.dta', data,
3259
+ ... convert_strl=['strls'])
3252
3260
>>> writer.write_file()
3253
3261
"""
3254
3262
3255
3263
_encoding = "utf-8"
3256
- _dta_version = 118
3257
3264
3258
- def _validate_variable_name (self , name ):
3265
+ def __init__ (
3266
+ self ,
3267
+ fname : FilePathOrBuffer ,
3268
+ data : DataFrame ,
3269
+ convert_dates : Optional [Dict [Hashable , str ]] = None ,
3270
+ write_index : bool = True ,
3271
+ byteorder : Optional [str ] = None ,
3272
+ time_stamp : Optional [datetime .datetime ] = None ,
3273
+ data_label : Optional [str ] = None ,
3274
+ variable_labels : Optional [Dict [Hashable , str ]] = None ,
3275
+ convert_strl : Optional [Sequence [Hashable ]] = None ,
3276
+ version : Optional [int ] = None ,
3277
+ ):
3278
+ if version is None :
3279
+ version = 118 if data .shape [1 ] <= 32767 else 119
3280
+ elif version not in (118 , 119 ):
3281
+ raise ValueError ("version must be either 118 or 119." )
3282
+ elif version == 118 and data .shape [1 ] > 32767 :
3283
+ raise ValueError (
3284
+ "You must use version 119 for data sets containing more than "
3285
+ "32,767 variables"
3286
+ )
3287
+
3288
+ super ().__init__ (
3289
+ fname ,
3290
+ data ,
3291
+ convert_dates = convert_dates ,
3292
+ write_index = write_index ,
3293
+ byteorder = byteorder ,
3294
+ time_stamp = time_stamp ,
3295
+ data_label = data_label ,
3296
+ variable_labels = variable_labels ,
3297
+ convert_strl = convert_strl ,
3298
+ )
3299
+ # Override version set in StataWriter117 init
3300
+ self ._dta_version = version
3301
+
3302
+ def _validate_variable_name (self , name : str ) -> str :
3259
3303
"""
3260
3304
Validate variable names for Stata export.
3261
3305
@@ -3272,7 +3316,7 @@ def _validate_variable_name(self, name):
3272
3316
3273
3317
Notes
3274
3318
-----
3275
- Stata 118 support most unicode characters. The only limatation is in
3319
+ Stata 118+ support most unicode characters. The only limitation is in
3276
3320
the ascii range where the characters supported are a-z, A-Z, 0-9 and _.
3277
3321
"""
3278
3322
# High code points appear to be acceptable
0 commit comments