47
47
from pandas .io .common import get_filepath_or_buffer , stringify_path
48
48
49
49
_version_error = (
50
- "Version of given Stata file is not 104, 105, 108, "
51
- "111 (Stata 7SE), 113 (Stata 8/9), 114 (Stata 10/11), "
52
- "115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)"
50
+ "Version of given Stata file is {version}. pandas supports importing "
51
+ "versions 104, 105, 108, 111 (Stata 7SE), 113 (Stata 8/9), "
52
+ "114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), 118 (Stata 14/15/16), "
53
+ "and 119 (Stata 15/16, over 32,767 variables)."
53
54
)
54
55
55
56
_statafile_processing_params1 = """\
@@ -1091,11 +1092,11 @@ def _read_header(self):
1091
1092
self .col_sizes = [self ._calcsize (typ ) for typ in self .typlist ]
1092
1093
1093
1094
def _read_new_header (self , first_char ):
1094
- # The first part of the header is common to 117 and 118 .
1095
+ # The first part of the header is common to 117 - 119 .
1095
1096
self .path_or_buf .read (27 ) # stata_dta><header><release>
1096
1097
self .format_version = int (self .path_or_buf .read (3 ))
1097
1098
if self .format_version not in [117 , 118 , 119 ]:
1098
- raise ValueError (_version_error )
1099
+ raise ValueError (_version_error . format ( version = self . format_version ) )
1099
1100
self ._set_encoding ()
1100
1101
self .path_or_buf .read (21 ) # </release><byteorder>
1101
1102
self .byteorder = self .path_or_buf .read (3 ) == b"MSF" and ">" or "<"
@@ -1288,7 +1289,7 @@ def _get_seek_variable_labels(self):
1288
1289
def _read_old_header (self , first_char ):
1289
1290
self .format_version = struct .unpack ("b" , first_char )[0 ]
1290
1291
if self .format_version not in [104 , 105 , 108 , 111 , 113 , 114 , 115 ]:
1291
- raise ValueError (_version_error )
1292
+ raise ValueError (_version_error . format ( version = self . format_version ) )
1292
1293
self ._set_encoding ()
1293
1294
self .byteorder = (
1294
1295
struct .unpack ("b" , self .path_or_buf .read (1 ))[0 ] == 0x1 and ">" or "<"
@@ -2884,7 +2885,6 @@ class StataWriter117(StataWriter):
2884
2885
"""
2885
2886
2886
2887
_max_string_length = 2045
2887
- _dta_version = 117
2888
2888
2889
2889
def __init__ (
2890
2890
self ,
@@ -2900,6 +2900,7 @@ def __init__(
2900
2900
):
2901
2901
# Shallow copy since convert_strl might be modified later
2902
2902
self ._convert_strl = [] if convert_strl is None else convert_strl [:]
2903
+ self ._dta_version = 117
2903
2904
2904
2905
super ().__init__ (
2905
2906
fname ,
@@ -2934,9 +2935,14 @@ def _write_header(self, data_label=None, time_stamp=None):
2934
2935
bio .write (self ._tag (bytes (str (self ._dta_version ), "utf-8" ), "release" ))
2935
2936
# byteorder
2936
2937
bio .write (self ._tag (byteorder == ">" and "MSF" or "LSF" , "byteorder" ))
2937
- # number of vars, 2 bytes
2938
- assert self .nvar < 2 ** 16
2939
- bio .write (self ._tag (struct .pack (byteorder + "H" , self .nvar ), "K" ))
2938
+ if self ._dta_version < 119 and self .nvar > 32767 :
2939
+ raise RuntimeError (
2940
+ "You must use version 119 for data sets containing more than "
2941
+ "32,767 variables"
2942
+ )
2943
+ # number of vars, 2 bytes in 117 and 118, 4 bytes in 119
2944
+ nvar_type = "H" if self ._dta_version <= 118 else "I"
2945
+ bio .write (self ._tag (struct .pack (byteorder + nvar_type , self .nvar ), "K" ))
2940
2946
# 117 uses 4 bytes, 118 and 119 use 8
2941
2947
nobs_size = "I" if self ._dta_version == 117 else "Q"
2942
2948
bio .write (self ._tag (struct .pack (byteorder + nobs_size , self .nobs ), "N" ))
@@ -3033,7 +3039,8 @@ def _write_varnames(self):
3033
3039
3034
3040
def _write_sortlist (self ):
3035
3041
self ._update_map ("sortlist" )
3036
- self ._file .write (self ._tag (b"\x00 \00 " * (self .nvar + 1 ), "sortlist" ))
3042
+ sort_size = 2 if self ._dta_version < 119 else 4
3043
+ self ._file .write (self ._tag (b"\x00 " * sort_size * (self .nvar + 1 ), "sortlist" ))
3037
3044
3038
3045
def _write_formats (self ):
3039
3046
self ._update_map ("formats" )
@@ -3173,13 +3180,14 @@ def _set_formats_and_types(self, dtypes):
3173
3180
)
3174
3181
3175
3182
3176
- class StataWriter118 (StataWriter117 ):
3183
+ class StataWriterUTF8 (StataWriter117 ):
3177
3184
"""
3178
- A class for writing Stata binary dta files in Stata 15 format (118)
3185
+ Stata binary dta file writing in Stata 15 (118) and 16 (119) formats
3179
3186
3180
- DTA 118 format files support unicode string data (both fixed and strL)
3181
- format. Unicode is also supported in value labels, variable labels and
3182
- the dataset label.
3187
+ DTA 118 and 119 format files support unicode string data (both fixed
3188
+ and strL) format. Unicode is also supported in value labels, variable
3189
+ labels and the dataset label. Format 119 is automatically used if the
3190
+ file contains more than 32,767 variables.
3183
3191
3184
3192
.. versionadded:: 1.0.0
3185
3193
@@ -3216,10 +3224,14 @@ class StataWriter118(StataWriter117):
3216
3224
Smaller columns can be converted by including the column name. Using
3217
3225
StrLs can reduce output file size when strings are longer than 8
3218
3226
characters, and either frequently repeated or sparse.
3227
+ version : int, optional
3228
+ The dta version to use. By default, uses the size of data to determine
3229
+ the version. 118 is used if data.shape[1] <= 32767, and 119 is used
3230
+ for storing larger DataFrames.
3219
3231
3220
3232
Returns
3221
3233
-------
3222
- StataWriter118
3234
+ StataWriterUTF8
3223
3235
The instance has a write_file method, which will write the file to the
3224
3236
given `fname`.
3225
3237
@@ -3238,22 +3250,52 @@ class StataWriter118(StataWriter117):
3238
3250
--------
3239
3251
Using Unicode data and column names
3240
3252
3241
- >>> from pandas.io.stata import StataWriter118
3253
+ >>> from pandas.io.stata import StataWriterUTF8
3242
3254
>>> data = pd.DataFrame([[1.0, 1, 'ᴬ']], columns=['a', 'β', 'ĉ'])
3243
- >>> writer = StataWriter118 ('./data_file.dta', data)
3255
+ >>> writer = StataWriterUTF8 ('./data_file.dta', data)
3244
3256
>>> writer.write_file()
3245
3257
3246
3258
Or with long strings stored in strl format
3247
3259
3248
3260
>>> data = pd.DataFrame([['ᴀ relatively long ŝtring'], [''], ['']],
3249
3261
... columns=['strls'])
3250
- >>> writer = StataWriter118 ('./data_file_with_long_strings.dta', data,
3251
- ... convert_strl=['strls'])
3262
+ >>> writer = StataWriterUTF8 ('./data_file_with_long_strings.dta', data,
3263
+ ... convert_strl=['strls'])
3252
3264
>>> writer.write_file()
3253
3265
"""
3254
3266
3255
3267
_encoding = "utf-8"
3256
- _dta_version = 118
3268
+
3269
+ def __init__ (
3270
+ self ,
3271
+ fname ,
3272
+ data ,
3273
+ convert_dates = None ,
3274
+ write_index = True ,
3275
+ byteorder = None ,
3276
+ time_stamp = None ,
3277
+ data_label = None ,
3278
+ variable_labels = None ,
3279
+ convert_strl = None ,
3280
+ version = None ,
3281
+ ):
3282
+ if version is None :
3283
+ version = 118 if data .shape [1 ] <= 32767 else 119
3284
+ elif version not in (118 , 119 ):
3285
+ raise ValueError ("version must be either 118 or 119." )
3286
+ super ().__init__ (
3287
+ fname ,
3288
+ data ,
3289
+ convert_dates = convert_dates ,
3290
+ write_index = write_index ,
3291
+ byteorder = byteorder ,
3292
+ time_stamp = time_stamp ,
3293
+ data_label = data_label ,
3294
+ variable_labels = variable_labels ,
3295
+ convert_strl = convert_strl ,
3296
+ )
3297
+ # Override version set in StataWriter117 init
3298
+ self ._dta_version = version
3257
3299
3258
3300
def _validate_variable_name (self , name ):
3259
3301
"""
0 commit comments