Skip to content

Commit 7d49037

Browse files
committed
Merge pull request #6553 from bashtage/time_stamp-data_label-reserved_words
ENH: Allow timestamp and data label to be set when exporting to Stata
2 parents 3590d8c + 0638be8 commit 7d49037

File tree

5 files changed

+67
-17
lines changed

5 files changed

+67
-17
lines changed

doc/source/release.rst

+1
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ Improvements to existing features
147147
- perf improvements in DataFrame construction with certain offsets, by removing faulty caching
148148
(e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`)
149149
- perf improvements in single-dtyped indexing (:issue:`6484`)
150+
- ``StataWriter`` and ``DataFrame.to_stata`` accept time stamp and data labels (:issue:`6545`)
150151

151152
.. _release.bug_fixes-0.14.0:
152153

doc/source/v0.14.0.txt

+3
Original file line numberDiff line numberDiff line change
@@ -312,6 +312,9 @@ Enhancements
312312
- ``DataFrame.to_stata`` will now check data for compatibility with Stata data types
313313
and will upcast when needed. When it isn't possible to losslessly upcast, a warning
314314
is raised (:issue:`6327`)
315+
- ``DataFrame.to_stata`` and ``StataWriter`` will accept keyword arguments time_stamp
316+
and data_label which allow the time stamp and dataset label to be set when creating a
317+
file. (:issue:`6545`)
315318

316319
Performance
317320
~~~~~~~~~~~

pandas/core/frame.py

+3-2
Original file line numberDiff line numberDiff line change
@@ -1216,7 +1216,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='',
12161216

12171217
def to_stata(
12181218
self, fname, convert_dates=None, write_index=True, encoding="latin-1",
1219-
byteorder=None):
1219+
byteorder=None, time_stamp=None, data_label=None):
12201220
"""
12211221
Write a Stata binary dta file from array-like objects
12221222
@@ -1247,7 +1247,8 @@ def to_stata(
12471247
"""
12481248
from pandas.io.stata import StataWriter
12491249
writer = StataWriter(fname, self, convert_dates=convert_dates,
1250-
encoding=encoding, byteorder=byteorder)
1250+
encoding=encoding, byteorder=byteorder,
1251+
time_stamp=time_stamp, data_label=data_label)
12511252
writer.write_file()
12521253

12531254
def to_sql(self, name, con, flavor='sqlite', if_exists='fail', **kwargs):

pandas/io/stata.py

+33-10
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,18 @@ def __init__(self, encoding):
375375
'd': np.float64(struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0])
376376
}
377377

378+
# Reserved words cannot be used as variable names
379+
self.RESERVED_WORDS = ('aggregate', 'array', 'boolean', 'break',
380+
'byte', 'case', 'catch', 'class', 'colvector',
381+
'complex', 'const', 'continue', 'default',
382+
'delegate', 'delete', 'do', 'double', 'else',
383+
'eltypedef', 'end', 'enum', 'explicit',
384+
'export', 'external', 'float', 'for', 'friend',
385+
'function', 'global', 'goto', 'if', 'inline',
386+
'int', 'local', 'long', 'NULL', 'pragma',
387+
'protected', 'quad', 'rowvector', 'short',
388+
'typedef', 'typename', 'virtual')
389+
378390
def _decode_bytes(self, str, errors=None):
379391
if compat.PY3 or self._encoding is not None:
380392
return str.decode(self._encoding, errors)
@@ -449,10 +461,10 @@ def _read_header(self):
449461
self.path_or_buf.read(4))[0]
450462
self.path_or_buf.read(11) # </N><label>
451463
strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
452-
self.data_label = self.path_or_buf.read(strlen)
464+
self.data_label = self._null_terminate(self.path_or_buf.read(strlen))
453465
self.path_or_buf.read(19) # </label><timestamp>
454466
strlen = struct.unpack('b', self.path_or_buf.read(1))[0]
455-
self.time_stamp = self.path_or_buf.read(strlen)
467+
self.time_stamp = self._null_terminate(self.path_or_buf.read(strlen))
456468
self.path_or_buf.read(26) # </timestamp></header><map>
457469
self.path_or_buf.read(8) # 0x0000000000000000
458470
self.path_or_buf.read(8) # position of <map>
@@ -543,11 +555,11 @@ def _read_header(self):
543555
self.nobs = struct.unpack(self.byteorder + 'I',
544556
self.path_or_buf.read(4))[0]
545557
if self.format_version > 105:
546-
self.data_label = self.path_or_buf.read(81)
558+
self.data_label = self._null_terminate(self.path_or_buf.read(81))
547559
else:
548-
self.data_label = self.path_or_buf.read(32)
560+
self.data_label = self._null_terminate(self.path_or_buf.read(32))
549561
if self.format_version > 104:
550-
self.time_stamp = self.path_or_buf.read(18)
562+
self.time_stamp = self._null_terminate(self.path_or_buf.read(18))
551563

552564
# descriptors
553565
if self.format_version > 108:
@@ -1029,6 +1041,11 @@ class StataWriter(StataParser):
10291041
byteorder : str
10301042
Can be ">", "<", "little", or "big". The default is None which uses
10311043
`sys.byteorder`
1044+
time_stamp : datetime
1045+
A date time to use when writing the file. Can be None, in which
1046+
case the current time is used.
1047+
data_label : str
1048+
A label for the data set. Should be 80 characters or smaller.
10321049
10331050
Returns
10341051
-------
@@ -1047,10 +1064,13 @@ class StataWriter(StataParser):
10471064
>>> writer.write_file()
10481065
"""
10491066
def __init__(self, fname, data, convert_dates=None, write_index=True,
1050-
encoding="latin-1", byteorder=None):
1067+
encoding="latin-1", byteorder=None, time_stamp=None,
1068+
data_label=None):
10511069
super(StataWriter, self).__init__(encoding)
10521070
self._convert_dates = convert_dates
10531071
self._write_index = write_index
1072+
self._time_stamp = time_stamp
1073+
self._data_label = data_label
10541074
# attach nobs, nvars, data, varlist, typlist
10551075
self._prepare_pandas(data)
10561076

@@ -1086,7 +1106,7 @@ def __iter__(self):
10861106

10871107
if self._write_index:
10881108
data = data.reset_index()
1089-
# Check columns for compatbaility with stata
1109+
# Check columns for compatibility with stata
10901110
data = _cast_to_stata_types(data)
10911111
self.datarows = DataFrameRowIter(data)
10921112
self.nobs, self.nvar = data.shape
@@ -1110,7 +1130,8 @@ def __iter__(self):
11101130
self.fmtlist[key] = self._convert_dates[key]
11111131

11121132
def write_file(self):
1113-
self._write_header()
1133+
self._write_header(time_stamp=self._time_stamp,
1134+
data_label=self._data_label)
11141135
self._write_descriptors()
11151136
self._write_variable_labels()
11161137
# write 5 zeros for expansion fields
@@ -1147,7 +1168,7 @@ def _write_header(self, data_label=None, time_stamp=None):
11471168
# format dd Mon yyyy hh:mm
11481169
if time_stamp is None:
11491170
time_stamp = datetime.datetime.now()
1150-
elif not isinstance(time_stamp, datetime):
1171+
elif not isinstance(time_stamp, datetime.datetime):
11511172
raise ValueError("time_stamp should be datetime type")
11521173
self._file.write(
11531174
self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M"))
@@ -1169,7 +1190,9 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None,
11691190
for c in name:
11701191
if (c < 'A' or c > 'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_':
11711192
name = name.replace(c, '_')
1172-
1193+
# Variable name must not be a reserved word
1194+
if name in self.RESERVED_WORDS:
1195+
name = '_' + name
11731196
# Variable name may not start with a number
11741197
if name[0] > '0' and name[0] < '9':
11751198
name = '_' + name

pandas/io/tests/test_stata.py

+27-5
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
# pylint: disable=E1101
22

33
from datetime import datetime
4+
import datetime as dt
45
import os
56
import warnings
67
import nose
@@ -248,7 +249,7 @@ def test_read_write_dta10(self):
248249

249250
original = DataFrame(data=[["string", "object", 1, 1.1,
250251
np.datetime64('2003-12-25')]],
251-
columns=['string', 'object', 'integer', 'float',
252+
columns=['string', 'object', 'integer', 'floating',
252253
'datetime'])
253254
original["object"] = Series(original["object"], dtype=object)
254255
original.index.name = 'index'
@@ -304,10 +305,20 @@ def test_read_write_dta11(self):
304305
def test_read_write_dta12(self):
305306
# skip_if_not_little_endian()
306307

307-
original = DataFrame([(1, 2, 3, 4)],
308-
columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-'])
309-
formatted = DataFrame([(1, 2, 3, 4)],
310-
columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_'])
308+
original = DataFrame([(1, 2, 3, 4, 5, 6)],
309+
columns=['astringwithmorethan32characters_1',
310+
'astringwithmorethan32characters_2',
311+
'+',
312+
'-',
313+
'short',
314+
'delete'])
315+
formatted = DataFrame([(1, 2, 3, 4, 5, 6)],
316+
columns=['astringwithmorethan32characters_',
317+
'_0astringwithmorethan32character',
318+
'_',
319+
'_1_',
320+
'_short',
321+
'_delete'])
311322
formatted.index.name = 'index'
312323
formatted = formatted.astype(np.int32)
313324

@@ -376,6 +387,17 @@ def test_read_write_reread_dta15(self):
376387
tm.assert_frame_equal(parsed_113, parsed_114)
377388
tm.assert_frame_equal(parsed_114, parsed_115)
378389

390+
def test_timestamp_and_label(self):
391+
original = DataFrame([(1,)], columns=['var'])
392+
time_stamp = datetime(2000, 2, 29, 14, 21)
393+
data_label = 'This is a data file.'
394+
with tm.ensure_clean() as path:
395+
original.to_stata(path, time_stamp=time_stamp, data_label=data_label)
396+
reader = StataReader(path)
397+
parsed_time_stamp = dt.datetime.strptime(reader.time_stamp, ('%d %b %Y %H:%M'))
398+
assert parsed_time_stamp == time_stamp
399+
assert reader.data_label == data_label
400+
379401

380402

381403
if __name__ == '__main__':

0 commit comments

Comments
 (0)