From 0638be8276b2f7b959157a1fc3df9ee1d339bca5 Mon Sep 17 00:00:00 2001 From: bashtage Date: Wed, 5 Mar 2014 14:28:58 +0000 Subject: [PATCH] ENH: Allow timestamp and data label to be set when exporting to Stata Added code which allows the time stamp and the data label to be set using either StataWriter or to_stata. Also simplified reading these values using StataReader by removing null bytes from the string values read. Added basic test for both. Also fixed one small bug where variables could be stored using Stata reserved words. --- doc/source/release.rst | 1 + doc/source/v0.14.0.txt | 3 +++ pandas/core/frame.py | 5 ++-- pandas/io/stata.py | 43 +++++++++++++++++++++++++++-------- pandas/io/tests/test_stata.py | 32 ++++++++++++++++++++++---- 5 files changed, 67 insertions(+), 17 deletions(-) diff --git a/doc/source/release.rst b/doc/source/release.rst index 183d662f1578d..192d96884a02b 100644 --- a/doc/source/release.rst +++ b/doc/source/release.rst @@ -147,6 +147,7 @@ Improvements to existing features - perf improvements in DataFrame construction with certain offsets, by removing faulty caching (e.g. MonthEnd,BusinessMonthEnd), (:issue:`6479`) - perf improvements in single-dtyped indexing (:issue:`6484`) +- ``StataWriter`` and ``DataFrame.to_stata`` accept time stamp and data labels (:issue:`6545`) .. _release.bug_fixes-0.14.0: diff --git a/doc/source/v0.14.0.txt b/doc/source/v0.14.0.txt index 7c6e6a01cd041..86034c20f63d8 100644 --- a/doc/source/v0.14.0.txt +++ b/doc/source/v0.14.0.txt @@ -312,6 +312,9 @@ Enhancements - ``DataFrame.to_stata`` will now check data for compatibility with Stata data types and will upcast when needed. When it isn't possibly to losslessly upcast, a warning is raised (:issue:`6327`) +- ``DataFrame.to_stata`` and ``StataWriter`` will accept keyword arguments time_stamp + and data_label which allow the time stamp and dataset label to be set when creating a + file. 
(:issue:`6545`) Performance ~~~~~~~~~~~ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 4c02c8abab353..6885ce95a8505 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -1216,7 +1216,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', def to_stata( self, fname, convert_dates=None, write_index=True, encoding="latin-1", - byteorder=None): + byteorder=None, time_stamp=None, data_label=None): """ A class for writing Stata binary dta files from array-like objects @@ -1247,7 +1247,8 @@ def to_stata( """ from pandas.io.stata import StataWriter writer = StataWriter(fname, self, convert_dates=convert_dates, - encoding=encoding, byteorder=byteorder) + encoding=encoding, byteorder=byteorder, + time_stamp=time_stamp, data_label=data_label) writer.write_file() def to_sql(self, name, con, flavor='sqlite', if_exists='fail', **kwargs): diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 2ecdb22a5cc7b..7d9d272eea1b6 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -375,6 +375,18 @@ def __init__(self, encoding): 'd': np.float64(struct.unpack(' strlen = struct.unpack('b', self.path_or_buf.read(1))[0] - self.time_stamp = self.path_or_buf.read(strlen) + self.time_stamp = self._null_terminate(self.path_or_buf.read(strlen)) self.path_or_buf.read(26) # self.path_or_buf.read(8) # 0x0000000000000000 self.path_or_buf.read(8) # position of @@ -543,11 +555,11 @@ def _read_header(self): self.nobs = struct.unpack(self.byteorder + 'I', self.path_or_buf.read(4))[0] if self.format_version > 105: - self.data_label = self.path_or_buf.read(81) + self.data_label = self._null_terminate(self.path_or_buf.read(81)) else: - self.data_label = self.path_or_buf.read(32) + self.data_label = self._null_terminate(self.path_or_buf.read(32)) if self.format_version > 104: - self.time_stamp = self.path_or_buf.read(18) + self.time_stamp = self._null_terminate(self.path_or_buf.read(18)) # descriptors if self.format_version > 108: @@ -1029,6 +1041,11 @@ 
class StataWriter(StataParser): byteorder : str Can be ">", "<", "little", or "big". The default is None which uses `sys.byteorder` + time_stamp : datetime + A date time to use when writing the file. Can be None, in which + case the current time is used. + data_label : str + A label for the data set. Should be 80 characters or smaller. Returns ------- @@ -1047,10 +1064,13 @@ class StataWriter(StataParser): >>> writer.write_file() """ def __init__(self, fname, data, convert_dates=None, write_index=True, - encoding="latin-1", byteorder=None): + encoding="latin-1", byteorder=None, time_stamp=None, + data_label=None): super(StataWriter, self).__init__(encoding) self._convert_dates = convert_dates self._write_index = write_index + self._time_stamp = time_stamp + self._data_label = data_label # attach nobs, nvars, data, varlist, typlist self._prepare_pandas(data) @@ -1086,7 +1106,7 @@ def __iter__(self): if self._write_index: data = data.reset_index() - # Check columns for compatbaility with stata + # Check columns for compatibility with stata data = _cast_to_stata_types(data) self.datarows = DataFrameRowIter(data) self.nobs, self.nvar = data.shape @@ -1110,7 +1130,8 @@ def __iter__(self): self.fmtlist[key] = self._convert_dates[key] def write_file(self): - self._write_header() + self._write_header(time_stamp=self._time_stamp, + data_label=self._data_label) self._write_descriptors() self._write_variable_labels() # write 5 zeros for expansion fields @@ -1147,7 +1168,7 @@ def _write_header(self, data_label=None, time_stamp=None): # format dd Mon yyyy hh:mm if time_stamp is None: time_stamp = datetime.datetime.now() - elif not isinstance(time_stamp, datetime): + elif not isinstance(time_stamp, datetime.datetime): raise ValueError("time_stamp should be datetime type") self._file.write( self._null_terminate(time_stamp.strftime("%d %b %Y %H:%M")) @@ -1169,7 +1190,9 @@ def _write_descriptors(self, typlist=None, varlist=None, srtlist=None, for c in name: if (c < 'A' or c > 
'Z') and (c < 'a' or c > 'z') and (c < '0' or c > '9') and c != '_': name = name.replace(c, '_') - + # Variable name must not be a reserved word + if name in self.RESERVED_WORDS: + name = '_' + name # Variable name may not start with a number if name[0] > '0' and name[0] < '9': name = '_' + name diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index ac4b9662fc57e..307cd1bd591fb 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -1,6 +1,7 @@ # pylint: disable=E1101 from datetime import datetime +import datetime as dt import os import warnings import nose @@ -248,7 +249,7 @@ def test_read_write_dta10(self): original = DataFrame(data=[["string", "object", 1, 1.1, np.datetime64('2003-12-25')]], - columns=['string', 'object', 'integer', 'float', + columns=['string', 'object', 'integer', 'floating', 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' @@ -304,10 +305,20 @@ def test_read_write_dta11(self): def test_read_write_dta12(self): # skip_if_not_little_endian() - original = DataFrame([(1, 2, 3, 4)], - columns=['astringwithmorethan32characters_1', 'astringwithmorethan32characters_2', '+', '-']) - formatted = DataFrame([(1, 2, 3, 4)], - columns=['astringwithmorethan32characters_', '_0astringwithmorethan32character', '_', '_1_']) + original = DataFrame([(1, 2, 3, 4, 5, 6)], + columns=['astringwithmorethan32characters_1', + 'astringwithmorethan32characters_2', + '+', + '-', + 'short', + 'delete']) + formatted = DataFrame([(1, 2, 3, 4, 5, 6)], + columns=['astringwithmorethan32characters_', + '_0astringwithmorethan32character', + '_', + '_1_', + '_short', + '_delete']) formatted.index.name = 'index' formatted = formatted.astype(np.int32) @@ -376,6 +387,17 @@ def test_read_write_reread_dta15(self): tm.assert_frame_equal(parsed_113, parsed_114) tm.assert_frame_equal(parsed_114, parsed_115) + def test_timestamp_and_label(self): + original = DataFrame([(1,)], 
columns=['var']) + time_stamp = datetime(2000, 2, 29, 14, 21) + data_label = 'This is a data file.' + with tm.ensure_clean() as path: + original.to_stata(path, time_stamp=time_stamp, data_label=data_label) + reader = StataReader(path) + parsed_time_stamp = dt.datetime.strptime(reader.time_stamp, ('%d %b %Y %H:%M')) + assert parsed_time_stamp == time_stamp + assert reader.data_label == data_label + if __name__ == '__main__':