Skip to content

Commit 0fe5a34

Browse files
bashtage and jreback
authored and committed
ENH: Enable automatic writing of dates to Stata files
Automatically select type %tc for datetime64[ns] columns. Change ValueErrors to NotImplementedError for unsupported types. Add tests for select exceptions. Improve to_stata and StataWriter docstrings. Closes pandas-dev#12259, closes pandas-dev#13710.
1 parent b37ec14 commit 0fe5a34

File tree

4 files changed

+124
-49
lines changed

4 files changed

+124
-49
lines changed

doc/source/whatsnew/v0.19.0.txt

+1
Original file line numberDiff line numberDiff line change
@@ -310,6 +310,7 @@ Other enhancements
310310
- ``Series`` has gained the properties ``.is_monotonic``, ``.is_monotonic_increasing``, ``.is_monotonic_decreasing``, similar to ``Index`` (:issue:`13336`)
311311
- ``Series.append`` now supports the ``ignore_index`` option (:issue:`13677`)
312312
- ``.to_stata()`` and ``StataWriter`` can now write variable labels to Stata dta files using a dictionary to map column names to labels (:issue:`13535`, :issue:`13536`)
313+
- ``.to_stata()`` and ``StataWriter`` will automatically convert ``datetime64[ns]`` columns to Stata format ``%tc``, rather than raising a ``ValueError`` (:issue:`12259`)
313314

314315
.. _whatsnew_0190.api:
315316

pandas/core/frame.py

+25-14
Original file line numberDiff line numberDiff line change
@@ -1473,31 +1473,42 @@ def to_stata(self, fname, convert_dates=None, write_index=True,
14731473
14741474
Parameters
14751475
----------
1476-
fname : file path or buffer
1477-
Where to save the dta file.
1476+
fname : str or buffer
1477+
String path or file-like object
14781478
convert_dates : dict
1479-
Dictionary mapping column of datetime types to the stata internal
1480-
format that you want to use for the dates. Options are
1481-
'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
1482-
number or a name.
1479+
Dictionary mapping columns containing datetime types to stata internal
1480+
format to use when writing the dates. Options are 'tc', 'td', 'tm',
1481+
'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
1482+
Datetime columns that do not have a conversion type specified will be
1483+
converted to 'tc'. Raises NotImplementedError if a datetime column has
1484+
timezone information
14831485
write_index : bool
14841486
Write the index to Stata dataset.
14851487
encoding : str
1486-
Default is latin-1. Note that Stata does not support unicode.
1488+
Default is latin-1. Unicode is not supported
14871489
byteorder : str
1488-
Can be ">", "<", "little", or "big". The default is None which uses
1489-
`sys.byteorder`
1490+
Can be ">", "<", "little", or "big". Default is `sys.byteorder`
14901491
time_stamp : datetime
1491-
A date time to use when writing the file. Can be None, in which
1492-
case the current time is used.
1492+
A datetime to use as file creation date. Default is the current time
14931493
dataset_label : str
1494-
A label for the data set. Should be 80 characters or smaller.
1494+
A label for the data set. Must be 80 characters or smaller.
14951495
14961496
.. versionadded:: 0.19.0
14971497
14981498
variable_labels : dict
1499-
Dictionary containing columns as keys and variable labels as
1500-
values. Each label must be 80 characters or smaller.
1499+
Dictionary containing columns as keys and variable labels as values.
1500+
Each label must be 80 characters or smaller.
1501+
1502+
Raises
1503+
------
1504+
NotImplementedError
1505+
* If datetimes contain timezone information
1506+
* Column dtype is not representable in Stata
1507+
ValueError
1508+
* Columns listed in convert_dates are neither datetime64[ns]
1509+
nor datetime.datetime
1510+
* Column listed in convert_dates is not in DataFrame
1511+
* Categorical label contains more than 32,000 characters
15011512
15021513
Examples
15031514
--------

pandas/io/stata.py

+50-33
Original file line numberDiff line numberDiff line change
@@ -432,7 +432,7 @@ def parse_dates_safe(dates, delta=False, year=False, days=False):
432432
d = parse_dates_safe(dates, year=True)
433433
conv_dates = d.year
434434
else:
435-
raise ValueError("fmt %s not understood" % fmt)
435+
raise ValueError("Format %s is not a known Stata date format" % fmt)
436436

437437
conv_dates = Series(conv_dates, dtype=np.float64)
438438
missing_value = struct.unpack('<d', b'\x00\x00\x00\x00\x00\x00\xe0\x7f')[0]
@@ -1709,7 +1709,7 @@ def _convert_datetime_to_stata_type(fmt):
17091709
"%tq", "th", "%th", "ty", "%ty"]:
17101710
return np.float64 # Stata expects doubles for SIFs
17111711
else:
1712-
raise ValueError("fmt %s not understood" % fmt)
1712+
raise NotImplementedError("Format %s not implemented" % fmt)
17131713

17141714

17151715
def _maybe_convert_to_int_keys(convert_dates, varlist):
@@ -1721,9 +1721,8 @@ def _maybe_convert_to_int_keys(convert_dates, varlist):
17211721
new_dict.update({varlist.index(key): convert_dates[key]})
17221722
else:
17231723
if not isinstance(key, int):
1724-
raise ValueError(
1725-
"convert_dates key is not in varlist and is not an int"
1726-
)
1724+
raise ValueError("convert_dates key must be a "
1725+
"column or an integer")
17271726
new_dict.update({key: convert_dates[key]})
17281727
return new_dict
17291728

@@ -1763,8 +1762,7 @@ def _dtype_to_stata_type(dtype, column):
17631762
elif dtype == np.int8:
17641763
return chr(251)
17651764
else: # pragma : no cover
1766-
raise ValueError("Data type %s not currently understood. "
1767-
"Please report an error to the developers." % dtype)
1765+
raise NotImplementedError("Data type %s not supported." % dtype)
17681766

17691767

17701768
def _dtype_to_default_stata_fmt(dtype, column):
@@ -1801,35 +1799,36 @@ def _dtype_to_default_stata_fmt(dtype, column):
18011799
elif dtype == np.int8 or dtype == np.int16:
18021800
return "%8.0g"
18031801
else: # pragma : no cover
1804-
raise ValueError("Data type %s not currently understood. "
1805-
"Please report an error to the developers." % dtype)
1802+
raise NotImplementedError("Data type %s not supported." % dtype)
18061803

18071804

18081805
class StataWriter(StataParser):
18091806
"""
1810-
A class for writing Stata binary dta files from array-like objects
1807+
A class for writing Stata binary dta files
18111808
18121809
Parameters
18131810
----------
1814-
fname : file path or buffer
1815-
Where to save the dta file.
1816-
data : array-like
1817-
Array-like input to save. Pandas objects are also accepted.
1811+
fname : str or buffer
1812+
String path or file-like object
1813+
data : DataFrame
1814+
Input to save
18181815
convert_dates : dict
1819-
Dictionary mapping column of datetime types to the stata internal
1820-
format that you want to use for the dates. Options are
1821-
'tc', 'td', 'tm', 'tw', 'th', 'tq', 'ty'. Column can be either a
1822-
number or a name.
1816+
Dictionary mapping columns containing datetime types to stata internal
1817+
format to use when writing the dates. Options are 'tc', 'td', 'tm',
1818+
'tw', 'th', 'tq', 'ty'. Column can be either an integer or a name.
1819+
Datetime columns that do not have a conversion type specified will be
1820+
converted to 'tc'. Raises NotImplementedError if a datetime column has
1821+
timezone information
1822+
write_index : bool
1823+
Write the index to Stata dataset.
18231824
encoding : str
1824-
Default is latin-1. Note that Stata does not support unicode.
1825+
Default is latin-1. Unicode is not supported
18251826
byteorder : str
1826-
Can be ">", "<", "little", or "big". The default is None which uses
1827-
`sys.byteorder`
1827+
Can be ">", "<", "little", or "big". Default is `sys.byteorder`
18281828
time_stamp : datetime
1829-
A date time to use when writing the file. Can be None, in which
1830-
case the current time is used.
1829+
A datetime to use as file creation date. Default is the current time
18311830
dataset_label : str
1832-
A label for the data set. Should be 80 characters or smaller.
1831+
A label for the data set. Must be 80 characters or smaller.
18331832
18341833
.. versionadded:: 0.19.0
18351834
@@ -1843,6 +1842,17 @@ class StataWriter(StataParser):
18431842
The StataWriter instance has a write_file method, which will
18441843
write the file to the given `fname`.
18451844
1845+
Raises
1846+
------
1847+
NotImplementedError
1848+
* If datetimes contain timezone information
1849+
* Column dtype is not representable in Stata
1850+
ValueError
1851+
* Columns listed in convert_dates are neither datetime64[ns]
1852+
nor datetime.datetime
1853+
* Column listed in convert_dates is not in DataFrame
1854+
* Categorical label contains more than 32,000 characters
1855+
18461856
Examples
18471857
--------
18481858
>>> import pandas as pd
@@ -1861,7 +1871,7 @@ def __init__(self, fname, data, convert_dates=None, write_index=True,
18611871
encoding="latin-1", byteorder=None, time_stamp=None,
18621872
data_label=None, variable_labels=None):
18631873
super(StataWriter, self).__init__(encoding)
1864-
self._convert_dates = convert_dates
1874+
self._convert_dates = {} if convert_dates is None else convert_dates
18651875
self._write_index = write_index
18661876
self._time_stamp = time_stamp
18671877
self._data_label = data_label
@@ -2041,15 +2051,22 @@ def _prepare_pandas(self, data):
20412051
self.varlist = data.columns.tolist()
20422052

20432053
dtypes = data.dtypes
2044-
if self._convert_dates is not None:
2045-
self._convert_dates = _maybe_convert_to_int_keys(
2046-
self._convert_dates, self.varlist
2054+
2055+
# Ensure all date columns are converted
2056+
for col in data:
2057+
if col in self._convert_dates:
2058+
continue
2059+
if is_datetime64_dtype(data[col]):
2060+
self._convert_dates[col] = 'tc'
2061+
2062+
self._convert_dates = _maybe_convert_to_int_keys(self._convert_dates,
2063+
self.varlist)
2064+
for key in self._convert_dates:
2065+
new_type = _convert_datetime_to_stata_type(
2066+
self._convert_dates[key]
20472067
)
2048-
for key in self._convert_dates:
2049-
new_type = _convert_datetime_to_stata_type(
2050-
self._convert_dates[key]
2051-
)
2052-
dtypes[key] = np.dtype(new_type)
2068+
dtypes[key] = np.dtype(new_type)
2069+
20532070
self.typlist = []
20542071
self.fmtlist = []
20552072
for col, dtype in dtypes.iteritems():

pandas/io/tests/test_stata.py

+48-2
Original file line numberDiff line numberDiff line change
@@ -11,17 +11,17 @@
1111

1212
import nose
1313
import numpy as np
14+
from pandas.tslib import NaT
1415

1516
import pandas as pd
1617
import pandas.util.testing as tm
1718
from pandas import compat
1819
from pandas.compat import iterkeys
1920
from pandas.core.frame import DataFrame, Series
20-
from pandas.types.common import is_categorical_dtype
21-
from pandas.tslib import NaT
2221
from pandas.io.parsers import read_csv
2322
from pandas.io.stata import (read_stata, StataReader, InvalidColumnName,
2423
PossiblePrecisionLoss, StataMissingValue)
24+
from pandas.types.common import is_categorical_dtype
2525

2626

2727
class TestStata(tm.TestCase):
@@ -1165,6 +1165,52 @@ def test_write_variable_label_errors(self):
11651165
with tm.ensure_clean() as path:
11661166
original.to_stata(path, variable_labels=variable_labels_long)
11671167

1168+
def test_default_date_conversion(self):
1169+
# GH 12259
1170+
dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
1171+
dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
1172+
dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
1173+
original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
1174+
'strs': ['apple', 'banana', 'cherry'],
1175+
'dates': dates})
1176+
1177+
with tm.ensure_clean() as path:
1178+
original.to_stata(path, write_index=False)
1179+
reread = read_stata(path, convert_dates=True)
1180+
tm.assert_frame_equal(original, reread)
1181+
1182+
original.to_stata(path,
1183+
write_index=False,
1184+
convert_dates={'dates': 'tc'})
1185+
direct = read_stata(path, convert_dates=True)
1186+
tm.assert_frame_equal(reread, direct)
1187+
1188+
def test_unsupported_type(self):
1189+
original = pd.DataFrame({'a': [1 + 2j, 2 + 4j]})
1190+
1191+
with tm.assertRaises(NotImplementedError):
1192+
with tm.ensure_clean() as path:
1193+
original.to_stata(path)
1194+
1195+
def test_unsupported_datetype(self):
1196+
dates = [dt.datetime(1999, 12, 31, 12, 12, 12, 12000),
1197+
dt.datetime(2012, 12, 21, 12, 21, 12, 21000),
1198+
dt.datetime(1776, 7, 4, 7, 4, 7, 4000)]
1199+
original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
1200+
'strs': ['apple', 'banana', 'cherry'],
1201+
'dates': dates})
1202+
1203+
with tm.assertRaises(NotImplementedError):
1204+
with tm.ensure_clean() as path:
1205+
original.to_stata(path, convert_dates={'dates': 'tC'})
1206+
1207+
dates = pd.date_range('1-1-1990',periods=3,tz='Asia/Hong_Kong')
1208+
original = pd.DataFrame({'nums': [1.0, 2.0, 3.0],
1209+
'strs': ['apple', 'banana', 'cherry'],
1210+
'dates': dates})
1211+
with tm.assertRaises(NotImplementedError):
1212+
with tm.ensure_clean() as path:
1213+
original.to_stata(path)
11681214

11691215
if __name__ == '__main__':
11701216
nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],

0 commit comments

Comments
 (0)